# Uploading, Streaming, and Sampling Data

## Uploading small amounts of data into memory

In [1]:
with open("Colors.txt", 'rb') as open_file:
    print 'Colors.txt content:\n' + open_file.read()

SyntaxError: invalid syntax (<ipython-input-1-56f11cd09800>, line 2)

## Streaming large amounts of data into memory

In [3]:
with open("Colors.txt", 'rb') as open_file:
    for observation in open_file:
        print 'Reading Data: ' + observation

TypeError: must be str, not bytes

## Sampling data

### Fixed samples

In [3]:
n = 3
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if j % n==0:
            print('Reading Line: ' + str(j) + 
            ' Content: ' + observation)

Reading Line: 0 Content: Color	Value

Reading Line: 3 Content: Yellow	3

Reading Line: 6 Content: Purple	6



### Random samples

In [4]:
from random import random
sample_size = 0.25
with open("Colors.txt", 'rb') as open_file:
    for j, observation in enumerate(open_file):
        if random()<=sample_size:
            print('Reading Line: ' + str(j) + 
            ' Content: ' + observation)

Reading Line: 2 Content: Orange	2

Reading Line: 3 Content: Yellow	3



# Accessing Data in Structured Flat File Form

## Reading from a text file

In [5]:
import pandas as pd
color_table = pd.io.parsers.read_table("Colors.txt")
print color_table

    Color  Value
0     Red      1
1  Orange      2
2  Yellow      3
3   Green      4
4    Blue      5
5  Purple      6
6   Black      7
7   White      8


## Reading CSV delimited format

In [6]:
import pandas as pd
titanic = pd.io.parsers.read_csv("Titanic.csv")
X = titanic[['age']]
#X = titanic[['age']].values
print X

            age
0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
5       48.0000
6       63.0000
7       39.0000
8       53.0000
9       71.0000
10      47.0000
11      18.0000
12      24.0000
13      26.0000
14      80.0000
15    9999.0000
16      24.0000
17      50.0000
18      32.0000
19      36.0000
20      37.0000
21      47.0000
22      26.0000
23      42.0000
24      29.0000
25      25.0000
26      25.0000
27      19.0000
28      35.0000
29      28.0000
...         ...
1279    14.0000
1280    22.0000
1281    22.0000
1282  9999.0000
1283  9999.0000
1284  9999.0000
1285    32.5000
1286    38.0000
1287    51.0000
1288    18.0000
1289    21.0000
1290    47.0000
1291  9999.0000
1292  9999.0000
1293  9999.0000
1294    28.5000
1295    21.0000
1296    27.0000
1297  9999.0000
1298    36.0000
1299    27.0000
1300    15.0000
1301    45.5000
1302  9999.0000
1303  9999.0000
1304    14.5000
1305  9999.0000
1306    26.5000
1307    27.0000
1308    29.0000

[1309 r

## Reading Excel and other Microsoft Office files

In [7]:
import pandas as pd
xls = pd.ExcelFile("Values.xls")
trig_values = xls.parse('Sheet1', index_col=None, 
                        na_values=['NA'])
#trig_values = pd.read_excel("Values.xls", 'Sheet1', index_col=None, na_values=['NA'])
print trig_values

    Angle (Degrees)      Sine    Cosine    Tangent
0        138.550574  0.661959 -0.749540  -0.883153
1        305.535745 -0.813753  0.581211  -1.400100
2        280.518695 -0.983195  0.182556  -5.385709
3        216.363795 -0.592910 -0.805269   0.736289
4         36.389247  0.593268  0.805005   0.736974
5         31.474311  0.522116  0.852874   0.612184
6        120.121669  0.864962 -0.501838  -1.723588
7        293.947055 -0.913921  0.405892  -2.251634
8        179.882632  0.002048 -0.999998  -0.002048
9        120.927562  0.857818 -0.513954  -1.669056
10        71.349485  0.947487  0.319795   2.962796
11       241.971082 -0.882711 -0.469917   1.878439
12       297.208817 -0.889346  0.457235  -1.945053
13       142.004551  0.615599 -0.788060  -0.781158
14       173.770696  0.108508 -0.994096  -0.109152
15       229.232002 -0.757360 -0.652998   1.159820
16        67.926976  0.926706  0.375788   2.466033
17       261.866575 -0.989941 -0.141479   6.997102
18        59.185450  0.858830  

# Sending Data in Unstructured File Form

## Rendering the image

In [8]:
from skimage.io import imread
from skimage.transform import resize 
from matplotlib import pyplot as plt
import matplotlib.cm as cm

example_file = ("http://upload.wikimedia.org/" +
    "wikipedia/commons/7/7d/Dog_face.png")
image = imread(example_file, as_grey=True)
plt.imshow(image, cmap=cm.gray)
plt.show()

## Displaying the image information

In [9]:
print("data type: %s, shape: %s" % 
      (type(image), image.shape))

data type: <type 'numpy.ndarray'>, shape: (90L, 90L)


## Cropping the image

In [10]:
image2 = image[5:70,0:70]
plt.imshow(image2, cmap=cm.gray)
plt.show()

## Resizing the image

In [11]:
image3 = resize(image2, (30, 30), mode='nearest')
plt.imshow(image3, cmap=cm.gray)
print("data type: %s, shape: %s" % 
      (type(image3), image3.shape))

data type: <type 'numpy.ndarray'>, shape: (30L, 30L)


## Flatening the image

In [12]:
image_row = image3.flatten()
print("data type: %s, shape: %s" % 
      (type(image_row), image_row.shape))

data type: <type 'numpy.ndarray'>, shape: (900L,)


# Accessing Data from the Web

In [13]:
from lxml import objectify
import pandas as pd

xml = objectify.parse(open('XMLData.xml'))
root = xml.getroot()
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0,4):
    obj = root.getchildren()[i].getchildren()
    row = dict(zip(['Number', 'String', 'Boolean'], 
                   [obj[0].text, obj[1].text, 
                    obj[2].text]))
    row_s = pd.Series(row)
    row_s.name = i
    df = df.append(row_s)
    
print df

  Number  String Boolean
0      1   First    True
1      2  Second   False
2      3   Third    True
3      4  Fourth   False
