# Data Loading, Storage, and File Formats

## Reading and Writing Data in Text Format

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('ex1.csv')

In [4]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
pd.read_table('ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
pd.read_csv('ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [12]:
pd.read_csv('ex2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [13]:
names = ['a','b','c','d','message']

In [14]:
pd.read_csv('ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [15]:
parsed = pd.read_csv('csv_mindex.csv',index_col=['key1','key2'])

In [17]:
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [22]:
list(open('ex3.txt'))

['  A B C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491']

In [23]:
result= pd.read_table('ex3.txt',sep= '\s+')

In [24]:
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [27]:
pd.read_csv('ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [28]:
result = pd.read_csv('ex5.csv')

In [29]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [30]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [34]:
result = pd.read_csv('ex5.csv', na_values=['NULL'])

In [35]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [36]:
sentinels = {'message': ['foo','NA'],'something':['two']}

In [37]:
pd.read_csv('ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


## Reading Text Files in Pieces

In [38]:
pd.options.display.max_rows = 10

In [42]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate the data
data = []
for i in range(10000):
    row = [
        i,
        np.random.normal(0, 1),
        np.random.normal(0, 1),
        np.random.normal(0, 1),
        np.random.normal(0, 1),
        np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
    ]
    data.append(row)

# Create DataFrame
columns = ['', 'one', 'two', 'three', 'four', 'key']
df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv('ex6.csv', index=False, float_format='%.6f')

print("File 'ex6.csv' created with 10000 rows!")

File 'ex6.csv' created with 10000 rows!


In [43]:
result = pd.read_csv('ex6.csv')

In [44]:
result

Unnamed: 0.1,Unnamed: 0,one,two,three,four,key
0,0,0.496714,-0.138264,0.647689,1.523030,G
1,1,-0.916827,-0.124147,-2.010963,-0.492803,H
2,2,-0.580878,-0.525170,-0.571380,-0.924083,V
3,3,-2.439106,0.603441,-0.251044,-0.163867,S
4,4,0.142465,-0.034652,1.134339,-0.104746,G
...,...,...,...,...,...,...
9995,9995,-0.393108,-1.123158,-0.079028,-0.779463,U
9996,9996,0.265026,-1.227806,0.268761,1.344134,H
9997,9997,-0.203000,-0.407957,-0.709026,0.331596,J
9998,9998,-1.595832,-0.560294,0.451260,1.157619,J


In [47]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)

In [48]:
chunker

<pandas.io.parsers.readers.TextFileReader at 0x19f6f22cd70>

In [49]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)

tot= pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)

In [50]:
tot[:10]

key
V    426
U    416
O    413
R    411
M    411
S    410
Q    395
W    392
D    392
C    390
dtype: object