In [24]:
#####################################################################
# Examples below are (more or less) taken from
# "Python for Data Analysis" by William Wesley McKinney (O’Reilly).
# Copyright 2012 William McKinney, 978-1-449-31979-3.
#
# TOPICS COVERED:
# - Reading & Writing Data in Text Format
# - Reading Some Other Formats
#####################################################################

# This notebook is written in Python 3.

from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

## Reading & Writing Data in Text Format

### Reading in Files

In [25]:
# Show the raw contents of the sample CSV file (IPython `!` runs a shell command)
!cat Datasets/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [26]:
# Use read_csv to read the comma-separated file into a DataFrame.
# The first line of the file (a,b,c,d,message) becomes the column labels.
df = pd.read_csv("Datasets/ex1.csv")
# Equivalent to pd.read_table("Datasets/ex1.csv", sep=',')
# A bare expression on the last line of a cell displays the frame.
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [27]:
# csv file without a header row: every line, including the first, is data
!cat Datasets/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [28]:
# A plain pd.read_csv("Datasets/ex2.csv") gives the wrong answer here:
# it would use the first data row as the column headers.
ex2_columns = ['a', 'b', 'c', 'd', 'message']

# Three ways of reading a header-less file, printed in this order:
read_options = [
    {'header': None},                                   # default column names
    {'names': ex2_columns},                             # your own column names
    {'names': ex2_columns, 'index_col': 'message'},     # your own names & index column
]
for options in read_options:
    print(pd.read_csv("Datasets/ex2.csv", **options))

   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12


In [29]:
# csv file with rows that are not data and rows which contain NAs
# (the line breaks between the first few lines are missing in the output below;
# presumably a line-ending quirk of the file -- TODO confirm)
!cat Datasets/ex3.csv

# heysomething,a,b,c,d,message# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [30]:
# Every read below passes skiprows to skip the rows without data.
ex3_path = "Datasets/ex3.csv"
non_data_rows = [0, 2, 3]

# By default, pandas treats a set of commonly occurring sentinels
# (e.g. NA, -1.#IND, NULL) as missing values.
print(pd.read_csv(ex3_path, skiprows=non_data_rows))

# The na_values option takes a list or set of strings to consider missing values.
p = pd.read_csv(ex3_path, na_values=['NULL'], skiprows=non_data_rows)

# A dict (column name -> sentinels) specifies different sentinels per column.
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
print(pd.read_csv(ex3_path, na_values=sentinels, skiprows=non_data_rows))

  something  a   b   c   d message
0       one  1   2   3   4     NaN
1       two  5   6 NaN   8   world
2     three  9  10  11  12     foo
  something  a   b   c   d message
0       one  1   2   3   4     NaN
1       NaN  5   6 NaN   8   world
2     three  9  10  11  12     NaN


### Handling Big Files

In [31]:
# use nrows to read only the first few rows of the file (here 5)
# instead of loading the whole file
pd.read_csv("Datasets/ex4.csv", nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [32]:
# Use chunksize to read the file in pieces (i.e. chunks) of up to 1000 rows.
# read_csv then returns an iterable reader (a TextFileReader) instead of a
# DataFrame; iterating over it yields one DataFrame per chunk.
chunker = pd.read_csv("Datasets/ex4.csv", chunksize=1000)
print(chunker)

# Example: iterate over the chunks to aggregate value counts in the 'key' column.
# The empty accumulator gets an explicit float dtype: a bare Series([]) has an
# ambiguous default dtype and triggers a warning on newer pandas versions.
tot = Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 counts a key missing from either side as 0 instead of NaN
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# sort_values() replaces Series.order(), which no longer exists in current pandas
tot = tot.sort_values(ascending=False)
tot[:10]  # top 10 keys and their counts

<pandas.io.parsers.TextFileReader object at 0x10961ff98>




E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
dtype: float64

### Writing Out Data to Text Format

In [33]:
# DataFrame's to_csv method writes it out to a .csv file.
# The row labels are written as the first (unnamed) column by default.
df = pd.read_csv("Datasets/ex4.csv", nrows=5)
df.to_csv("Datasets/out1.csv")

!cat Datasets/out1.csv

# Options:
# - sep: to change delimiter
# - na_rep: to change how missing values are represented (default is empty string)
# - index=False: to remove row labels (default is True)
# - header=False: to remove col labels (default is True)
# - columns=[...]: to only write out columns in the list
#   (this option was named cols in old pandas versions)

,one,two,three,four,key
0,0.467976300189,-0.0386485396255,-0.295344251987,-1.82472622729,L
1,-0.358893469543,1.40445260007,0.704964644926,-0.20063830401500002,B
2,-0.50184039929,0.659253707223,-0.42169061931199997,-0.0576883018364,G
3,0.20488621220199998,1.07413396504,1.38836131252,-0.982404023494,R
4,0.354627914484,-0.13311585229599998,0.283762637978,-0.837062961653,Q


In [34]:
# Series can also be written out using to_csv
ts = Series(np.arange(7), index = pd.date_range('1/1/2015', periods=7))
print(ts)
print()
ts.to_csv("Datasets/out2.csv", sep="|")

!cat Datasets/out2.csv

2015-01-01    0
2015-01-02    1
2015-01-03    2
2015-01-04    3
2015-01-05    4
2015-01-06    5
2015-01-07    6
Freq: D, dtype: int64

2015-01-01|0
2015-01-02|1
2015-01-03|2
2015-01-04|3
2015-01-05|4
2015-01-06|5
2015-01-07|6


### JSON Data

In [35]:
# A sample JSON document held in a Python string: an object with a string,
# an array of strings, a null, and an array of nested objects.
obj = """ {"name": "Wes",
           "places_lived": ["United States", "Spain", "Germany"], "pet": null,
           "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                        {"name": "Katie", "age": 33, "pet": "Cisco"}]
           } """

In [36]:
import json

# json.loads parses the JSON string into plain Python objects
# (dicts, lists, strings, numbers, None)
result = json.loads(obj)
print(result)

# The reverse direction is json.dumps, which converts Python objects to JSON
# e.g. asjson = json.dumps(result)

# Build a DataFrame from the list of sibling records; the columns argument
# keeps only 'name' and 'age', so the 'pet' field is left out.
sibling_records = result['siblings']
siblings = DataFrame(sibling_records, columns=['name', 'age'])
siblings

{'pet': None, 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'}, {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}], 'name': 'Wes', 'places_lived': ['United States', 'Spain', 'Germany']}


Unnamed: 0,name,age
0,Scott,25
1,Katie,33


## Other Data Formats

### Reading & Writing Pickles

In [37]:
# Round trip through a pickle file: to_pickle writes a pandas object to disk,
# pd.read_pickle loads it back.
pickle_path = "Datasets/ex1_pickle"

frame = pd.read_csv("Datasets/ex1.csv")
print(frame)
frame.to_pickle(pickle_path)

# NOTE: only unpickle files you trust -- loading a pickle can execute arbitrary code.
pd.read_pickle(pickle_path)

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Reading Microsoft Excel Files

In [38]:
# first, create an instance by passing path to an .xls or .xlsx file
# e.g. xls_file = pd.ExcelFile('data.xls')

# then, data stored in a sheet can be read into a DataFrame using parse
# e.g. table = xls_file.parse('Sheet1')