[View in Colaboratory](https://colab.research.google.com/github/jonbaer/googlecolab/blob/master/PandasTour.ipynb)

In [0]:
import numpy as np
import pandas as pd
from datetime import time
# Turn off the HTML table repr so frames render as plain text.  The full option
# key is spelled out: the bare pattern 'html' matches several options in
# current pandas and is rejected as ambiguous.
pd.set_option('display.notebook_repr_html', False)

In [0]:
# IPython.display is the public home of the display classes;
# IPython.core.display is an internal module.
from IPython.display import Image
# Render the remote image (an O'Reilly cover, judging by the URL); passing it
# as url= makes explicit that the string is a location, not raw image data.
Image(url='http://akamaicovers.oreilly.com/images/0636920023784/lrg.jpg')

Important Features in pandas
===

In [0]:
import pandas as pd
import numpy as np

Fast tabular data IO
---

In [0]:
# printf-style template for the per-ticker minute-bar CSV files.  The directory
# is a hard-coded absolute path, so it has to be edited before the notebook
# will run anywhere else.
temp = '/Users/wesm/Downloads/minutebars/%s.csv'
path = temp % 'AAPL'
# IPython shell escape: count the lines in the AAPL file.
!wc -l $path

  489598 /Users/wesm/Downloads/minutebars/AAPL.csv


In [0]:
# Parse the AAPL minute-bar CSV into a DataFrame; the bare name on the last
# line makes the notebook display it.
aapl_bars = pd.read_csv(temp % 'AAPL')
aapl_bars

In [0]:
# Time one full parse of the file; the parsed frame is bound to `_` and is not
# used again in this notebook.
%time _ = pd.read_csv(path)

CPU times: user 0.72 s, sys: 0.18 s, total: 0.90 s
Wall time: 0.90 s


Time series operations
---

In [0]:
# Look at the raw 'dt' timestamp column.
aapl_bars['dt']

In [0]:
# Promote the 'dt' column to the frame's (datetime) index.  pop() removes the
# column, so the guard keeps this cell safe to run a second time, when the
# column is already gone.
if 'dt' in aapl_bars.columns:
    aapl_bars.index = pd.to_datetime(aapl_bars.pop('dt'))

In [0]:
# First five rows (the head() default).
aapl_bars.head()

In [0]:
def load_bars(ticker):
    """Read one ticker's minute-bar CSV and index it by timestamp.

    The file location comes from the notebook-level ``temp`` template.  The
    ``dt`` column is removed from the frame, parsed with ``pd.to_datetime``
    and installed as the index.

    Parameters
    ----------
    ticker : str
        Substituted into ``temp`` to form the file path.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, indexed by the parsed ``dt`` values.
    """
    frame = pd.read_csv(temp % ticker)
    timestamps = pd.to_datetime(frame.pop('dt'))
    frame.index = timestamps
    return frame

In [0]:
# Rows stamped exactly 15:00, whatever the date; show the first ten.
aapl_bars.at_time(time(15, 0)).head(10)

In [0]:
# Partial-string selection on the datetime index: every close price stamped
# on 2009-10-15.
aapl_bars['close_price'].loc['2009-10-15']

In [0]:
# The full close-price column as a Series.
aapl_bars.close_price

In [0]:
# Month-by-month mean, median and standard deviation of the close price.
# resample() returns a Resampler object and no longer accepts `how=`, so the
# statistics are requested through .agg().
# NOTE: pandas >= 2.2 prefers the alias 'ME' for month-end; 'M' is kept here
# because older releases do not recognise 'ME'.
mth_mean = aapl_bars.close_price.resample('M').agg(['mean', 'median', 'std'])
mth_mean

In [0]:
# Line plot with one line per monthly statistic.
mth_mean.plot()

In [0]:
# Simple one-step return computed by hand: each price divided by the price one
# row earlier, minus 1.  The first row has no predecessor and comes out NaN.
close = aapl_bars.close_price
close / close.shift(1) - 1

In [0]:
# Row-over-row percentage change of the close price.
minute_returns = aapl_bars.close_price.pct_change()
# Rolling standard deviation over a window of 390 * 10 observations
# (presumably 390 one-minute bars per session times ten sessions; confirm).
# pd.rolling_std() was removed from pandas; Series.rolling().std() replaces it.
std_10day = minute_returns.rolling(390 * 10).std()
# resample() needs an explicit aggregation; mean() was the implicit default
# when this cell was written.  One point per business day is plotted.
std_10day.resample('B').mean().plot()

Data alignment
---

In [0]:
# Ten standard-normal draws, one per calendar day starting 2000-01-01.
ts1 = pd.Series(
    data=np.random.randn(10),
    index=pd.date_range(start='1/1/2000', periods=10),
)
ts1

In [0]:
# Keep seven of the ten observations, chosen by position.  .iloc states the
# positional intent explicitly: plain [] with an integer list on a
# date-indexed Series is deprecated in recent pandas because the integers can
# be read as labels.
ts2 = ts1.iloc[[0, 2, 4, 5, 6, 7, 8]]
ts2

In [0]:
# Addition aligns on the date index: dates present only in ts1 come out as NaN.
ts1 + ts2

In [0]:
# Building a frame from the two Series aligns them on the union of their
# indexes; column B is NaN on the dates ts2 lacks.
df = pd.DataFrame({'A': ts1, 'B': ts2})
df

In [0]:
# Load the IBM minute bars with the load_bars helper.
ibm_bars = load_bars('IBM')

In [0]:
def subsample(frame, pct=0.9):
    """Return a random subset of ``frame``'s rows, kept in their original order.

    Parameters
    ----------
    frame : object supporting ``len()`` and ``.take()`` (e.g. a DataFrame)
        Source of the rows.
    pct : float, default 0.9
        Fraction of rows to keep; the row count is ``int(pct * len(frame))``.

    Returns
    -------
    The result of ``frame.take`` on the sorted, randomly chosen positions.
    """
    N = len(frame)
    # pct * N is a float and cannot be used as a slice bound; truncate it to
    # an integer row count first.
    n_keep = int(pct * N)
    # Sorting the sampled positions preserves the original row order.
    indexer = np.sort(np.random.permutation(N)[:n_keep])
    return frame.take(indexer)

# Independently keep a random 90% (the default pct) of each ticker's rows, so
# each frame is missing a different random set of timestamps.
f1 = subsample(ibm_bars)
f2 = subsample(aapl_bars)
f1

In [0]:
# Put the two frames side by side.  `keys` adds an outer column level naming
# the ticker, and rows are aligned on the index, leaving NaN where one frame
# lacks a timestamp.
both = pd.concat([f1, f2], axis=1, keys=['IBM', 'AAPL'])
both.head(20)

Missing data handling
---

In [0]:
# The A/B frame built in the data-alignment section (column B has gaps).
df

In [0]:
# Number of non-missing observations in each column.
df.count()

In [0]:
# The same non-missing count for the combined IBM/AAPL frame.
both.count()

In [0]:
# Column sums; missing values are skipped by default.
df.sum()

In [0]:
# Row-wise mean across the columns.
df.mean(axis=1)

In [0]:
# Keep only the rows that have no missing value in any column.
df.dropna()

In [0]:
# Replace every missing value with 0.
df.fillna(0)

In [0]:
# Forward-fill: carry the last seen value into each gap.  The `method=`
# argument of fillna() is deprecated; ffill() is the direct equivalent.
df.ffill()

In [0]:
# Re-index onto a regular 4-hour grid; timestamps that were not in the
# original index get NaN.
df.asfreq('4h')

In [0]:
# Same 4-hour grid, forward-filling at most three consecutive missing rows.
df.asfreq('4h').ffill(limit=3)

Groupby operations
---

In [0]:
import random, string
import matplotlib as mpl
def rands(n):
    """Return a random string of ``n`` ASCII letters (upper and lower case)."""
    choices = string.ascii_letters
    # range() exists on both Python 2 and 3; xrange() is Python 2 only.
    return ''.join(random.choice(choices) for _ in range(n))
# Default figure size for plots created after this point.
mpl.rc('figure', figsize=(12, 8))

# Label pools the synthetic universe is drawn from.
ind_names = np.array(['ENERGY', 'FINANCIAL', 'TECH',
                      'CONSDUR', 'SERVICES', 'UTILITIES'], dtype='O')
ccys = np.array(['USD', 'EUR'], dtype='O')

Nfull = 2000
# Draw until there are exactly Nfull *distinct* tickers.  Drawing Nfull random
# strings and de-duplicating afterwards can leave fewer than Nfull labels,
# which then mismatches the Nfull-long value arrays below.
unique_tickers = set()
while len(unique_tickers) < Nfull:
    unique_tickers.add(rands(5).upper())
tickers = np.array(sorted(unique_tickers), dtype='O')

# One random industry and one random currency per ticker.
industries = pd.Series(ind_names.take(np.random.randint(0, len(ind_names), Nfull)),
                       index=tickers, name='industry')
ccy = pd.Series(ccys.take(np.random.randint(0, len(ccys), Nfull)),
                index=tickers, name='ccy')

In [0]:
# The ticker -> currency mapping built above.
ccy

In [0]:
# Three synthetic, normally distributed columns for a random 1000 of the Nfull
# tickers.  Note that this rebinds `df`, replacing the frame used in the
# earlier sections.
df = pd.DataFrame({'Momentum' : np.random.randn(1000) / 200 + 0.03,
                'Value' : np.random.randn(1000) / 200 + 0.08,
                'ShortInterest' : np.random.randn(1000) / 200 - 0.02},
                index=tickers.take(np.random.permutation(Nfull)[:1000]))
df.head()

In [0]:
# Per-industry mean of each column.  Grouping by a Series uses its values as
# group labels after aligning it to df's index (the tickers).
means = df.groupby(industries).mean()
means

In [0]:
# Horizontal bar chart of the per-industry means.
means.plot(kind='barh')

In [0]:
# Group by industry and currency together; the result is indexed by both keys
# and replaces the single-key `means` from the previous cells.
means = df.groupby([industries, ccy]).mean()
means

In [0]:
keys = [industries, ccy]


def zscore(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation."""
    return (x - x.mean()) / x.std()


# transform() always returns a result indexed exactly like `df`, which the
# later regrouping of `normed` by the same keys relies on.  apply() may
# prepend the group keys to the index, depending on the pandas version.
normed = df.groupby(keys).transform(zscore)

In [0]:
# Sanity check on the normalisation: within each (industry, ccy) group the
# columns should now have mean ~0 and standard deviation ~1.
normed.groupby(keys).agg(['mean', 'std'])

Hierarchical indexing
---

In [0]:
# The (industry, ccy) group means from the groupby section.
means

In [0]:
# A single column, still indexed by both group keys.
means['Momentum']

In [0]:
# All rows whose outer (industry) level is 'TECH'.  The .ix indexer was
# removed from pandas; .loc is the label-based replacement.
means.loc['TECH']

In [0]:
# Move the column labels into a new innermost index level.
means.stack()

In [0]:
# ...then pivot the 'industry' index level out into columns.
means.stack().unstack('industry')

Merging and joining
---

In [0]:
# Directory holding the MovieLens 1M .dat files (hard-coded absolute path;
# edit before running elsewhere).
base = '/Users/wesm/Dropbox/book/svn/book_scripts/movielens/ml-1m'
get_path = lambda x: '%s/%s.dat' % (base, x)

# '::' is a multi-character separator, which only the Python parsing engine
# supports.  Naming the engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine.
# NOTE(review): the ml-1m files are reportedly Latin-1 encoded; if a read
# fails with UnicodeDecodeError on Python 3, pass encoding='latin-1' (confirm
# against the actual data).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(get_path('users'), sep='::', header=None, names=unames,
                      engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(get_path('ratings'), sep='::', header=None, names=rnames,
                        engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(get_path('movies'), sep='::', header=None, names=mnames,
                       engine='python')

In [0]:
# First rows of the movies table (movie_id, title, genres).
movies.head()

In [0]:
# First rows of the ratings table (user_id, movie_id, rating, timestamp).
ratings.head()

In [0]:
# First rows of the users table (user_id, gender, age, occupation, zip).
users.head()

In [0]:
# Flatten the three tables into one: ratings joined to users, then to movies.
# With no `on=` given, each merge joins on the columns the two frames share.
data = ratings.merge(users).merge(movies)
data

In [0]:
# Number of ratings per title, then the titles rated more than 1000 times.
rating_counts = data.groupby('title').size()
freq_titles = rating_counts[rating_counts > 1000].index
freq_titles

In [0]:
# Mean rating per title, restricted to the frequently rated titles and sorted
# ascending, so the last 20 entries are the 20 best-rated.
# Series.order() was removed from pandas; sort_values() is its replacement.
highest_rated = (data.groupby('title')['rating'].mean()
                     .loc[freq_titles]
                     .sort_values()
                     .tail(20))
highest_rated

In [0]:
# Ratings of the 20 best-rated titles only.  The explicit copy() makes
# `filtered` an independent frame, so shortening its titles below neither
# triggers SettingWithCopyWarning nor risks touching `data`.
filtered = data[data.title.isin(highest_rated.index)].copy()
# Truncate titles to 25 characters to keep the table narrow.
filtered['title'] = filtered['title'].str[:25]
# Number of ratings per (title, gender), with gender spread across columns.
filtered.groupby(['title', 'gender']).rating.count().unstack()

Pivot tables
---

In [0]:
# Mean rating per title, with one column per gender.  pivot_table's old
# `rows`/`cols` keywords were renamed to `index`/`columns`.
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')
mean_ratings.tail(20)

Data summary, statistics
---
Summary statistics: `value_counts`, `describe`, and related methods.

In [0]:
# Number of ratings per title, most-rated first.
data.title.value_counts()

In [0]:
# Count, mean, std, min, quartiles and max of the rating column.
data.rating.describe()

In [0]:
# The same summary statistics, computed separately for each gender.
by_gender = data.groupby('gender').rating.describe()
by_gender

In [0]:
# Show one column per gender and one row per statistic.  In current pandas,
# groupby(...).describe() returns a DataFrame with one row per group, so a
# transpose gives that layout; unstack(0) was written for the older result,
# a Series indexed by (gender, statistic).
by_gender.T