In [None]:
%matplotlib inline

# 第2章引言

## 来自bit.ly的1.usa.gov数据
### 用python读取JSON数据

In [None]:
path = 'usagov_bitly_data2012-03-16-1331923249.txt'
# Peek at the first line only; the context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open(path) as f:
    first_line = f.readline()
first_line

### 用json库将JSON数据转换为python对象

In [None]:
import json
path = 'usagov_bitly_data2012-03-16-1331923249.txt'
# Each line of the file is parsed as its own JSON document.
# Reading inside `with` guarantees the file handle is closed afterwards.
with open(path) as f:
    records = [json.loads(line) for line in f]
records[0]

In [None]:
records[0]['tz'] # the 'tz' field of the first record (index 0)

## 用纯Python代码对时区进行计数
### 取出数据中的tz列

In [None]:
# Naive extraction: any record without a 'tz' key makes this raise KeyError
# (the filtered comprehension in the next cell guards against exactly that).
# Catch and show the error so Restart & Run All does not stop here.
try:
    time_zones = [rec['tz'] for rec in records]
except KeyError as err:
    print('KeyError: %s' % err)

In [None]:
# Collect the 'tz' value of every record that actually has that key.
time_zones = [
    entry['tz']
    for entry in records
    if 'tz' in entry
]
time_zones[:10]

### tz==‘America/New_York’的记录有多少条？

In [None]:
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Returns a plain dict mapping item -> number of occurrences.
    """
    tally = {}
    for item in sequence:
        # .get() supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
from collections import defaultdict
def get_counts2(sequence):
    """Count how many times each item occurs in ``sequence``.

    Same result as ``get_counts`` but built on ``defaultdict(int)``, so no
    membership check is needed before incrementing.

    Returns a ``defaultdict`` mapping item -> number of occurrences.
    """
    counts = defaultdict(int)  # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts

In [None]:
counts = get_counts(time_zones)
counts['America/New_York']

In [None]:
len(time_zones)

### 按照count数对时区进行排序

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict``.

    Parameters
    ----------
    count_dict : dict
        Mapping of key -> count.
    n : int, default 10
        Number of entries to return.

    Returns
    -------
    list of (count, key) tuples sorted in ascending order, so the largest
    count comes last. An empty list is returned when ``n`` is not positive.
    """
    if n <= 0:
        # pairs[-0:] would be the *whole* list, so guard explicitly.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
top_counts(counts)

In [None]:
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)

## 用Pandas对时区进行计数

In [None]:
from pandas import DataFrame, Series
import pandas as pd
# Build a DataFrame with one row per parsed record
frame = pd.DataFrame(records)
frame

### tz的前10项

In [None]:
frame['tz'][:10]

### 对tz进行计数

In [None]:
tz_counts = frame['tz'].value_counts()
tz_counts[:10]

### 填充缺失的数据

In [None]:
# Give missing values explicit labels before counting:
# NA/NaN becomes 'Missing', the empty string becomes 'Unknown'.
clean_tz = frame['tz'].fillna('Missing').replace('', 'Unknown')
tz_counts = clean_tz.value_counts()
tz_counts[:10]

### 绘制条形图

In [None]:
tz_counts[:10].plot(kind='barh', rot=0)

### 取出a列中的浏览器信息

In [None]:
frame['a'][1]

In [None]:
frame['a'][50]

In [None]:
frame['a'][51]

In [None]:
# The first whitespace-separated token of each non-null 'a' value
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
results[:5]

### 统计浏览器的使用量

In [None]:
results.value_counts()[:8]

### 计算a列是否出现'Windows'

In [None]:
import numpy as np
# Drop rows whose 'a' value is null
cframe = frame[frame['a'].notnull()]
# Label each remaining row by whether its 'a' string contains 'Windows'
is_windows = cframe['a'].str.contains('Windows')
operating_system = np.where(is_windows, 'Windows', 'Not Windows')
operating_system[:5]

### 根据得到的时区tz和操作系统operating_system做分组统计

In [None]:
# Group rows by the 'tz' column together with the per-row Windows / Not Windows labels
by_tz_os = cframe.groupby(['tz', operating_system])
# Count rows per (tz, label) pair, pivot the label level into columns,
# and put 0 where a combination never occurs
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

### 选取最常出现的时区

In [None]:
# Row totals per time zone, then the positions that sort those totals ascending
tz_totals = agg_counts.sum(axis=1)
indexer = tz_totals.argsort()
indexer[:10]

In [None]:
# Reorder rows by ascending total and keep the last ten (the largest totals)
count_subset = agg_counts.take(indexer).iloc[-10:]
count_subset

### 时区和操作系统统计图

In [None]:
count_subset.plot(kind='barh', stacked=True)
# Normalize: divide each row by its own total so every row sums to 1
row_totals = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_totals, axis=0)
normed_subset.plot(kind='barh', stacked=True)

## MovieLens 1M数据集
GroupLens Research(http://www.grouplens.org/node/73)采集了一组从20世纪90年代末到21世纪初由MovieLens用户提供的电影评分数据。

In [None]:
import pandas as pd
# The .dat files use the two-character separator '::'. The default C parser
# only supports single-character separators, so request the Python engine
# explicitly rather than relying on pandas' fallback (which emits a ParserWarning).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

In [None]:
users[:5]

In [None]:
ratings[:5]

In [None]:
movies[:5]

In [None]:
ratings

### 合并ratings、users和movies

In [None]:
# Join ratings to users first, then join that result to movies;
# pd.merge picks the join keys from the column names the frames share.
ratings_with_users = pd.merge(ratings, users)
data = pd.merge(ratings_with_users, movies)
data

### 按性别统计每部电影的平均得分

In [None]:
# Mean rating per title, one column per gender.
# (The book's `rows=` / `cols=` keywords raise an error here, hence index= / columns=.)
mean_ratings = data.pivot_table(values='rating',
                                index='title',
                                columns='gender',
                                aggfunc='mean')
mean_ratings[:5]

### 获取评论数大于250条的电影

In [None]:
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]

In [None]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

In [None]:
# Restrict the mean ratings to the titles in active_titles.
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

In [None]:
# Sort the filtered mean ratings by the 'F' column, highest first.
# sort_index(by=...) is no longer supported; sort_values is its replacement.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]

## 计算评分分歧

In [None]:
# diff > 0 means the 'M' mean is higher than the 'F' mean for that title
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# sort_index(by=...) is no longer supported; sort_values is its replacement.
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:15]

In [None]:
sorted_by_diff[::-1][:15] # reverse the row order and take the first 15 rows (the titles male viewers liked most)

### 计算评分标准差

In [None]:
# Standard deviation of the ratings, grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
# Keep only the titles in active_titles (`.ix` has been removed; `.loc` is label-based)
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.sort_values(ascending=False)[:10]
# The book's Series.order() raises an error here, hence sort_values()

## 1880-2010年间全美婴儿姓名
1880-2010年间美国婴儿的名字频率数据

In [None]:
!head -n 10 babynames/yob1880.txt

In [None]:
import pandas as pd
names1880 = pd.read_csv('babynames/yob1880.txt',
                        names=['name', 'sex', 'births'])
names1880

### 按性别统计出生人数

In [None]:
names1880.groupby('sex').births.sum()

### 将数据组装为一个DataFrame（names）

In [None]:
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    # Loop-specific names: the notebook-level `frame` and `path` defined in the
    # bit.ly section are left intact, so those cells can be re-run safely.
    year_path = 'babynames/yob%d.txt' % year
    year_frame = pd.read_csv(year_path, names=columns)

    # The files carry no year column; tag each row with the year from the filename
    year_frame['year'] = year
    pieces.append(year_frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

In [None]:
names

In [None]:
names.head(10)

### 用pivot_table聚合数据

In [None]:
# Total births per year, one column per sex.
# Pass the aggregation by name: handing pandas the builtin `sum` is deprecated.
total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')
total_births.tail()

### 绘制生育曲线图

In [None]:
total_births.plot(title='Total births by sex and year')

### 增加prop列
prop列表示某个名字的婴儿占当年总出生数的比例

In [None]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The input frame is not modified: pandas does not support
    functions that mutate the object handed to them by ``groupby.apply``.
    """
    # Cast to float so the division is never integer division.
    births = group['births'].astype(float)
    return group.assign(prop=births / births.sum())
# group_keys=False keeps the original flat row index; otherwise newer pandas
# prepends year/sex as index levels, which clash with the columns of the same
# name when `names` is grouped again later.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)

In [None]:
names

### 验证prop的总和是否为1

In [None]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

### 每对sex/year组合的前1000个名字

In [None]:
def get_top1000(group):
    """Return the (up to) 1000 rows of ``group`` with the largest ``births``.

    Rows come back ordered by ``births``, highest first.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]
grouped = names.groupby(['year', 'sex'])
# Apply per (year, sex) group, then discard the resulting index (not needed)
# in favour of a fresh 0..n-1 index.
top1000 = grouped.apply(get_top1000).reset_index(drop=True)

In [None]:
# Do-it-yourself variant: take the top 1000 rows by births of every
# (year, sex) group, then stitch the pieces together.
pieces = [
    group.sort_values(by='births', ascending=False)[:1000]
    for _, group in names.groupby(['year', 'sex'])
]
top1000 = pd.concat(pieces, ignore_index=True)

In [None]:
top1000

## 分析命名趋势

In [None]:
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
# Births per year (rows) and name (columns).
# Pass the aggregation by name: handing pandas the builtin `sum` is deprecated.
total_births = top1000.pivot_table('births', index='year',
                                   columns='name', aggfunc='sum')
# The book's version of this call does not work here
total_births

In [None]:
total_births.info()

In [None]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title="Number of births per year")

In [None]:
# Sum of prop per year (rows) and sex (columns).
# Pass the aggregation by name: handing pandas the builtin `sum` is deprecated.
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc='sum')
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
# The book's version of this cell does not work here

In [None]:
df = boys[boys.year == 2010]
df

In [None]:
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]

In [None]:
prop_cumsum.values.searchsorted(0.5)

In [None]:
df = boys[boys.year == 1900]
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The ``prop`` values are sorted in descending order and accumulated; the
    result is the 1-based position at which the running total reaches ``q``.
    """
    descending = group['prop'].sort_values(ascending=False)
    running_total = descending.cumsum().values
    # searchsorted yields a 0-based position, so add 1 to make it a count
    return running_total.searchsorted(q) + 1

# One quantile count per (year, sex) group, then move 'sex' into the columns
diversity = (
    top1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)

In [None]:
diversity.head()

In [None]:
diversity.plot(title="Number of popular names in top 50%")

In [None]:
# extract last letter from name column
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]

last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births per last letter (rows) and (sex, year) (columns).
# Pass the aggregation by name: handing pandas the builtin `sum` is deprecated.
table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc='sum')

In [None]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

In [None]:
subtable.sum()

In [None]:
letter_prop = subtable / subtable.sum().astype(float)

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)

In [None]:
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [None]:
dny_ts.plot()

In [None]:
all_names = top1000['name'].unique()
# Boolean mask: True where the lower-cased name contains 'lesl'
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like

In [None]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

In [None]:
# Births of the matching names per year (rows) and sex (columns)
table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
# Divide each year by its row total so the columns become per-year shares
yearly_totals = table.sum(axis=1)
table = table.div(yearly_totals, axis=0)
table.tail()

In [None]:
table.plot(style={'M': 'k-', 'F': 'k--'})