## melt整理数据

### 美国收入与宗教信仰数据

In [1]:
# 下面加载美国收入与宗教信仰数据，这种数据称为“宽”数据
import pandas as pd
pew = pd.read_csv('data/pew.csv')
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [3]:
# 对于展示数据而言，这种"宽"数据没有任何问题，如第一行数据，展示了Agnostic（不可知论（者））所有的收入分布情况
# 从数据分析的角度，有时候我们需要把数据由"宽"数据，转换成"长"数据
# 使用melt对上面的pew数据集进行处理
pew_long = pd.melt(pew,id_vars='religion')


In [4]:
# 可以更改melt之后的数据的列名
pew_long = pd.melt(pew,id_vars='religion',var_name='income',value_name='count')


In [5]:
pew_long

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


### 唱片信息数据

In [6]:
# 在使用melt函数转换数据的时候，id_vars也可以传入列表来固定多个列，只转换其余的列
bill_board = pd.read_csv('data/billboard.csv')


In [7]:
# Reshape the billboard table from wide to long: the five song-level columns
# stay fixed as identifiers and every other column is unpivoted into a
# (week, rating) pair.
bill_borad_long = bill_board.melt(
    id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
    var_name='week',
    value_name='rating',
)


In [8]:
# 当我们查询任意一首歌曲信息时，会发现数据的存储有冗余的情况
bill_borad_long[bill_borad_long.track =='Loser']


Unnamed: 0,year,artist,track,time,date.entered,week,rating
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
320,2000,3 Doors Down,Loser,4:24,2000-10-21,wk2,76.0
637,2000,3 Doors Down,Loser,4:24,2000-10-21,wk3,72.0
954,2000,3 Doors Down,Loser,4:24,2000-10-21,wk4,69.0
1271,2000,3 Doors Down,Loser,4:24,2000-10-21,wk5,67.0
...,...,...,...,...,...,...,...
22510,2000,3 Doors Down,Loser,4:24,2000-10-21,wk72,
22827,2000,3 Doors Down,Loser,4:24,2000-10-21,wk73,
23144,2000,3 Doors Down,Loser,4:24,2000-10-21,wk74,
23461,2000,3 Doors Down,Loser,4:24,2000-10-21,wk75,


In [9]:
# The long table mixes two kinds of data: per-song attributes and per-week
# ratings. The song attributes repeat unchanged on every weekly row of a song,
# so they can be stored once in a separate DataFrame to save space and joined
# back with merge whenever the full record is needed.
# Select the song-level columns and keep a single row per distinct song.
billboard_songs = bill_borad_long[
    ['year', 'artist', 'track', 'time', 'date.entered']
].drop_duplicates()



In [10]:
# 为上面数据添加id列
billboard_songs['id'] = range(len(billboard_songs))


In [12]:
# Attach each song's id to the long table by joining on the song-level
# columns, then keep only the id and the weekly columns so the repeated
# song attributes are dropped.
billboard_ratings = pd.merge(
    bill_borad_long,
    billboard_songs,
    on=['year', 'artist', 'track', 'time', 'date.entered'],
)[['id', 'week', 'rating']]


In [13]:
# 数据拆分成两个dataframe：billboard_songs和 billboard_ratings，保存成文件后可以减少磁盘开销
# 加载时可以通过merge再还原成原始数据
billboard_songs.merge(billboard_ratings,on=['id'])


Unnamed: 0,year,artist,track,time,date.entered,id,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0,wk1,87.0
1,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0,wk2,82.0
2,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0,wk3,72.0
3,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0,wk4,77.0
4,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,0,wk5,87.0
...,...,...,...,...,...,...,...,...
24087,2000,matchbox twenty,Bent,4:12,2000-04-29,316,wk72,
24088,2000,matchbox twenty,Bent,4:12,2000-04-29,316,wk73,
24089,2000,matchbox twenty,Bent,4:12,2000-04-29,316,wk74,
24090,2000,matchbox twenty,Bent,4:12,2000-04-29,316,wk75,


## stack整理数据

In [14]:
# 加载state_fruit数据集
state_fruit = pd.read_csv('data/state_fruit.csv', index_col=0)

In [15]:
# 使用stack函数
state_fruit.stack()

Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [17]:

# 使用reset_index()，将结果变为DataFrame
state_fruit_tidy = state_fruit.stack().reset_index()

In [20]:
# 重命名列
state_fruit_tidy.columns = ['state', 'fruit', 'weight']


In [22]:
# 也可以使用rename_axis给不同的行索引层级命名
state_fruit.stack().rename_axis(['state', 'fruit'])

state    fruit 
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [23]:
# 再次使用reset_index方法
state_fruit.stack().rename_axis(['state', 'fruit']).reset_index(name='weight')


Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


## wide_to_long整理数据

In [24]:
# 加载数据
movie = pd.read_csv('data/movie.csv')
actor = movie[['movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name', 
               'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']]
actor.head()


Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [25]:
# 从上面数据中可以看出，列名中包含了数字 1,2,3 如果想把这部分信息提取到列当中，可以使用wide_to_long函数
# 使用wide_to_long函数时，要求 1,2,3 这样的顺序信息在列名的最后，并用分隔符隔开
# 创建一个自定义函数
def change_col_name(col_name):
    """Rewrite a column label so its numeric index becomes the trailing suffix.

    The result is shaped like ``<stub>_<number>``, the form needed when the
    columns are later reshaped with ``sep='_'``:

    * every ``'_name'`` substring is removed
      (``'actor_1_name'`` -> ``'actor_1'``);
    * for labels containing ``'_facebook'``, a numeric ``_<n>`` segment that
      sits directly before it is moved to the end
      (``'actor_1_facebook_likes'`` -> ``'actor_facebook_likes_1'``).

    Labels that match neither pattern are returned unchanged.

    Args:
        col_name: Original column label (a string).

    Returns:
        The rewritten column label.
    """
    col_name = col_name.replace('_name', '')
    # Split around the '_facebook' marker instead of assuming a 5-character
    # prefix, so the rewrite works for any stub and not only for 'actor'.
    prefix, marker, remainder = col_name.partition('_facebook')
    if not marker:
        return col_name
    stub, _, number = prefix.rpartition('_')
    # Only reorder when there really is a numeric index to move; a label such
    # as 'director_facebook_likes' is already in its final form.
    if not stub or not number.isdigit():
        return col_name
    return stub + marker + remainder + '_' + number
actor2 = actor.rename(columns=change_col_name)
actor2.head()


Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [26]:
# 使用wide_to_long进行转换
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(actor2, 
                              stubnames=stubs, 
                              i=['movie_title'], 
                              j='actor_num', 
                              sep='_').reset_index()
actor2_tidy.head()


Unnamed: 0,movie_title,actor_num,actor,actor_facebook_likes
0,Avatar,1,CCH Pounder,1000.0
1,Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
2,Spectre,1,Christoph Waltz,11000.0
3,The Dark Knight Rises,1,Tom Hardy,27000.0
4,Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


## unstack整理数据

In [None]:
# 之前介绍了stack，unstack可以将stack的结果恢复

In [29]:

state_fruit = pd.read_csv('data/state_fruit.csv', index_col=0)
state_fruit.stack()





Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [30]:
state_fruit.stack().unstack()

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190
