In [1]:
import pandas as pd

In [2]:
# Load the raw Billboard chart export. The codec is passed explicitly
# (presumably the file is not UTF-8 encoded -- confirm against the data source).
df = pd.read_csv(filepath_or_buffer='../data/billboard.csv', encoding='mac_latin2')
# Preview the first five rows to see the wide, one-column-per-week layout.
df.head(n=5)

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,


## Vấn đề:
* Tên cột (`x1st.week`, `x2nd.week`, ...) chứa giá trị của biến tuần chứ không phải tên biến; mỗi ô chứa thứ hạng của bài hát trong tuần đó
* Bài hát nằm trên bảng xếp hạng dưới 76 tuần sẽ có NaN ở các cột tuần còn lại

In [3]:
# Reshape wide -> long: every per-week column becomes a row whose 'week'
# holds the original column name and whose 'rank' holds that cell's value.
df_after = df.melt(
    id_vars=['year', 'artist.inverted', 'track', 'time', 'genre', 'date.entered', 'date.peaked'],
    var_name='week',
    value_name='rank',
)
df_after.head(n=10)

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,week,rank
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,x1st.week,78.0
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,x1st.week,15.0
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,x1st.week,71.0
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,x1st.week,41.0
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,x1st.week,57.0
5,2000,Janet,Doesn't Really Matter,4:17,Rock,2000-06-17,2000-08-26,x1st.week,59.0
6,2000,Destiny's Child,Say My Name,4:31,Rock,1999-12-25,2000-03-18,x1st.week,83.0
7,2000,"Iglesias, Enrique",Be With You,3:36,Latin,2000-04-01,2000-06-24,x1st.week,63.0
8,2000,Sisqo,Incomplete,3:52,Rock,2000-06-24,2000-08-12,x1st.week,77.0
9,2000,Lonestar,Amazed,4:25,Country,1999-06-05,2000-03-04,x1st.week,81.0


## Chuẩn hóa dữ liệu
* Thay chuỗi trong cột `week` (ví dụ `x1st.week`) bằng số tuần tương ứng (ví dụ `1`)
* Xóa các dòng chứa NaN
* Đổi kiểu rank thành số nguyên

In [4]:
# Turn week labels such as 'x1st.week' into the integer week number (1).
# - The pattern is a raw string so that '\d' is a regex digit class and not an
#   invalid Python string escape (which warns on recent Python versions).
# - astype(str) keeps the cell re-runnable: if 'week' was already converted to
#   integers, the digits are simply re-extracted instead of `.str` raising.
df_after['week'] = (
    df_after['week']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

In [5]:
# Drop the (song, week) rows that have no chart position, i.e. weeks in which
# the song was not on the chart. Restricting the check to 'rank' keeps a song's
# valid weeks even if one of its descriptive columns happens to be missing;
# a bare dropna() would discard those rows as well.
df_after = df_after.dropna(subset=['rank'])

In [6]:
# 'rank' became float during the melt because of the NaN placeholders; with
# those rows removed it can be cast to integers.
df_after = df_after.astype({'rank': int})

In [9]:
# Preview the cleaned long-format table (integer 'week' and 'rank').
df_after.head(n=5)

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,week,rank
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,1,78
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,1,15
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,1,71
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,1,41
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,1,57


## Vấn đề mới:
* Thông tin bài hát (year, artist.inverted, track, time, genre) bị lặp lại ở mỗi dòng tuần => tách thành hai bảng `songs` và `ranks`

In [11]:
# Columns that describe a song independently of any particular chart week.
song_cols = ['year', 'artist.inverted', 'track', 'time', 'genre']

# One row per distinct song; after the index is reset, the row position
# doubles as a surrogate key stored in 'song_id'.
songs = (
    df_after.loc[:, song_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)
songs['song_id'] = songs.index
songs.head(n=5)

Unnamed: 0,year,artist.inverted,track,time,genre,song_id
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,0
1,2000,Santana,"Maria, Maria",4:18,Rock,1
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,2
3,2000,Madonna,Music,3:45,Rock,3
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,4


In [12]:
# Attach each weekly row to its song_id.
# Join on exactly the columns `songs` was de-duplicated on (song_cols, which
# includes 'genre'). Joining on a subset of them would leave genre_x/genre_y
# columns behind and would duplicate rank rows if the same track were listed
# under two genres. validate='many_to_one' makes pandas raise if the song key
# is not unique on the right-hand side.
ranks = pd.merge(df_after, songs, on=song_cols, validate='many_to_one')
# Keep only the song key plus the per-week facts.
ranks = ranks[['song_id', 'date.entered', 'date.peaked', 'week', 'rank']]
ranks.head(10)

Unnamed: 0,song_id,date.entered,date.peaked,week,rank
0,0,2000-09-23,2000-11-18,1,78
1,0,2000-09-23,2000-11-18,2,63
2,0,2000-09-23,2000-11-18,3,49
3,0,2000-09-23,2000-11-18,4,33
4,0,2000-09-23,2000-11-18,5,23
5,0,2000-09-23,2000-11-18,6,15
6,0,2000-09-23,2000-11-18,7,7
7,0,2000-09-23,2000-11-18,8,5
8,0,2000-09-23,2000-11-18,9,1
9,0,2000-09-23,2000-11-18,10,1


In [15]:
# Create the 'date' column: the chart date of each row.
# Week 1 is the entry date itself, so week N falls (N - 1) weeks after
# 'date.entered'. The offset is computed in one step with the documented 'W'
# unit instead of adding N weeks and then subtracting a one-week DateOffset.
ranks['date'] = pd.to_datetime(ranks['date.entered']) + pd.to_timedelta(ranks['week'] - 1, unit='W')
ranks.head()

Unnamed: 0,song_id,date.entered,date.peaked,week,rank,date
0,0,2000-09-23,2000-11-18,1,78,2000-09-23
1,0,2000-09-23,2000-11-18,2,63,2000-09-30
2,0,2000-09-23,2000-11-18,3,49,2000-10-07
3,0,2000-09-23,2000-11-18,4,33,2000-10-14
4,0,2000-09-23,2000-11-18,5,23,2000-10-21


In [16]:
# 'date.entered' and 'date.peaked' are no longer needed in the ranks table
# once the per-row 'date' has been derived, so remove them.
ranks = ranks.drop(columns=['date.entered', 'date.peaked'])
ranks.head(n=5)

Unnamed: 0,song_id,week,rank,date
0,0,1,78,2000-09-23
1,0,2,63,2000-09-30
2,0,3,49,2000-10-07
3,0,4,33,2000-10-14
4,0,5,23,2000-10-21
