## Set up

In [None]:
import math
import re

import numpy as np
import pandas as pd

## Stratified sampling
We have 223,644 songs released from 2016 to 2020, of which 1,241 made it onto the weekly top-100 songs chart between 2017 and 2020. However, I hypothesize that there are in fact 1,000,000 songs in the full population.

In [None]:
# Project folder (the path looks like a Colab Google Drive mount).
curr_path = '/content/drive/MyDrive/Self-study/ML/melon/'
# Read the full song table from the project's data folder.
main_df = pd.read_csv(f'{curr_path}melon_data/main_df.csv')

In [None]:
# Count how many rows carry each `is_top` value.
main_df.loc[:, 'is_top'].value_counts()

In [None]:
# Sample size from Cochran's formula: n = z^2 * p * q / e^2

z = 1.96        # critical value of the standard normal for a 95% two-sided interval
e = 0.0006/2    # margin of error: half of a 0.0006-wide interval

# Hard-coded counts of the two groups used to estimate the proportion.
successes, failures = 1241, 222403
p_hat = successes/(successes+failures)
q_hat = 1 - p_hat

numerator = z**2 * p_hat * q_hat
n = numerator/e**2
n

In [None]:
# Display the current value of p_hat.
p_hat

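Since p_hat is very small (about 0.005), it is important to have a small margin of error. When p = 0.5, the interval width is typically 0.06. In this case, the width is 0.0006, so e should be 0.0003. However, with that margin n comes out at about 235,541, which is beyond the 223,644 songs available in the dataset.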


In [None]:
# Reload the song table, using the first CSV column as the index.
curr_path = '/content/drive/MyDrive/Self-study/ML/melon/'
main_df = pd.read_csv(curr_path + 'melon_data/main_df.csv', index_col = 0)
# Rebind instead of mutating in place so the cell reads as one pipeline.
main_df = main_df.drop(columns = ['song_id'])

# Stratified sample: draw 0.5% of the rows from each `is_top` group, each with
# the same fixed seed. Sampling every group explicitly (instead of
# `groupby(...).apply(...)`) keeps the `is_top` column in the result; newer
# pandas versions deprecate `apply` operating on the grouping column.
sampling_df = pd.concat(
    [group.sample(frac = 0.005, random_state = 1) for _, group in main_df.groupby('is_top')]
)
sampling_df.to_csv(curr_path + 'melon_data/sampling.csv')

## Extract artist metadata for the sample

In [None]:
# URL template for an artist detail page; `artist_id` is filled in per request.
main_url = 'https://www.melon.com/artist/detail.htm?artistId={artist_id}'
# Browser-like User-Agent string sent with every request.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/605.1.15 (KHTML, like Gecko) '
    'Version/14.1.2 Safari/605.1.15'
)
headers = dict([('User-Agent', user_agent)])

def get_artist_metadata(artist_id):
    """Fetch one artist's detail page and extract basic profile fields.

    Parameters
    ----------
    artist_id
        Value substituted into the `main_url` template.

    Returns
    -------
    dict
        Keys 'debut_date', 'artist_type', 'artist_gender' and
        'artist_nationality' (in that order). A field that cannot be found
        on the page is set to np.nan.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get`, including a timeout.
    """
    sub_url = main_url.format(artist_id = artist_id)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(sub_url, headers = headers, timeout = 30)
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # consider pinning one for reproducible parsing.
    bs = BeautifulSoup(r.text)

    temp_dict = {}

    #### first info block: debut date, artist type and gender
    info1_text = _definition_list_text(bs.find('div', {'class': 'section_atistinfo03'}))

    # A four-digit year, optionally followed by .MM.DD
    debut_date = _first_match(r'(\d{4}(?:\.\d{2}\.\d{2})?)', info1_text)
    temp_dict['debut_date'] = np.nan if debut_date is None else debut_date

    # '유형' ("type") label, then the type word and the word after the last '|'
    type_and_gender = _first_match(r'유형\n(\w*)[\s\S]*\|(\w*)', info1_text)
    if type_and_gender is None:
        temp_dict['artist_type'], temp_dict['artist_gender'] = np.nan, np.nan
    else:
        temp_dict['artist_type'], temp_dict['artist_gender'] = type_and_gender

    #### second info block: nationality
    info2_text = _definition_list_text(bs.find('div', {'class': 'section_atistinfo04'}))

    # '국적' ("nationality") label followed by the value on the next line
    nationality = _first_match(r'국적\n(\w*)', info2_text)
    temp_dict['artist_nationality'] = np.nan if nationality is None else nationality

    return temp_dict


def _definition_list_text(section):
    """Return the text of the section's 'list_define clfix' <dl>, or None if absent."""
    if section is None:
        return None
    definition_list = section.find('dl', {'class': 'list_define clfix'})
    if definition_list is None:
        return None
    return definition_list.text


def _first_match(pattern, text):
    """Return the first `re.findall` result for pattern in text, or None if there is none."""
    if text is None:
        return None
    matches = re.findall(pattern, text)
    return matches[0] if matches else None

In [None]:
# Scrape each artist id once; results are stored in a dict keyed by artist id.
artist_metadata = {}
for artist_id in sampling_df['artist_id']:
    if artist_id not in artist_metadata:
        artist_metadata[artist_id] = get_artist_metadata(artist_id)

# Transpose so that each artist id becomes a row, then write the table to disk.
artist_df = pd.DataFrame(artist_metadata).transpose()
artist_df.to_csv(f'{curr_path}melon_data/sampling_artist.csv')

## Load artist metadata

In [None]:
# The artist metadata is stored in numbered part files (1 through 11).
file_names = ['melon_data/sampling_artist' + str(i) + '.csv' for i in range(1,12)]

# `DataFrame.append` was removed in pandas 2.0, so read every part file and
# concatenate once. The empty seed frame is kept as the first element,
# mirroring the frame the original loop started from.
seed_df = pd.DataFrame({'Unnamed: 0':[],'debut_date':[], 'artist_type':[], 'artist_gender':[], 'artist_nationality':[]})
part_dfs = [pd.read_csv(curr_path + file_name) for file_name in file_names]
all_artist_metadata_df = pd.concat([seed_df] + part_dfs, ignore_index = True)

# The unnamed first CSV column holds the artist id.
all_artist_metadata_df = all_artist_metadata_df.rename(columns= {'Unnamed: 0': 'artist_id'})

# Missing values become 0 (in every column), then only the first row per artist is kept.
all_artist_metadata_df = all_artist_metadata_df.fillna(0)
all_artist_metadata_df = all_artist_metadata_df.drop_duplicates(subset = ['artist_id'])
all_artist_metadata_df.to_csv(curr_path+'melon_data/sampling_artist.csv')

In [None]:
# Inner join: keep only the sampled songs whose artist_id has metadata.
test_df = pd.merge(sampling_df, all_artist_metadata_df, on = 'artist_id', how = 'inner')

## Load and merge the datasets, then split them into two samples

In [None]:
# Read the song sample and the artist metadata from their CSV files.
curr_path = '/content/drive/MyDrive/Self-study/ML/melon/'
sampling_df = pd.read_csv(f'{curr_path}melon_data/sampling.csv')
sampling_artist = pd.read_csv(f'{curr_path}melon_data/sampling_artist.csv')

In [None]:
# Remove the 'Unnamed: 0' column from both frames, in place.
for frame in (sampling_df, sampling_artist):
    frame.drop(columns = ['Unnamed: 0'], inplace = True)

In [None]:
# Display the artist metadata table.
sampling_artist

Unnamed: 0,artist_id,debut_date,artist_type,artist_gender,artist_nationality
0,923691.0,0,그룹,혼성,대한민국
1,957555.0,2016,그룹,혼성,대한민국
2,996531.0,0,0,0,0
3,2110551.0,2018.03.07,그룹,혼성,대한민국
4,827147.0,2010,솔로,남성,대한민국
...,...,...,...,...,...
1035,675084.0,2010,그룹,남성,영국
1036,785589.0,2014.09.18,그룹,남성,대한민국
1037,1023977.0,2010,0,0,대한민국
1038,6984.0,2009,솔로,남성,대한민국


In [None]:
# Left join: every sampled song is kept; artist columns are attached by artist_id.
sampling_df = pd.merge(sampling_df, sampling_artist, on = ['artist_id'], how = 'left')

In [None]:
# Display the merged song/artist sample.
sampling_df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,alb_name,alb_id,song_name,id,issue_year,artist_id,song_gn_gnr,artist_name,is_top,debut_date,artist_type,artist_gender,artist_nationality
0,"['GN0501', 'GN0601', 'GN0503', 'GN0606', 'GN05...",20160823.0,Individualism,2705739.0,지금 여기,231827.0,2016.0,923691.0,GN0500,해마군단,False,0,그룹,혼성,대한민국
1,['GN0101'],20170612.0,그대 그대로 여기에 있다,10069785.0,그대 그대로 여기에 있다 (Feat. 김인수),349902.0,2017.0,957555.0,GN0100,소담 (小談),False,2016,그룹,혼성,대한민국
2,['GN0901'],20160607.0,This Is What You Came For (Remix Calvin Harris...,2869092.0,This Is What You Came For (Hits Remix Tritube ...,633927.0,2016.0,996531.0,GN0900,Steevy Cruz,False,0,0,0,0
3,"['GN2102', 'GN2101']",20181114.0,Recolor Carol Vol.1,10221801.0,고요한 밤 거룩한 밤 (Silent Night),654826.0,2018.0,2110551.0,GN2100,Recolor,False,2018.03.07,그룹,혼성,대한민국
4,"['GN0302', 'GN0301']",20181217.0,BFOTY,10233247.0,off my head (Feat. Bryn),199089.0,2018.0,827147.0,GN0300,FUTURISTIC SWAVER,False,2010,솔로,남성,대한민국
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,"['GN0303', 'GN0301']",20170123.0,`Season of Suffering` (고난의 시기),10033017.0,I Am Me (Feat. 화사 Of 마마무),123107.0,2017.0,402526.0,GN0300,San E,True,2010.09.13,솔로,남성,대한민국
1114,"['GN0105', 'GN1501', 'GN0101', 'GN1504']",20180329.0,나의 아저씨 OST Part.2,10152984.0,어른,286934.0,2018.0,1023977.0,GN1500,Sondia,True,2010,0,0,대한민국
1115,"['GN0401', 'GN2502', 'GN0403', 'GN2501']",20170608.0,권지용,10069644.0,OUTRO. 신곡(神曲) (Divina Commedia),244176.0,2017.0,6984.0,GN0400,G-DRAGON,True,2009,솔로,남성,대한민국
1116,"['GN2503', 'GN0105', 'GN2501', 'GN0101', 'GN25...",20200115.0,Purpose - The 2nd Album Repackage,10376260.0,내게 들려주고 싶은 말 (Dear Me),157393.0,2020.0,236797.0,GN2500,태연 (TAEYEON),True,2015.10.07,솔로,여성,대한민국


In [None]:
# Count the rows for each artist_gender value.
sampling_df.loc[:, 'artist_gender'].value_counts()

남성    609
0     205
여성    194
혼성    110
Name: artist_gender, dtype: int64

In [None]:
# Split the sample by the artist_gender label: '여성' (female) and '남성' (male).
gender = sampling_df['artist_gender']
female_artist_df = sampling_df.loc[gender.eq('여성')]
male_artist_df = sampling_df.loc[gender.eq('남성')]

In [None]:
# Display the rows whose artist_gender is '여성'.
female_artist_df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,alb_name,alb_id,song_name,id,issue_year,artist_id,song_gn_gnr,artist_name,is_top,debut_date,artist_type,artist_gender,artist_nationality
8,"['GN1201', 'GN1209']",20181116.0,Acrylic,10222611.0,Girl Blunt,410091.0,2018.0,857869.0,GN1200,Leikeli47,False,2010,솔로,여성,미국
9,"['GN1301', 'GN1308', 'GN1302']",20191107.0,Hot Pink,10348753.0,Addiction,453890.0,2019.0,764646.0,GN1300,Doja Cat,False,2010,솔로,여성,미국
24,"['GN1912', 'GN1901']",20170202.0,Five,10061569.0,Sharp,341978.0,2017.0,686207.0,GN1900,Negoto,False,2010,그룹,여성,일본
30,['GN1801'],20170522.0,마음이 아픈 날,10064063.0,마음이 아픈 날,303178.0,2017.0,1229978.0,GN1800,플라코,False,2010,솔로,여성,대한민국
32,"['GN1912', 'GN1902', 'GN1901']",20180209.0,When the Sun Will Rise,10136965.0,The Silence,389487.0,2018.0,50006.0,GN1900,Shoko,False,2000,솔로,여성,일본
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,"['GN0908', 'GN0901']",20190820.0,Stranger&#39;s Arms (Feat. DCF) Remix,10318807.0,Stranger&#39;s Arms (Feat. DCF) Remix,65650.0,2019.0,2138201.0,GN0900,Lo Lo,False,2010,솔로,여성,캐나다
1098,['GN0901'],20200320.0,Cherry,10404320.0,Cherry (Feat. lil aaron),295965.0,2020.0,2765225.0,GN0900,Almondmilkhunni,False,2020,솔로,여성,미국
1101,"['GN1014', 'GN1008', 'GN1001']",20190208.0,What Chaos is Imaginary,10240915.0,What Chaos is Imaginary,47302.0,2019.0,1703988.0,GN1000,Girlpool,False,2010,솔로,여성,미국
1104,"['GN0105', 'GN1501', 'GN0101', 'GN1504']",20180217.0,화유기 OST Part.8,10139894.0,Always You,707671.0,2018.0,521301.0,GN1500,leeSA (리싸),False,2011,솔로,여성,대한민국


In [None]:
def proportion_t_test(data1, data2, alpha):
    """Two-sided pooled z-test for the difference between two proportions.

    NOTE(review): the original cell stopped at an unfinished `p_hat1 =`
    assignment. This body completes it with the standard pooled
    two-proportion z-test; confirm that this is the intended test and
    return shape. The normal approximation is weak when a sample has very
    few successes.

    Parameters
    ----------
    data1, data2 : array-like of bool or 0/1 values
        One entry per observation, where a truthy value counts as a success.
    alpha : float
        Significance level used for the reject decision.

    Returns
    -------
    tuple of (float, float, bool)
        The z statistic, the two-sided p-value, and whether the null
        hypothesis of equal proportions is rejected (p-value < alpha).

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    sample1 = np.asarray(data1, dtype = float)
    sample2 = np.asarray(data2, dtype = float)
    n1, n2 = sample1.size, sample2.size
    if n1 == 0 or n2 == 0:
        raise ValueError('both samples must contain at least one observation')

    #calculate p_hat
    p_hat1 = sample1.mean()
    p_hat2 = sample2.mean()

    # Pooled proportion under the null hypothesis that both groups share one rate.
    p_pooled = (sample1.sum() + sample2.sum()) / (n1 + n2)
    std_err = math.sqrt(p_pooled * (1 - p_pooled) * (1 / n1 + 1 / n2))
    if std_err == 0:
        # Every observation in both samples is identical: no evidence of a difference.
        return 0.0, 1.0, False

    z_stat = (p_hat1 - p_hat2) / std_err
    # Two-sided tail probability of the standard normal: 2 * (1 - Phi(|z|)).
    p_value = math.erfc(abs(z_stat) / math.sqrt(2))
    return float(z_stat), float(p_value), bool(p_value < alpha)

In [None]:
# Count the `is_top` values among the rows in male_artist_df.
male_artist_df.loc[:, 'is_top'].value_counts()

False    605
True       4
Name: is_top, dtype: int64