In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

%matplotlib inline

# Load the competition CSVs from the local `input/` directory.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# other cells in this notebook reference it.
input = Path('input')
train_df, test_df, genre_label = (
    pd.read_csv(input / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'genre_labels.csv')
)
# The sample submission file has no header row, so the column labels are set by hand.
sample_submission_df = pd.read_csv(input / 'sample_submit.csv',header=None)
sample_submission_df.columns = ['index','genre']

# Seed both NumPy's and the standard library's global random generators.
import random
SEED = 2021
np.random.seed(SEED)
random.seed(SEED)

In [56]:
# Combine the train and test data
def merge_data(train_df,test_df):
    """Stack the training and test frames into one frame with a fresh 0..n-1 index.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows.
    test_df : pandas.DataFrame
        Test rows. If it has no 'genre' column, one filled with the
        placeholder value 999 is added to the copy used for concatenation.

    Returns
    -------
    pandas.DataFrame
        train_df followed by test_df, concatenated with ignore_index=True.

    Notes
    -----
    The caller's test_df is never modified: the placeholder column is added
    with DataFrame.assign, which returns a new frame.
    """
    if 'genre' not in test_df.columns:
        # assign() returns a new frame, so the placeholder does not leak into the caller's test_df
        test_df = test_df.assign(genre=999)
    return pd.concat([train_df,test_df],ignore_index=True)

# Build the combined train + test frame and preview its first rows.
join_data = merge_data(train_df,test_df)
join_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


In [57]:
# Attach the genre name: look up each numeric genre code in genre_label
# (column 'labels' -> column 'genre'). Codes missing from the lookup map to NaN.
label_to_name = dict(zip(genre_label['labels'], genre_label['genre']))
join_data['genre_name'] = join_data['genre'].map(label_to_name)

In [4]:
# Count how many rows carry each distinct tempo value.
join_data['tempo'].value_counts()

121-152    2793
97-120     2287
77-96      1482
153-176     867
177-192     257
64-76       241
193-208     101
57-63        29
0-40         20
51-56         8
209-220       5
41-50         2
Name: tempo, dtype: int64

In [5]:
# Count how many rows carry each distinct region value.
join_data['region'].value_counts()        # region_M appears only in train_df -> ignoring it may be an option?

region_I    1573
region_E    1353
unknown      696
region_B     673
region_P     665
region_K     462
region_D     409
region_H     385
region_T     345
region_F     295
region_O     290
region_L     225
region_C     171
region_S     128
region_G     109
region_R      85
region_N      75
region_Q      74
region_J      58
region_A      18
region_M       3
Name: region, dtype: int64

In [4]:
# One-hot encode the region column, keeping an indicator column for every category
# (get_dummies does not drop the first level unless asked to).
region_values = join_data['region']
dummy_region = pd.get_dummies(region_values)
dummy_region.head()

Unnamed: 0,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,...,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Rework the tempo variable: fold the buckets at both extremes into two wider buckets.
tempo_bucket_merges = {
    '0-63': ['0-40','41-50','51-56','57-63'],
    '193-220': ['209-220','193-208'],
}
for merged_label, old_labels in tempo_bucket_merges.items():
    in_old_bucket = join_data['tempo'].isin(old_labels)
    join_data.loc[in_old_bucket,'tempo'] = merged_label

join_data['tempo'].value_counts()

121-152    2793
97-120     2287
77-96      1482
153-176     867
177-192     257
64-76       241
193-220     106
0-63         59
Name: tempo, dtype: int64

In [8]:
# Replace each tempo range label (e.g. "121-152") with the midpoint of its bounds (136.5).
# Values that are not strings (i.e. already converted) pass through unchanged, so
# re-running this cell no longer fails with AttributeError on float.split.
join_data['tempo'] = join_data['tempo'].map(
    lambda x: sum(map(int, x.split("-"))) / 2 if isinstance(x, str) else x
)
join_data['tempo'].head()

0    136.5
1    164.5
2     70.0
3    184.5
4    108.5
Name: tempo, dtype: float64

In [9]:
# Append the one-hot region columns to join_data.
# Dummy columns left over from an earlier run of this cell are dropped first, so
# executing the cell again does not produce duplicated column labels.
join_data = pd.concat(
    [join_data.drop(columns=dummy_region.columns, errors='ignore'), dummy_region],
    axis=1,
)
join_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,...,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,unknown
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,...,0,0,0,0,0,0,0,0,0,0
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,...,0,0,0,0,0,0,0,0,0,0
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,...,0,0,0,0,0,0,0,0,0,0
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,...,0,0,0,0,0,0,0,0,0,0
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Turn the literal string 'None' into a real missing value everywhere in the frame.
join_data = join_data.replace('None',np.nan)

# Cast each audio feature to float and fill its missing values with the column mean.
# The mean is taken over join_data as a whole (train and test rows together).
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on
# join_data[col] is chained assignment, which raises a warning on recent pandas and
# leaves join_data unchanged when Copy-on-Write is enabled.
for col in ['acousticness','positiveness','danceability','energy',
            'liveness','speechiness','instrumentalness']:
    as_float = join_data[col].astype('float64')
    join_data[col] = as_float.fillna(as_float.mean())

In [11]:
# Row-wise total of the seven audio descriptors, accumulated left to right from 0.
nature_columns = (
    "acousticness",
    "positiveness",
    "danceability",
    "energy",
    "liveness",
    "speechiness",
    "instrumentalness",
)
join_data['music_nature_sum'] = sum(join_data[name] for name in nature_columns)
join_data['music_nature_sum'].head()

0    2.994378
1    2.356047
2    2.412058
3    2.415582
4    3.066767
Name: music_nature_sum, dtype: float64

In [3]:
# Write join_data to input/join_data.csv, omitting the row index.
join_data.to_csv(input / 'join_data.csv',index=False)

In [13]:
# Summarise column dtypes and non-null counts.
join_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8092 entries, 0 to 8091
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             8092 non-null   int64  
 1   genre             8092 non-null   int64  
 2   popularity        8092 non-null   int64  
 3   duration_ms       8092 non-null   int64  
 4   acousticness      8092 non-null   float64
 5   positiveness      8092 non-null   float64
 6   danceability      8092 non-null   float64
 7   loudness          8092 non-null   float64
 8   energy            8092 non-null   float64
 9   liveness          8092 non-null   float64
 10  speechiness       8092 non-null   float64
 11  instrumentalness  8092 non-null   float64
 12  tempo             8092 non-null   float64
 13  region            8092 non-null   object 
 14  genre_name        4046 non-null   object 
 15  region_A          8092 non-null   uint8  
 16  region_B          8092 non-null   uint8  


In [14]:
# Planning notes kept as a bare string literal (the cell echoes it back as its output).
# English translation of the notes:
#   1. try combinations (add, multiply, divide, ...) of the listed columns
#   2. standardisation
#   3. how should missing values be handled?
'''
    1. 'popularity', 'duration_ms', 'acousticness',
    'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
    'speechiness', 'instrumentalness', 'log_tempo', 'num_nans',
    等の組み合わせ(足す、かける、割る等)
    2. 標準化
    3. 欠損値どうする?
'''

"\n    1. 'popularity', 'duration_ms', 'acousticness',\n    'positiveness', 'danceability', 'loudness', 'energy', 'liveness',\n    'speechiness', 'instrumentalness', 'log_tempo', 'num_nans',\n    等の組み合わせ(足す、かける、割る等)\n    2. 標準化\n    3. 欠損値どうする?\n"

In [18]:
# Load input/cluster_data.csv and preview it.
# NOTE(review): this notebook does not show how cluster_data.csv was produced — confirm its source.
cluster_df = pd.read_csv(input / 'cluster_data.csv')
cluster_df.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
0,-0.261937,5.044936,2.521302,5,0,0,0,0,0,1
1,-1.661705,0.255611,-1.272544,0,1,0,0,0,0,0
2,1.268983,0.030598,0.715309,4,0,0,0,0,1,0
3,-1.40778,1.502747,-0.80962,0,1,0,0,0,0,0
4,-1.798215,-1.600899,0.474826,1,0,1,0,0,0,0


In [19]:
# Append the cluster/PCA columns to join_data.
# concat(axis=1) pairs rows by index label and silently pads mismatches with NaN,
# so verify that both frames share the same index before gluing them together.
assert join_data.index.equals(cluster_df.index), (
    "join_data and cluster_df must have identical row indexes before column-wise concat"
)
join_data = pd.concat([join_data,cluster_df],axis=1)
join_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,...,pca_0,pca_1,pca_2,pca_cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,...,-0.261937,5.044936,2.521302,5,0,0,0,0,0,1
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,...,-1.661705,0.255611,-1.272544,0,1,0,0,0,0,0
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,...,1.268983,0.030598,0.715309,4,0,0,0,0,1,0
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,...,-1.40778,1.502747,-0.80962,0,1,0,0,0,0,0
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,...,-1.798215,-1.600899,0.474826,1,0,1,0,0,0,0


In [20]:
# Write the widened join_data to input/new_join_data.csv, omitting the row index.
join_data.to_csv(input / 'new_join_data.csv',index=False)

In [11]:
import pandas as pd
from pathlib import Path

# Directory that holds the CSV files read and written by this notebook.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = Path('input')

# Read new_join_data.csv and factor_data.csv from that directory.
new_join_data = pd.read_csv(input / 'new_join_data.csv')
factor_data = pd.read_csv(input / 'factor_data.csv')

In [12]:
# Preview the first rows of the reloaded frame.
new_join_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,...,pca_0,pca_1,pca_2,pca_cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,...,-0.261937,5.044936,2.521302,5,0,0,0,0,0,1
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,...,-1.661705,0.255611,-1.272544,0,1,0,0,0,0,0
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,...,1.268983,0.030598,0.715309,4,0,0,0,0,1,0
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,...,-1.40778,1.502747,-0.80962,0,1,0,0,0,0,0
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,...,-1.798215,-1.600899,0.474826,1,0,1,0,0,0,0


In [13]:
# Preview the first rows of factor_data.
factor_data.head()

Unnamed: 0,factor_1,factor_2,factor_3
0,-1.420504,2.259227,-0.021925
1,-1.265113,0.284038,0.253307
2,0.808452,0.326082,-0.240502
3,-1.235781,1.103026,0.207864
4,-0.324308,-1.58623,-0.732366


In [14]:
# Print (rows, columns) for both frames, one per line, to compare their row counts.
for frame in (new_join_data, factor_data):
    print(frame.shape)

(8092, 47)
(8092, 3)


In [15]:
# Append the factor columns to new_join_data.
# concat(axis=1) pairs rows by index label and silently pads mismatches with NaN,
# so verify that both frames share the same index before gluing them together.
assert new_join_data.index.equals(factor_data.index), (
    "new_join_data and factor_data must have identical row indexes before column-wise concat"
)
concat_data = pd.concat([new_join_data,factor_data],axis=1)
concat_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,...,pca_cluster,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,factor_1,factor_2,factor_3
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,...,5,0,0,0,0,0,1,-1.420504,2.259227,-0.021925
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,...,0,1,0,0,0,0,0,-1.265113,0.284038,0.253307
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,...,4,0,0,0,0,1,0,0.808452,0.326082,-0.240502
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,...,0,1,0,0,0,0,0,-1.235781,1.103026,0.207864
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,...,1,0,1,0,0,0,0,-0.324308,-1.58623,-0.732366


In [16]:
# (rows, columns) of the concatenated frame.
concat_data.shape

(8092, 50)

In [17]:
# Write concat_data to input/new_join_data.csv, omitting the row index.
# NOTE(review): this overwrites the same new_join_data.csv that new_join_data was
# loaded from, so reloading and concatenating again would add the factor columns twice.
concat_data.to_csv(input / 'new_join_data.csv',index=False)

In [17]:
# Load input/features_data.csv and preview it.
features_data = pd.read_csv(input / 'features_data.csv') 
features_data.head()

Unnamed: 0,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,...,factor_1,factor_2,factor_3,music_nature,nature_factor,exciting_factor,language_factor,song_factor,standardscaled_tempo_log,standardscaled_tempo
0,0,0,0,0,0,0,0,1,0,0,...,-1.420504,2.259227,-0.021925,5.742936,1.90734,0.985881,2.849715,5.7687,-0.726893,0.483409
1,0,0,0,0,0,0,0,0,1,0,...,-1.265113,0.284038,0.253307,-0.86371,0.857923,-0.79532,-0.926314,-0.062652,0.355492,1.426883
2,0,0,0,0,1,0,0,0,0,0,...,0.808452,0.326082,-0.240502,-0.393039,-0.56617,-1.492048,1.665179,-0.704443,,-1.757342
3,0,0,1,0,0,0,0,0,0,0,...,-1.235781,1.103026,0.207864,0.068024,1.212081,-2.156487,1.012429,0.382009,0.742315,2.100792
4,0,0,0,0,0,0,0,0,0,0,...,-0.324308,-1.58623,-0.732366,3.675958,0.513928,3.527748,-0.365718,0.993303,,-0.460065


In [18]:
# Number of missing values in the tempo column.
join_data['tempo'].isnull().sum()

0

In [28]:
# Replace each tempo range label (e.g. "121-152") with the midpoint of its bounds (136.5).
# Values that are not strings (i.e. already converted) pass through unchanged, so
# re-running this cell no longer fails with AttributeError on float.split.
join_data['tempo'] = join_data['tempo'].map(
    lambda x: sum(map(int, x.split("-"))) / 2 if isinstance(x, str) else x
)
join_data['tempo'].head()

0    136.5
1    164.5
2     70.0
3    184.5
4    108.5
Name: tempo, dtype: float64

In [29]:
# Take the tempo column and store its natural logarithm as a new feature column.
sc_data = join_data['tempo']
features_data['tempo_log'] = np.log(sc_data)

In [33]:
import scipy.stats

# Log-transform tempo, then standardise it to zero mean / unit variance.
# The log is recomputed from join_data['tempo'] on every run; the previous
# `sc_data = np.log(sc_data)` took the log of its own output whenever the cell
# was executed more than once.
sc_data = np.log(join_data['tempo'])

# NOTE: despite its name, this column holds the z-score of log(tempo), not of raw tempo.
features_data["standardscaled_tempo"] = scipy.stats.zscore(sc_data)
features_data.head()

Unnamed: 0,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,...,factor_1,factor_2,factor_3,music_nature,nature_factor,exciting_factor,language_factor,song_factor,tempo_log,standardscaled_tempo
0,0,0,0,0,0,0,0,1,0,0,...,-1.420504,2.259227,-0.021925,5.742936,1.90734,0.985881,2.849715,5.7687,4.916325,0.549758
1,0,0,0,0,0,0,0,0,1,0,...,-1.265113,0.284038,0.253307,-0.86371,0.857923,-0.79532,-0.926314,-0.062652,5.102911,1.269242
2,0,0,0,0,1,0,0,0,0,0,...,0.808452,0.326082,-0.240502,-0.393039,-0.56617,-1.492048,1.665179,-0.704443,4.248495,-2.025425
3,0,0,1,0,0,0,0,0,0,0,...,-1.235781,1.103026,0.207864,0.068024,1.212081,-2.156487,1.012429,0.382009,5.217649,1.711681
4,0,0,0,0,0,0,0,0,0,0,...,-0.324308,-1.58623,-0.732366,3.675958,0.513928,3.527748,-0.365718,0.993303,4.68675,-0.335492


In [34]:
# Summarise column dtypes and non-null counts of the feature table.
features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8092 entries, 0 to 8091
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   region_A                         8092 non-null   int64  
 1   region_B                         8092 non-null   int64  
 2   region_C                         8092 non-null   int64  
 3   region_D                         8092 non-null   int64  
 4   region_E                         8092 non-null   int64  
 5   region_F                         8092 non-null   int64  
 6   region_G                         8092 non-null   int64  
 7   region_H                         8092 non-null   int64  
 8   region_I                         8092 non-null   int64  
 9   region_J                         8092 non-null   int64  
 10  region_K                         8092 non-null   int64  
 11  region_L                         8092 non-null   int64  
 12  region_M            

In [35]:
# Write the feature table to input/features_data.csv, omitting the row index.
features_data.to_csv(input / 'features_data.csv',index=False)

In [31]:
# Remove the standardscaled_tempo column; drop() returns a new frame that is rebound to the same name.
features_data = features_data.drop(columns=['standardscaled_tempo'])
features_data.head()

Unnamed: 0,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,...,standardscaled_instrumentalness,factor_1,factor_2,factor_3,music_nature,nature_factor,exciting_factor,language_factor,song_factor,tempo_log
0,0,0,0,0,0,0,0,1,0,0,...,4.338655,-1.420504,2.259227,-0.021925,5.742936,1.90734,0.985881,2.849715,5.7687,4.916325
1,0,0,0,0,0,0,0,0,1,0,...,-0.588519,-1.265113,0.284038,0.253307,-0.86371,0.857923,-0.79532,-0.926314,-0.062652,5.102911
2,0,0,0,0,1,0,0,0,0,0,...,-0.314395,0.808452,0.326082,-0.240502,-0.393039,-0.56617,-1.492048,1.665179,-0.704443,4.248495
3,0,0,1,0,0,0,0,0,0,0,...,-0.256881,-1.235781,1.103026,0.207864,0.068024,1.212081,-2.156487,1.012429,0.382009,5.217649
4,0,0,0,0,0,0,0,0,0,0,...,0.069233,-0.324308,-1.58623,-0.732366,3.675958,0.513928,3.527748,-0.365718,0.993303,4.68675


In [30]:
# Preview the first rows of the feature table.
features_data.head()

Unnamed: 0,region_A,region_B,region_C,region_D,region_E,region_F,region_G,region_H,region_I,region_J,...,factor_1,factor_2,factor_3,music_nature,nature_factor,exciting_factor,language_factor,song_factor,standardscaled_tempo,tempo_log
0,0,0,0,0,0,0,0,1,0,0,...,-1.420504,2.259227,-0.021925,5.742936,1.90734,0.985881,2.849715,5.7687,0.549758,4.916325
1,0,0,0,0,0,0,0,0,1,0,...,-1.265113,0.284038,0.253307,-0.86371,0.857923,-0.79532,-0.926314,-0.062652,1.269242,5.102911
2,0,0,0,0,1,0,0,0,0,0,...,0.808452,0.326082,-0.240502,-0.393039,-0.56617,-1.492048,1.665179,-0.704443,-2.025425,4.248495
3,0,0,1,0,0,0,0,0,0,0,...,-1.235781,1.103026,0.207864,0.068024,1.212081,-2.156487,1.012429,0.382009,1.711681,5.217649
4,0,0,0,0,0,0,0,0,0,0,...,-0.324308,-1.58623,-0.732366,3.675958,0.513928,3.527748,-0.365718,0.993303,-0.335492,4.68675


In [5]:
# Upper bound of each tempo range label, e.g. "121-152" -> 152.
def _upper_bound(tempo_label):
    bounds = [int(part) for part in tempo_label.split("-")]
    return max(bounds)

join_data['tempo_max'] = join_data['tempo'].map(_upper_bound)
join_data['tempo_max'].head()

0    152
1    176
2     76
3    192
4    120
Name: tempo_max, dtype: int64

In [5]:
# Write the one-hot region columns to input/region_dummy.csv, omitting the row index.
dummy_region.to_csv(input / 'region_dummy.csv',index=False)

In [25]:
# Average popularity within each tempo group.
# GroupBy.mean() replaces the hand-written sum()/count() ratio, which built the same groupby twice.
join_data.groupby('tempo')['popularity'].mean()

tempo
0-40       27.350000
121-152    41.149660
153-176    39.295271
177-192    39.136187
193-208    42.425743
209-220    53.800000
41-50      35.500000
51-56      43.000000
57-63      40.724138
64-76      42.352697
77-96      42.181511
97-120     41.173590
Name: popularity, dtype: float64

In [26]:
# Average popularity within each region group.
# GroupBy.mean() replaces the hand-written sum()/count() ratio, which built the same groupby twice.
join_data.groupby('region')['popularity'].mean()

region
region_A    23.722222
region_B    38.789004
region_C    39.953216
region_D    42.239609
region_E    43.988914
region_F    41.220339
region_G    44.036697
region_H    33.594805
region_I    40.509218
region_J    67.051724
region_K    40.450216
region_L    38.968889
region_M    33.333333
region_N    33.746667
region_O    41.062069
region_P    42.762406
region_Q    42.581081
region_R    33.435294
region_S    43.070312
region_T    42.017391
unknown     41.011494
Name: popularity, dtype: float64

In [11]:
# Average duration_ms within each tempo group.
# GroupBy.mean() replaces the hand-written sum()/count() ratio, which built the same groupby twice.
join_data.groupby('tempo')['duration_ms'].mean()

tempo
0-40        27930.450000
121-152    246834.656642
153-176    241148.585928
177-192    238862.431907
193-208    232420.405941
209-220    220161.200000
41-50      184485.500000
51-56      216319.000000
57-63      248208.068966
64-76      239211.311203
77-96      240070.475034
97-120     240701.153039
Name: duration_ms, dtype: float64

In [12]:
# Average duration_ms within each region group.
# GroupBy.mean() replaces the hand-written sum()/count() ratio, which built the same groupby twice.
join_data.groupby('region')['duration_ms'].mean()

region
region_A    291180.388889
region_B    254407.407132
region_C    257904.222222
region_D    228842.990220
region_E    247786.749446
region_F    230083.776271
region_G    252804.899083
region_H    241215.114286
region_I    239207.812460
region_J    215581.448276
region_K    242752.699134
region_L    210673.080000
region_M    199781.333333
region_N    239480.306667
region_O    233289.700000
region_P    244185.780451
region_Q    233800.594595
region_R    250979.364706
region_S    229734.562500
region_T    246645.113043
unknown     244900.321839
Name: duration_ms, dtype: float64

In [13]:
# Load the feature table from input/features_data.csv.
features_data = pd.read_csv(input / 'features_data.csv')

In [15]:
# Summarise column dtypes and non-null counts of the feature table.
features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8092 entries, 0 to 8091
Data columns (total 87 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   region_A                             8092 non-null   int64  
 1   region_B                             8092 non-null   int64  
 2   region_C                             8092 non-null   int64  
 3   region_D                             8092 non-null   int64  
 4   region_E                             8092 non-null   int64  
 5   region_F                             8092 non-null   int64  
 6   region_G                             8092 non-null   int64  
 7   region_H                             8092 non-null   int64  
 8   region_I                             8092 non-null   int64  
 9   region_J                             8092 non-null   int64  
 10  region_K                             8092 non-null   int64  
 11  region_L                      

0     NaN
40    NaN
41    NaN
50    NaN
51    NaN
56    NaN
57    NaN
63    NaN
64    NaN
76    NaN
77    NaN
96    NaN
97    NaN
120   NaN
121   NaN
152   NaN
153   NaN
176   NaN
177   NaN
192   NaN
193   NaN
208   NaN
209   NaN
220   NaN
Name: music_nature, dtype: float64

In [49]:
# Two separate, empty frames that will receive the per-region and per-tempo statistics.
data_region, data_tempo = pd.DataFrame(), pd.DataFrame()

In [58]:
# Variance of popularity and duration_ms within each region and within each tempo group.
# Resulting columns are named <group>_<column>; the group key becomes the index.
variance_columns = ['popularity', 'duration_ms']
data_region = join_data.groupby('region')[variance_columns].var().add_prefix('region_')
data_tempo = join_data.groupby('tempo')[variance_columns].var().add_prefix('tempo_')
data_region

Unnamed: 0_level_0,region_popularity,region_duration_ms
region,Unnamed: 1_level_1,Unnamed: 2_level_1
region_A,194.212418,15265530000.0
region_B,239.092319,17248160000.0
region_C,246.386034,8943596000.0
region_D,350.060094,4963398000.0
region_E,237.728427,4327530000.0
region_F,180.757408,2048072000.0
region_G,145.017159,3058467000.0
region_H,274.809348,5620451000.0
region_I,266.613942,5730062000.0
region_J,8.50605,1473500000.0


In [59]:
# Display the per-tempo variance table.
data_tempo

Unnamed: 0_level_0,tempo_popularity,tempo_duration_ms
tempo,Unnamed: 1_level_1,Unnamed: 2_level_1
0-40,68.555263,3793353000.0
121-152,273.570001,7465764000.0
153-176,270.697934,5738932000.0
177-192,277.282162,5481376000.0
193-208,271.626931,4591649000.0
209-220,36.2,1368594000.0
41-50,144.5,241538200.0
51-56,181.714286,8500545000.0
57-63,238.849754,9407346000.0
64-76,269.154253,6883439000.0


In [60]:
# Left-join the per-region variances onto every row via its region value.
# NOTE(review): this rebinds the name `merge_data`, which an earlier cell defines as a
# function; after this cell runs, that function can no longer be called by this name.
merge_data = join_data.merge(data_region, how='left', on='region')
merge_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,genre_name,region_popularity,region_duration_ms
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,rock,274.809348,5620451000.0
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,pop,266.613942,5730062000.0
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,hip-hop,237.728427,4327530000.0
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,rock,246.386034,8943596000.0
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,hip-hop,271.461738,8692245000.0


In [61]:
# Left-join the per-tempo variances onto every row via its tempo value.
merge_data = merge_data.merge(data_tempo, how='left', on='tempo')
merge_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region,genre_name,region_popularity,region_duration_ms,tempo_popularity,tempo_duration_ms
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H,rock,274.809348,5620451000.0,273.570001,7465764000.0
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I,pop,266.613942,5730062000.0,270.697934,5738932000.0
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E,hip-hop,237.728427,4327530000.0,269.154253,6883439000.0
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C,rock,246.386034,8943596000.0,277.282162,5481376000.0
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown,hip-hop,271.461738,8692245000.0,250.53547,6893177000.0


In [62]:
# Keep only the four group-variance columns, in this order.
features = ['region_popularity', 'region_duration_ms', 'tempo_popularity', 'tempo_duration_ms']

groupdata = merge_data.loc[:, features]
groupdata.head()

Unnamed: 0,region_popularity,region_duration_ms,tempo_popularity,tempo_duration_ms
0,274.809348,5620451000.0,273.570001,7465764000.0
1,266.613942,5730062000.0,270.697934,5738932000.0
2,237.728427,4327530000.0,269.154253,6883439000.0
3,246.386034,8943596000.0,277.282162,5481376000.0
4,271.461738,8692245000.0,250.53547,6893177000.0


In [65]:
from sklearn.preprocessing import StandardScaler

# Standardise every group-variance column; fit and transform on the same data in one call.
sc = StandardScaler()
groupdata_sc = sc.fit_transform(groupdata)

groupdata_sc

array([[ 0.55036231, -0.26667548,  0.77915715,  0.93457894],
       [ 0.37617193, -0.23746153,  0.6223273 , -0.8777387 ],
       [-0.23777918, -0.61127191,  0.53803428,  0.32342603],
       ...,
       [ 0.55036231, -0.26667548,  0.77915715,  0.93457894],
       [-0.20879015,  2.83240433,  0.77915715,  0.93457894],
       [ 0.39598692,  0.05795362, -1.16262793, -1.47659522]])

In [67]:
# Wrap the scaled array in a DataFrame.
# NOTE(review): no column names are passed, so the columns are labelled 0..3 rather
# than with the original feature names — confirm downstream readers expect that.
groupdata_sc = pd.DataFrame(groupdata_sc)

In [68]:
# Preview the first rows of the scaled features.
groupdata_sc.head()

Unnamed: 0,0,1,2,3
0,0.550362,-0.266675,0.779157,0.934579
1,0.376172,-0.237462,0.622327,-0.877739
2,-0.237779,-0.611272,0.538034,0.323426
3,-0.053765,0.619027,0.981861,-1.148045
4,0.47921,0.552035,-0.478648,0.333646


In [69]:
# Write the scaled features to input/groupdata.csv, omitting the row index.
groupdata_sc.to_csv(input / 'groupdata.csv',index=False)