In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

%matplotlib inline


#ランダムシードの設定
import random
np.random.seed(2021)
random.seed(2021)

In [195]:
input = Path('input')
input_data = pd.read_csv(input / 'join_data.csv')
input_data.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,tempo,region
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,121-152,region_H
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,153-176,region_I
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,64-76,region_E
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,177-192,region_C
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,97-120,unknown


In [196]:
_df = input_data["tempo"].str.split("-").apply(pd.Series).astype(float)
input_data['min_tempo'] = input_data['tempo'].map(lambda x: min(map(int, x.split("-"))))
input_data['max_tempo'] = input_data['tempo'].map(lambda x: max(map(int, x.split('-'))))
input_data['diff_tempo'] = input_data['max_tempo'] - input_data['min_tempo']
input_data['var_tempo'] = _df.var(axis=1)
input_data['sum_tempo'] = input_data['min_tempo'] + input_data['max_tempo']
input_data['mean_tempo'] = _df.mean(axis=1)

In [207]:
cols = [
    'popularity',
    'duration_ms',
    'acousticness',
    'positiveness',
    'danceability',
    'loudness',
    'energy',
    'liveness',
    'speechiness',
    'instrumentalness',
    'mean_tempo',
    'max_tempo',
    'region'
]

input_data = input_data[cols]
input_data.head()

Unnamed: 0,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,speechiness,instrumentalness,mean_tempo,max_tempo,region
0,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,0.390108,0.888884,136.5,152,region_H
1,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,0.161497,0.12391,164.5,176,region_I
2,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,0.369057,0.16647,70.0,76,region_E
3,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,0.226677,0.175399,184.5,192,region_C
4,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,0.222488,0.22603,108.5,120,unknown


In [203]:
def max_min(x):
    return x.max() - x.min()

def q75_q25(x):
    return x.quantile(0.75) - x.quantile(0.25)

def q75(x):
    return x.quantile(0.75)

def q25(x):
    return x.quantile(0.25)

def count(x):
    return x.count()

def top(x):
    return x.mode().count()

def sumx(x):
    return x.max() + x.min()

In [208]:
def get_agg_region_features(input_df):

    group_key = "region"
    group_values = [
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
        'mean_tempo',
        'max_tempo'
    ]
    agg_methods = ["min", "mean", "max", "median","std", max_min, q75_q25, "z-score",count,sumx]
    encoder = GroupingEngine(group_key=group_key, group_values=group_values, agg_methods=agg_methods)
    output_df = encoder.fit_transform(input_data)
    return output_df

In [209]:
agg_region_feature = get_agg_region_features(input_data)

In [33]:
class GroupingEngine: 

    def __init__(self, group_key, group_values, agg_methods):
        self.group_key = group_key
        self.group_values = group_values

        ex_trans_methods = ["val-mean", "z-score"]
        self.ex_trans_methods = [m for m in agg_methods if m in ex_trans_methods]
        self.agg_methods = [m for m in agg_methods if m not in self.ex_trans_methods]
        self.df = None

    def fit(self, input_df, y=None):
        new_df = []
        for agg_method in self.agg_methods:

            for col in self.group_values:
                if callable(agg_method):
                    agg_method_name = agg_method.__name__
                else:
                    agg_method_name = agg_method

                new_col = f"agg_{agg_method_name}_{col}_grpby_{self.group_key}"
                df_agg = (input_df[[col] + [self.group_key]].groupby(self.group_key)[[col]].agg(agg_method))
                df_agg.columns = [new_col]
                new_df.append(df_agg)
        self.df = pd.concat(new_df, axis=1).reset_index()

    def transform(self, input_df):
        output_df = pd.merge(input_df[[self.group_key]], self.df, on=self.group_key, how="left")
        if len(self.ex_trans_methods) != 0:
            output_df = self.ex_transform(input_df, output_df)
        output_df.drop(self.group_key, axis=1, inplace=True)
        return output_df

    def ex_transform(self, df1, df2):
        """
        df1: input_df
        df2: output_df
        return: output_df (added ex transformed features)
        """

        if "val-mean" in self.ex_trans_methods:
            df2[self._get_col("val-mean")] = df1[self.group_values].values - df2[self._get_col("mean")].values
        if "z-score" in self.ex_trans_methods:
            df2[self._get_col("z-score")] = (df1[self.group_values].values - df2[self._get_col("mean")].values) \
                                            / (df2[self._get_col("std")].values + 1e-3)
        return df2

    def _get_col(self, method):
        return np.sort([f"agg_{method}_{group_val}_grpby_{self.group_key}" for group_val in self.group_values])

    def fit_transform(self, input_df, y=None):
        self.fit(input_df, y=y)
        return self.transform(input_df)

In [210]:
agg_region_feature.to_csv(input / 'agg_region_feature2.csv',index=False)

In [16]:
agg_region_feature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8092 entries, 0 to 8091
Columns: 108 entries, agg_min_popularity_grpby_region to agg_z-score_speechiness_grpby_region
dtypes: float64(91), int64(17)
memory usage: 6.7 MB


In [27]:
features_data = pd.read_csv(input / 'features_data.csv')

In [22]:
features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8092 entries, 0 to 8091
Data columns (total 82 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   region_A                             8092 non-null   int64  
 1   region_B                             8092 non-null   int64  
 2   region_C                             8092 non-null   int64  
 3   region_D                             8092 non-null   int64  
 4   region_E                             8092 non-null   int64  
 5   region_F                             8092 non-null   int64  
 6   region_G                             8092 non-null   int64  
 7   region_H                             8092 non-null   int64  
 8   region_I                             8092 non-null   int64  
 9   region_J                             8092 non-null   int64  
 10  region_K                             8092 non-null   int64  
 11  region_L                      

In [40]:
input_data['factor_1'] = features_data['factor_1']
input_data['factor_2'] = features_data['factor_2']
input_data['factor_3'] = features_data['factor_3']

input_data['music_nature_mul'] = features_data['music_nature_mul']
input_data['nature_factor'] = features_data['nature_factor']
input_data['exciting_factor'] = features_data['exciting_factor']
input_data['language_factor_mul'] = features_data['language_factor_mul']
input_data['standardscaled_popularity'] = features_data['standardscaled_popularity']
input_data['standardscaled_duration_ms'] = features_data['standardscaled_duration_ms']
input_data['standardscaled_acousticness'] = features_data['standardscaled_acousticness']
input_data['standardscaled_positiveness'] = features_data['standardscaled_positiveness']
input_data['standardscaled_danceability'] = features_data['standardscaled_danceability']
input_data['standardscaled_loudness'] = features_data['standardscaled_loudness']
input_data['standardscaled_energy'] = features_data['standardscaled_energy']
input_data['standardscaled_liveness'] = features_data['standardscaled_liveness']
input_data['standardscaled_speechiness'] = features_data['standardscaled_speechiness']
input_data['standardscaled_instrumentalness'] = features_data['standardscaled_instrumentalness']

In [2]:
input = Path('input')

features_data = pd.read_csv(input / 'features_data.csv')

In [7]:
features = ['factor_1',
            'factor_2',
            'factor_3',
            'music_nature_mul',
            'nature_factor',
            'exciting_factor',
            'language_factor_mul'
]

data = features_data[features]
data.head()

Unnamed: 0,factor_1,factor_2,factor_3,music_nature_mul,nature_factor,exciting_factor,language_factor_mul
0,-1.420504,2.259227,-0.021925,-31.822863,1.90734,0.985881,1.374217
1,-1.265113,0.284038,0.253307,-0.015749,0.857923,-0.79532,0.214458
2,0.808452,0.326082,-0.240502,0.010537,-0.56617,-1.492048,-0.639576
3,-1.235781,1.103026,0.207864,0.030949,1.212081,-2.156487,0.218308
4,-0.324308,-1.58623,-0.732366,0.004176,0.513928,3.527748,-0.16457


In [13]:
data['region'] = input_data['region']
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,factor_1,factor_2,factor_3,music_nature_mul,nature_factor,exciting_factor,language_factor_mul,region
0,-1.420504,2.259227,-0.021925,-31.822863,1.90734,0.985881,1.374217,region_H
1,-1.265113,0.284038,0.253307,-0.015749,0.857923,-0.79532,0.214458,region_I
2,0.808452,0.326082,-0.240502,0.010537,-0.56617,-1.492048,-0.639576,region_E
3,-1.235781,1.103026,0.207864,0.030949,1.212081,-2.156487,0.218308,region_C
4,-0.324308,-1.58623,-0.732366,0.004176,0.513928,3.527748,-0.16457,unknown


In [14]:
def get_agg_region_features(input_df):

    group_key = "region"
    group_values = ['factor_1',
                    'factor_2',
                    'factor_3',
                    'music_nature_mul',
                    'nature_factor',
                    'exciting_factor',
                    'language_factor_mul']
    agg_methods = ["min", "mean", "max", "median", "std", max_min, q75_q25, "z-score"]
    encoder = GroupingEngine(group_key=group_key, group_values=group_values, agg_methods=agg_methods)
    output_df = encoder.fit_transform(input_data)
    return output_df

In [17]:
agg_region_feature = get_agg_region_features(data)

KeyError: "['factor_1'] not in index"

In [16]:
data = data.reset_index()

In [18]:
data.head()

Unnamed: 0,index,factor_1,factor_2,factor_3,music_nature_mul,nature_factor,exciting_factor,language_factor_mul,region
0,0,-1.420504,2.259227,-0.021925,-31.822863,1.90734,0.985881,1.374217,region_H
1,1,-1.265113,0.284038,0.253307,-0.015749,0.857923,-0.79532,0.214458,region_I
2,2,0.808452,0.326082,-0.240502,0.010537,-0.56617,-1.492048,-0.639576,region_E
3,3,-1.235781,1.103026,0.207864,0.030949,1.212081,-2.156487,0.218308,region_C
4,4,-0.324308,-1.58623,-0.732366,0.004176,0.513928,3.527748,-0.16457,unknown


In [39]:
agg_region_feature.to_csv(input / 'agg_region_feature_new.csv',index=False)

In [45]:
agg_region_feature_standardscled.to_csv(input / 'agg_region_feature_standardscaled.csv',index=False)

In [130]:
agg_region_feature_addtempo.to_csv(input / 'agg_region_feature_notempo.csv',index=False)

In [121]:
import category_encoders as ce

def get_ce_features(input_df):
    # count encording した特徴量
    _input_df = pd.concat([input_df,
                           get_binned_popularity_features(input_df)], axis=1)
    
    cols = ["region", "popularity10"]
    encoder = ce.CountEncoder()
    output_df = encoder.fit_transform(_input_df[cols]).add_prefix("CE_")
    return output_df

def get_oe_features(input_df):
    # ordinal encording (label encording)した特徴量
    cols = ["region"]
    encoder = ce.OrdinalEncoder()
    output_df = encoder.fit_transform(input_df[cols]).add_prefix("OE_")
    return output_df

def get_binned_popularity_features(input_df):
    # popularity の10の位と1の位の特徴量
    tmp = input_df["popularity"].astype(str).str.zfill(2)
    tmp = [[i[0], i[1]] for i in tmp]
    output_df = pd.DataFrame(tmp, columns=["popularity10", "popularity01"])
    return output_df.astype(int)

In [123]:
get_ce_features = get_ce_features(input_data)

In [124]:
get_oe_features = get_oe_features(input_data)

In [125]:
get_binned_popularity_features = get_binned_popularity_features(input_data)

In [126]:
get_ce_features.to_csv(input / 'get_ce_features.csv',index=False)
get_oe_features.to_csv(input / 'get_oe_features.csv',index=False)
get_binned_popularity_features.to_csv(input / 'get_binned_popularity_features.csv',index=False)

In [131]:
def get_tmpo_features(input_df):
    # tmpo に関する特徴量
    _df = input_df["tempo"].str.split("-").apply(pd.Series).astype(float)
    _df.columns = ["tempo_low", "tempo_high"]
    output_df = _df.copy()
    output_df["diff_tempo"] = _df["tempo_high"] - _df["tempo_low"]
    output_df["var_tempo"] = _df.var(axis=1)
    output_df["sum_tempo"] = _df.sum(axis=1)
    return output_df

In [134]:
get_tempo_features = get_tmpo_features(input_data)

In [135]:
get_tempo_features.to_csv(input / 'get_tempo_features.csv',index=False)

In [146]:
factor = pd.read_csv(input / 'factor_features_data.csv')

In [188]:
input_data = pd.concat([input_data,factor],axis=1)