## 目的
- ゲームの売上であるGlobal_Salesを予測変数とし, NN, 決定木, SVMで予測を行い, 精度を比較する

### よく使うライブラリのimport

In [6]:
import os, sys

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
sns.set()
%config InlineBackend.figure_formats = {'png', 'retina'}

### データの読み込み

In [7]:
INPUT_DIR = '../inputs/'
game_df = pd.read_csv(os.path.join(INPUT_DIR, 'Video_Games_Sales_as_at_22_Dec_2016.csv'))

In [12]:
game_df.head(5).T

Unnamed: 0,0,1,2,3,4
Name,Wii Sports,Super Mario Bros.,Mario Kart Wii,Wii Sports Resort,Pokemon Red/Pokemon Blue
Platform,Wii,NES,Wii,Wii,GB
Year_of_Release,2006,1985,2008,2009,1996
Genre,Sports,Platform,Racing,Sports,Role-Playing
Publisher,Nintendo,Nintendo,Nintendo,Nintendo,Nintendo
NA_Sales,41.36,29.08,15.68,15.61,11.27
EU_Sales,28.96,3.58,12.76,10.93,8.89
JP_Sales,3.77,6.81,3.79,3.28,10.22
Other_Sales,8.45,0.77,3.29,2.95,1
Global_Sales,82.53,40.24,35.52,32.77,31.37


### trainデータとtestデータに分割

In [82]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(game_df, test_size=0.5, random_state=2)

In [83]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [84]:
train_df.shape, test_df.shape

((8359, 16), (8360, 16))

### 連続変数の特徴量

In [88]:
def preprocess(input_df: pd.DataFrame) -> pd.DataFrame:
    output_df = input_df.copy()

    idx_tbd = input_df['User_Score'] == 'tbd'
    output_df['User_Score_is_tbd'] = idx_tbd.astype(int)

    output_df['User_Score'] = input_df['User_Score'].replace('tbd', None).astype(float)
    return output_df

def create_continuous_features(input_df):
    input_df = preprocess(input_df)
    use_columns = [
        # 連続変数
        'Critic_Score',
        'Critic_Count', 
        'User_Score', 
        'User_Count',
        'Year_of_Release',
        'User_Score_is_tbd'
    ]
    return input_df[use_columns].copy()

### Booleanの特徴量

In [89]:
def create_boolean_feature(input_df):
    output_df = pd.DataFrame()

    texts = [
        'nintendo', 
    ]

    for t in texts:
        output_df[f'Developer_has_{t}'] = input_df['Developer'].fillna('').str.lower().str.contains(t).astype(int)
    return output_df

### CountEncoding

In [90]:
class BaseBlock(object):
    def fit(self, input_df, y=None):
        return self.transform(input_df)

    def transform(self, input_df):
        raise NotImplementedError()

In [91]:
class CountEncodingBlock(BaseBlock):
    def __init__(self, column, whole_df: pd.DataFrame):
        self.column = column
        self.whole_df = whole_df

    def transform(self, input_df):
        output_df = pd.DataFrame()
        c = self.column

        vc = self.whole_df[c].value_counts()
        output_df[c] = input_df[c].map(vc)
        return output_df.add_prefix('CE_')

### OneHotEncoding

In [92]:
class OneHotBlock(BaseBlock):
    def __init__(self, column: str):
        self.column = column
        self.cats_ = None

    def fit(self, input_df, y=None):
        vc = input_df[self.column].dropna().value_counts()
        cats = vc[vc > 40].index
        self.cats_ = cats
        return self.transform(input_df)

    def transform(self, input_df):
        x = pd.Categorical(input_df[self.column], categories=self.cats_)
        out_df = pd.get_dummies(x, dummy_na=False)
        out_df.columns = out_df.columns.tolist()
        return out_df.add_prefix(f'OH_{self.column}=')

In [93]:
class WrapperBlock(BaseBlock):
    def __init__(self, function):
        self.function = function

    def transform(self, input_df):
        return self.function(input_df)

In [94]:
process_blocks = [
    WrapperBlock(create_continuous_features),
    WrapperBlock(create_boolean_feature),
    *[OneHotBlock(c) for c in ['Platform', 'Genre', 'Publisher', 'Rating']],
    *[CountEncodingBlock(c, whole_df=game_df) for c in ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Rating']],
]

In [95]:
from tqdm import tqdm


def get_function(block, is_train):
    s = mapping = {
        True: 'fit',
        False: 'transform'
    }.get(is_train)
    return getattr(block, s)


def to_feature(input_df, 
               blocks,
               is_train=False):
    out_df = pd.DataFrame()

    for block in tqdm(blocks, total=len(blocks)):
        func = get_function(block, is_train)

        _df = func(input_df)

        assert len(_df) == len(input_df), func.__name__
        out_df = pd.concat([out_df, _df], axis=1)

    return out_df

In [113]:
train_feat_df = to_feature(train_df, process_blocks, is_train=True)
test_feat_df = to_feature(test_df, process_blocks)

100%|██████████| 12/12 [00:00<00:00, 102.30it/s]
100%|██████████| 12/12 [00:00<00:00, 198.00it/s]


### 欠損値を雑に0埋め

In [114]:
train_feat_df = train_feat_df.fillna(0)
test_feat_df = test_feat_df.fillna(0)

### ニューラルネットワーク

In [127]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

model = MLPRegressor(random_state=2)
model.fit(train_feat_df, train_df['Global_Sales'])

mean_absolute_error(model.predict(test_feat_df), test_df['Global_Sales'])

3.633149119441224

### 決定木

In [124]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

model = DecisionTreeRegressor()
model.fit(train_feat_df, train_df['Global_Sales'])

mean_absolute_error(model.predict(test_feat_df), test_df['Global_Sales'])

0.5037113750284804

### SVM

In [128]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

model = SVR()
model.fit(train_feat_df, train_df['Global_Sales'])

mean_absolute_error(model.predict(test_feat_df), test_df['Global_Sales'])

0.4373421514774

## 結果  
mae 平均絶対誤差を見た結果(パラメータはsklearnの初期値のまま)
- NN: 3.633149119441224
- 決定木(回帰木): 0.5037113750284804
- SVR(SVR): 0.4373421514774  

parameter searchを行っていないため, 性能比較とは言えない.  
特に, ニューラルネットワークに関しては, デフォルトのニューロン数がかなり少ない