## XGBoost-pca

In [18]:
import pandas as pd
import time
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [19]:
train = pd.read_csv('./data/train_age_dataset.csv')
test = pd.read_csv('./data/test_age_dataset.csv')
sample = pd.read_csv('./data/sample_submission.csv')

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488877 entries, 0 to 488876
Data columns (total 27 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Unnamed: 0                       488877 non-null  int64  
 1   userId                           488877 non-null  int64  
 2   tier                             488877 non-null  int64  
 3   gender                           488877 non-null  int64  
 4   following_rate                   488877 non-null  float64
 5   followers_avg_age                488877 non-null  float64
 6   following_avg_age                488877 non-null  float64
 7   max_repetitive_punc              488877 non-null  int64  
 8   num_of_hashtags_per_action       488877 non-null  float64
 9   emoji_count_per_action           488877 non-null  float64
 10  punctuations_per_action          488877 non-null  float64
 11  number_of_words_per_action       488877 non-null  float64
 12  av

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488877 entries, 0 to 488876
Data columns (total 27 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Unnamed: 0                       488877 non-null  int64  
 1   userId                           488877 non-null  int64  
 2   tier                             488877 non-null  int64  
 3   gender                           488877 non-null  int64  
 4   following_rate                   488877 non-null  float64
 5   followers_avg_age                488877 non-null  float64
 6   following_avg_age                488877 non-null  float64
 7   max_repetitive_punc              488877 non-null  int64  
 8   num_of_hashtags_per_action       488877 non-null  float64
 9   emoji_count_per_action           488877 non-null  float64
 10  punctuations_per_action          488877 non-null  float64
 11  number_of_words_per_action       488877 non-null  float64
 12  av

In [None]:
# カテゴリ変数であるtierとgenderのone-hotエンコーディング
train = pd.get_dummies(train, columns=['tier','gender'])
test = pd.get_dummies(test, columns=['tier','gender'])

In [23]:
train['age_group'] = train['age_group']-1

In [25]:
main_feature_cols = [col for col in train.columns.tolist() if col not in ['age_group']]
print(main_feature_cols)

['Unnamed: 0', 'userId', 'following_rate', 'followers_avg_age', 'following_avg_age', 'max_repetitive_punc', 'num_of_hashtags_per_action', 'emoji_count_per_action', 'punctuations_per_action', 'number_of_words_per_action', 'avgCompletion', 'avgTimeSpent', 'avgDuration', 'avgComments', 'creations', 'content_views', 'num_of_comments', 'weekends_trails_watched_per_day', 'weekdays_trails_watched_per_day', 'slot1_trails_watched_per_day', 'slot2_trails_watched_per_day', 'slot3_trails_watched_per_day', 'slot4_trails_watched_per_day', 'avgt2', 'tier_1', 'tier_2', 'tier_3', 'gender_1', 'gender_2']


この行のPythonコードは、train データフレームの中から、'age_group' を除いたすべての列名をリストとして取得しています。以下で詳しく解説します。

In [30]:
from sklearn.decomposition import PCA

#　使用する主成分の数
NUM_PCA = 3

# trainデータから目的変数age_groupを取り除く
train_features = train.drop('age_group', axis=1)

pca = PCA(n_components=NUM_PCA, random_state=0)
data = pd.concat([train_features,test])
pca_fts = pca.fit_transform(data[main_feature_cols]).reshape(-1, NUM_PCA)
pca_cols = []
for i in range(NUM_PCA):
    data[f'pca_{i}'] = pca_fts[:, i]
    pca_cols.append(f'pca_{i}')

train_features = data.iloc[:train.shape[0], :]
test = data.iloc[train.shape[0]:, :]

train = pd.concat([train_features, train['age_group']], axis=1)

In [31]:
train

Unnamed: 0.1,Unnamed: 0,userId,following_rate,followers_avg_age,following_avg_age,max_repetitive_punc,num_of_hashtags_per_action,emoji_count_per_action,punctuations_per_action,number_of_words_per_action,...,avgt2,tier_1,tier_2,tier_3,gender_1,gender_2,pca_0,pca_1,pca_2,age_group
0,265153,48958844,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.000000,...,0.000000,False,True,False,True,False,6.585837e+06,-18423.021887,-18.419889,0
1,405231,51100441,0.000000,0.000,0.00,0,0.0,0.0,0.076923,0.153846,...,82.500000,False,True,False,False,True,8.727685e+06,117759.739874,46.826853,1
2,57867,6887426,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.000000,...,0.000000,False,True,False,True,False,-3.548589e+07,-149191.428065,-451.848957,0
3,272618,50742404,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.000000,...,0.000000,False,True,False,True,False,8.369407e+06,-14201.868368,-32.360541,0
4,251123,45589200,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.000000,...,0.000000,False,True,False,False,True,3.216173e+06,-26324.460280,-72.742800,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488872,535266,38983571,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.300000,...,195.000000,False,True,False,False,True,-3.388928e+06,269832.050717,10.737846,0
488873,36706,53264639,0.017241,0.000,0.00,0,0.0,0.0,0.000000,0.012195,...,188.742857,True,False,False,True,False,1.089121e+07,-254700.674622,-94.600345,3
488874,31570,44693578,0.229508,1.625,2.25,0,0.0,0.0,0.000000,0.094340,...,184.880000,True,False,False,True,False,2.320153e+06,-244248.076255,-184.694046,3
488875,239209,42760145,0.000000,0.000,0.00,0,0.0,0.0,0.000000,0.000000,...,0.000000,False,True,False,True,False,3.871009e+05,-33093.106640,-102.845293,0


In [35]:
feature_cols = [col for col in train.columns.tolist() if col not in ['age_group']]
target_cols = ['age_group']

In [None]:
# train_test_split() 関数で、データを トレーニングデータ（80%）と検証データ（20%）に分割。
from sklearn.model_selection import train_test_split
xtrain, xval, ytrain, yval = train_test_split(train[feature_cols], train[target_cols], test_size=0.2)
# XGBoostモデルの作成と学習
clf = xgb.XGBClassifier()
clf.fit(xtrain, ytrain)

In [None]:
# 特徴量の重要度を表示
pd.DataFrame(zip(feature_cols, clf.feature_importances_)).sort_values(by=1)

Unnamed: 0,0,1
28,gender_2,0.0
26,tier_3,0.002502
6,num_of_hashtags_per_action,0.002734
13,avgComments,0.002951
12,avgDuration,0.002951
10,avgCompletion,0.003017
17,weekends_trails_watched_per_day,0.003105
21,slot3_trails_watched_per_day,0.003249
20,slot2_trails_watched_per_day,0.003401
23,avgt2,0.003523


- `clf.feature_importances_` は、学習済みのモデルから得られる各特徴量の重要度（0～1の値）。
- `zip(feature_cols, clf.feature_importances_)` によって、特徴量名とその重要度をペアに。
- それを `pd.DataFrame()` にして、データフレームに変換。
- `.sort_values(by=1)` によって、重要度で昇順に並べ替えている。


In [38]:
# KFOLD
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

- StratifiedKFold は、クラスの分布ができるだけ等しくなるようにデータを K 個に分ける手法（層化抽出）。

- n_splits=5：5分割（5-fold cross validation）。

- shuffle=True：データをランダムにシャッフルしてから分割。

- random_state=0：ランダムシードを固定（再現性の確保）。

In [39]:
for i, (trn, val) in enumerate(skf.split(train[target_cols], train[target_cols])):
    train.loc[val, 'kfold'] = i


- skf.split() は、入力データとそのラベルから 訓練インデックス（trn）と検証インデックス（val） を返します。

- この for ループで、すべてのデータに対して「このデータは何番目のfoldか（0〜4）」を train['kfold'] 列に割り当てます。

In [40]:
train['kfold'] = train['kfold'].astype(int)