In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [6]:
pokemon_df = pd.read_csv('data/pokemon_df_rf.csv')

In [7]:
pokemon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Winner_mark        50000 non-null  int64
 1   Generation_first   50000 non-null  int64
 2   Legendary_first    50000 non-null  int64
 3   Generation_second  50000 non-null  int64
 4   Legendary_second   50000 non-null  int64
 5   HP_diff            50000 non-null  int64
 6   Attack_diff        50000 non-null  int64
 7   Defense_diff       50000 non-null  int64
 8   Sp. Atk_diff       50000 non-null  int64
 9   Sp. Def_diff       50000 non-null  int64
 10  Speed_diff         50000 non-null  int64
dtypes: int64(11)
memory usage: 4.2 MB


In [8]:
X = pokemon_df.drop('Winner_mark',axis=1)
y = pokemon_df['Winner_mark']

In [9]:
# Train/Test(val) 분리
X_train,X_test, y_train,y_test = train_test_split(X,y,stratify=y,random_state=1)

In [11]:
rf = RandomForestClassifier(n_estimators=200,
                           max_features = 7,
                           max_depth = 5,
                           n_jobs=-1,
                           random_state=1)

In [12]:
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=5, max_features=7, n_estimators=200, n_jobs=-1,
                       random_state=1)

In [13]:
pred_train= rf.predict(X_train)
pred_test = rf.predict(X_test)

In [14]:
accuracy_score(y_train,pred_train),accuracy_score(y_test,pred_test)

(0.9437066666666667, 0.94656)

In [15]:
fi = rf.feature_importances_
fi

array([1.41878967e-04, 3.51053683e-05, 7.28208168e-04, 1.76413114e-04,
       3.96604292e-03, 4.30883919e-02, 4.75347506e-03, 1.02256334e-02,
       6.06912507e-04, 9.36277939e-01])

In [16]:
pokemon_df_onehot = pd.read_csv('data/pokemon_df_rf.csv')

In [17]:
pokemon_df_onehot = pd.get_dummies(pokemon_df, columns=['Generation_first','Generation_second','Legendary_first','Legendary_second'])

In [18]:
pokemon_df_onehot.to_csv('data/pokemon_df_onehot.csv')

In [20]:
pokemon_df

Unnamed: 0,Winner_mark,Generation_first,Legendary_first,Generation_second,Legendary_second,HP_diff,Attack_diff,Defense_diff,Sp. Atk_diff,Sp. Def_diff,Speed_diff
0,2,2,0,3,0,-20,-6,10,-15,10,-19
1,2,5,1,5,1,0,-39,-18,18,39,0
2,2,2,0,5,0,-20,-35,10,-45,10,0
3,2,2,0,5,0,-37,-80,-50,10,-50,-28
4,1,1,0,2,0,50,50,-105,105,-160,50
...,...,...,...,...,...,...,...,...,...,...,...
49995,1,5,1,1,0,70,80,30,80,95,30
49996,1,5,0,5,0,25,30,0,-15,5,8
49997,2,3,0,3,0,-13,-65,40,25,10,-25
49998,1,1,0,1,0,15,-5,-20,-40,0,55


In [21]:
data = pd.read_csv('data/type_data.csv')

In [22]:
data = pd.get_dummies(data, columns = ['Type 1_first','Type 1_second','Type 2_first','Type 2_second'])

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 76 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Unnamed: 0        50000 non-null  int64
 1   Winner_mark       50000 non-null  int64
 2   Type 1_first_0    50000 non-null  uint8
 3   Type 1_first_1    50000 non-null  uint8
 4   Type 1_first_2    50000 non-null  uint8
 5   Type 1_first_3    50000 non-null  uint8
 6   Type 1_first_4    50000 non-null  uint8
 7   Type 1_first_5    50000 non-null  uint8
 8   Type 1_first_6    50000 non-null  uint8
 9   Type 1_first_7    50000 non-null  uint8
 10  Type 1_first_8    50000 non-null  uint8
 11  Type 1_first_9    50000 non-null  uint8
 12  Type 1_first_10   50000 non-null  uint8
 13  Type 1_first_11   50000 non-null  uint8
 14  Type 1_first_12   50000 non-null  uint8
 15  Type 1_first_13   50000 non-null  uint8
 16  Type 1_first_14   50000 non-null  uint8
 17  Type 1_first_15   50000 non-nul

In [23]:
X_type = data.drop('Winner_mark',axis=1)
y_type = data['Winner_mark']

In [25]:
X_type_train,X_type_test,y_type_train,y_type_test = train_test_split(X_type,y_type,stratify=y_type,random_state=1)

In [26]:
rf = RandomForestClassifier(n_estimators=200,
                           max_features = 50,
                           max_depth = 9,
                           n_jobs=-1,
                           random_state=1)

In [28]:
rf.fit(X_type_train,y_type_train)

RandomForestClassifier(max_depth=5, max_features=7, n_estimators=200, n_jobs=-1,
                       random_state=1)

In [29]:
pred_type_train= rf.predict(X_type_train)
pred_type_test = rf.predict(X_type_test)

In [30]:
accuracy_score(y_type_train,pred_type_train),accuracy_score(y_type_test,pred_type_test)

(0.6226133333333334, 0.6212)

In [31]:
# categorical

In [32]:
data_categorical = pd.read_csv('data/data_categorical.csv')

In [33]:
data_categorical = pd.get_dummies(data_categorical, columns = ['Type 1_first','Type 1_second','Type 2_first','Type 2_second','Legendary_first','Legendary_second','Generation_first','Generation_second'])
X_cate = data.drop('Winner_mark',axis=1)
y_cate = data['Winner_mark']

In [34]:
X_cate_train,X_cate_test,y_cate_train,y_cate_test = train_test_split(X_cate,y_cate,stratify=y_cate,random_state=1)

In [35]:
rf = RandomForestClassifier(n_estimators=200,
                           max_features = 50,
                           max_depth = 9,
                           n_jobs=-1,
                           random_state=1)

In [36]:
rf.fit(X_cate_train,y_cate_train)

RandomForestClassifier(max_depth=9, max_features=50, n_estimators=200,
                       n_jobs=-1, random_state=1)

In [37]:
pred_cate_train= rf.predict(X_cate_train)
pred_cate_test = rf.predict(X_cate_test)
accuracy_score(y_cate_train,pred_cate_train),accuracy_score(y_cate_test,pred_cate_test)

(0.63208, 0.62032)