In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored there can be accessed by path. This import only exists on Google Colab.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
cd /content/drive/My Drive/psat_summer

/content/drive/My Drive/psat_summer


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
import pickle
import joblib

In [None]:
# Load the training table (features plus a 'target' column) from the working directory.
# NOTE(review): the file name suggests the rows were oversampled with MWMOTE before
# being saved; if so, synthetic rows can land in validation folds during CV and
# inflate the scores. Confirm how this file was produced.
train = pd.read_csv("train_mwmote.csv")

In [None]:
# Separate the feature matrix from the label column.
x = train.drop(columns='target')
y = train['target']

In [None]:
# Empty containers meant to collect one confusion matrix and one F1 score per CV fold.
cf_matrix, f1_scores = [], []

5-fold cross-validation (CV)

In [None]:
# Five-way splitter; rows are shuffled before splitting and the seed is fixed
# so the same folds are produced on every run.
folds = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Manual 5-fold cross-validation of one fixed random-forest configuration.
# For every split produced by `folds`, a fresh forest is fitted on the training
# part and evaluated on the held-out part; a classification report is printed
# per fold and the mean F1 score is printed at the end.

# The hyperparameters do not change between folds, so they are defined once.
params = {
    'n_estimators' : 100,
    'max_depth' : 6,
    'min_samples_leaf' : 20,
    'min_samples_split' : 20
}

# Per-fold results are collected in lists created inside this cell. The names
# cf_matrix / f1_scores are rebound to aggregates at the bottom, so appending to
# them directly would fail with AttributeError the second time the cell is run.
fold_cf_matrices = []
fold_f1_scores = []

for n_fold, (train_idx, val_idx) in enumerate(folds.split(x)):
    train_X, train_y = x.iloc[train_idx], y.iloc[train_idx]
    valid_X, valid_y = x.iloc[val_idx], y.iloc[val_idx]

    # Same seed for every fold so differences come only from the data split.
    rf_clf = RandomForestClassifier(random_state=0, **params)
    rf_clf.fit(train_X, train_y)

    y_pred = rf_clf.predict(valid_X)

    fold_cf_matrices.append(confusion_matrix(valid_y, y_pred))
    fold_f1_scores.append(f1_score(valid_y, y_pred))

    print(classification_report(valid_y, y_pred, target_names=['0', '1']))

# Aggregates: element-wise sum of the fold confusion matrices and mean fold F1.
cf_matrix = sum(fold_cf_matrices)
f1_scores = np.mean(fold_f1_scores)
print("F1 score : %f" % f1_scores)

              precision    recall  f1-score   support

           0       0.82      0.78      0.80      5020
           1       0.79      0.83      0.81      5057

    accuracy                           0.80     10077
   macro avg       0.80      0.80      0.80     10077
weighted avg       0.80      0.80      0.80     10077

              precision    recall  f1-score   support

           0       0.83      0.77      0.80      5053
           1       0.79      0.84      0.81      5024

    accuracy                           0.81     10077
   macro avg       0.81      0.81      0.81     10077
weighted avg       0.81      0.81      0.81     10077

              precision    recall  f1-score   support

           0       0.82      0.77      0.79      5048
           1       0.78      0.83      0.80      5028

    accuracy                           0.80     10076
   macro avg       0.80      0.80      0.80     10076
weighted avg       0.80      0.80      0.80     10076

              preci

Grid Search

In [None]:
# Wider candidate grid (3 values per hyperparameter).
# Note: the following cell assigns `params` again, so when the notebook is run
# top to bottom this grid is replaced before the search starts.
params = dict(
    n_estimators=[200, 300, 400],
    max_depth=[3, 4, 5],
    min_samples_leaf=[30, 40, 50],
    min_samples_split=[30, 40, 50],
)

In [None]:
# Reduced candidate grid (2 values per hyperparameter, 16 combinations).
params = dict(
    n_estimators=[200, 300],
    max_depth=[3, 4],
    min_samples_leaf=[30, 40],
    min_samples_split=[30, 40],
)

In [None]:
# Exhaustive search over the `params` grid, scored by F1 with 5-fold CV.
# Parallelism is requested only on the search itself: also setting n_jobs=-1 on
# the forest would start a second all-cores pool inside every search worker and
# oversubscribe the CPU. The fitted trees are the same either way (fixed seed).
# NOTE(review): cv=5 lets GridSearchCV choose its own splitter instead of the
# shuffled `folds` object, so these scores are not computed on the same splits
# as the manual cross-validation earlier in the notebook.
rf_clf = RandomForestClassifier(random_state=0)

rf_grid = GridSearchCV(rf_clf, param_grid=params, cv=5, n_jobs=-1, scoring='f1')
rf_grid.fit(x, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False, random_state=0,
                                     

In [None]:
# Hyperparameter combination with the best mean cross-validated F1 in the search.
rf_grid.best_params_

{'max_depth': 4,
 'min_samples_leaf': 30,
 'min_samples_split': 30,
 'n_estimators': 300}

In [None]:
# Mean cross-validated F1 achieved by the best combination.
rf_grid.best_score_

0.8041521696235012

Save Model

In [None]:
# Final estimator. The hyperparameter values match the rf_grid.best_params_
# output recorded in this notebook; update them by hand if the search is re-run
# with a different result.
model = RandomForestClassifier(
    random_state=0,
    max_depth=4,
    min_samples_leaf=30,
    min_samples_split=30,
    n_estimators=300,
)

In [None]:
# Serialize the estimator to RF.pkl in the working directory.
# NOTE(review): `model` has only been constructed at this point; fit() is called
# in a later cell, so the file written here holds an unfitted estimator.
joblib.dump(model, 'RF.pkl')

['RF.pkl']

In [None]:
# Load the feature table to predict on (no 'target' column is dropped from it below).
# NOTE(review): verify that its columns match the training features in name and
# order; the file name differs in style from the training file.
test = pd.read_csv("test_pca.csv")

In [None]:
# Rebind train_X / train_y to the complete training set. The cross-validation
# loop assigns these same names to a single fold's subset, so they must be
# reset before the final fit.
train_X, train_y = train.drop(columns='target'), train['target']

In [None]:
# Train the final estimator on the full training set, then persist it.
# The earlier joblib.dump call runs before fit(), so without re-saving here the
# RF.pkl on disk would contain an unfitted model that cannot predict.
model.fit(train_X, train_y)
joblib.dump(model, 'RF.pkl')

# Keep the fitted estimator as the cell's displayed value.
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Predict a class label for every row of the test table.
# Assumes `test` has exactly the feature columns the model was fitted on, in the
# same order. TODO confirm.
prediction = model.predict(test)

In [None]:
# Sanity check: (number of test rows, number of feature columns).
test.shape

(2000, 189)

In [None]:
# Submission skeleton with columns 'id' and 'target'; ids are numbered from 1.
# The number of ids is taken from the test table rather than hard-coded, so the
# frame always has one row per prediction even if the test set size changes.
submission = pd.DataFrame(columns = ['id', 'target'])
submission['id'] = range(1, len(test) + 1)

In [None]:
# Fill the 'target' column with the model's predicted labels.
submission = submission.assign(target=prediction)

In [None]:
# Number of test rows assigned to each predicted class.
submission['target'].value_counts()

0    1420
1     580
Name: target, dtype: int64

In [None]:
# Write the submission file: header row included, DataFrame index omitted.
submission.to_csv(
    "randomforest_ver2.csv",
    index=False,
    header=True,
)