## Import

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
def seed_everything(seed):
    """Seed the global Python and NumPy random number generators.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is also written to the ``PYTHONHASHSEED`` environment
            variable.
    """
    # NOTE(review): presumably PYTHONHASHSEED only influences interpreters
    # started after this point, not the running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(37)

## Data Load

In [None]:
# Bring test.csv into the working directory (the recorded output of this cell
# shows it was used for test.csv). The interactive picker is only opened when
# the file is missing, so re-running the cell or doing "Run All" on a runtime
# that already has the file does not block on a dialog or upload a second copy.
if os.path.exists('./test.csv'):
    uploaded = {}  # nothing was uploaded by this run; keep the name bound
else:
    from google.colab import files
    uploaded = files.upload()

Saving test.csv to test.csv


In [None]:
# Bring sample_submission.csv into the working directory (the recorded output
# of this cell shows it was used for sample_submission.csv). The interactive
# picker is only opened when the file is missing, so re-running the cell or
# doing "Run All" on a runtime that already has the file does not block on a
# dialog or upload a second copy.
if os.path.exists('./sample_submission.csv'):
    uploaded = {}  # nothing was uploaded by this run; keep the name bound
else:
    from google.colab import files
    uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv


In [None]:
# Bring train.csv into the working directory (the recorded output of this cell
# shows it was used for train.csv). The interactive picker is only opened when
# the file is missing, so re-running the cell or doing "Run All" on a runtime
# that already has the file does not block on a dialog or upload a second copy.
if os.path.exists('./train.csv'):
    uploaded = {}  # nothing was uploaded by this run; keep the name bound
else:
    from google.colab import files
    uploaded = files.upload()

Saving train.csv to train.csv


In [None]:
# Load the raw training and test tables from the working directory.
train_df, test_df = map(pd.read_csv, ('./train.csv', './test.csv'))

In [None]:
# Build the feature tables WITHOUT overwriting `train_df`: later cells still
# read 'Y_Class' (the label) and drop 'Y_Quality' from the raw training frame,
# which raises KeyError if those columns have already been removed from it.
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# Label column used as the classification target.
train_y = train_df['Y_Class']

# Training features: the raw frame minus the identifier columns and both
# 'Y_*' columns.
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'], axis=1)

# Test features: the raw frame minus the identifier columns.
test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

## Data Pre-processing

In [None]:
# Replace every missing value with 0 in both feature tables.
train_x, test_x = train_x.fillna(value=0), test_x.fillna(value=0)

In [None]:
# Qualitative -> quantitative: integer (label) encoding of the categorical
# columns. Each encoder is fitted on the training values only.

qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(train_x[col])
    train_x[col] = encoder.transform(train_x[col])

    # Categories that occur only in the test data are appended to the end of
    # the encoder's known classes before the test column is transformed.
    for candidate in np.unique(test_x[col]):
        if candidate in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, candidate)
    test_x[col] = encoder.transform(test_x[col])
print('Done.')

Done.


#### 데이터 전처리 (Not Baseline)



In [None]:
# 'Y_Class' is intentionally left inside train_x for now (the class values are
# not split off into train_y here) because of the planned class-imbalance
# handling.
train_x = train_df.drop(['PRODUCT_ID', 'TIMESTAMP', 'Y_Quality'], axis=1)

test_x = test_df.drop(['PRODUCT_ID', 'TIMESTAMP'], axis=1)

In [None]:
# Print whether the class label column contains any missing value
# (the author notes that it has none).
label_has_missing = train_x['Y_Class'].isnull().any()
print(label_has_missing)

In [None]:
# Fill NaN with 0 in both feature tables.
train_x, test_x = train_x.fillna(0), test_x.fillna(0)

# Check for remaining missing values: one flag per table, separated by a rule.
train_has_missing = train_x.isnull().any().any()
test_has_missing = test_x.isnull().any().any()
print(train_has_missing, '-----', test_has_missing, sep='\n')

In [None]:
# Qualitative -> quantitative: integer (label) encoding of the categorical
# columns, with each encoder fitted on the training values only.
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    encoder = LabelEncoder().fit(train_x[col])
    train_x[col] = encoder.transform(train_x[col])

    # Categories seen only in the test data are appended to the end of the
    # encoder's known classes before the test column is transformed.
    for candidate in np.unique(test_x[col]):
        if candidate in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, candidate)
    test_x[col] = encoder.transform(test_x[col])
print('Done.')

## Classification Model Fit

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Fit a random forest on the feature columns only. The "Not Baseline"
# preprocessing cell leaves 'Y_Class' inside train_x, which would hand the
# label to the model as a feature; errors='ignore' makes the drop a no-op when
# train_x comes from the baseline cells and has no such column.
RF = RandomForestClassifier(random_state=37).fit(
    train_x.drop(columns=['Y_Class'], errors='ignore'), train_y
)
print('Done.')
# 0.5495720225  (score noted by the author; presumably a leaderboard result -- TODO confirm)

Done.


In [None]:
# Fit a single decision tree. It is bound to the name `RF` because the
# inference cell predicts with whatever model `RF` currently refers to.
# 'Y_Class' is dropped defensively: the "Not Baseline" preprocessing cell
# leaves it inside train_x, which would hand the label to the model as a
# feature; errors='ignore' makes the drop a no-op when the column is absent.
RF = DecisionTreeClassifier(random_state=37).fit(
    train_x.drop(columns=['Y_Class'], errors='ignore'), train_y
)
print('Done.')
# (no score recorded by the author for this model)

Done.


#### GradientBoostingClassifier (Not yet)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier. It is bound to the name `RF` because the
# inference cell predicts with whatever model `RF` currently refers to.
# 'Y_Class' is dropped defensively: the "Not Baseline" preprocessing cell
# leaves it inside train_x, which would hand the label to the model as a
# feature; errors='ignore' makes the drop a no-op when the column is absent.
RF = GradientBoostingClassifier(random_state=37).fit(
    train_x.drop(columns=['Y_Class'], errors='ignore'), train_y
)
print('Done.')
# 0.6940474287  (score noted by the author; presumably a leaderboard result -- TODO confirm)

Done.


#### RandomizedSearchCV (Not Yet)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate models for the hyper-parameter search, all created with the same
# random_state. Order matters downstream: index 1 is the random forest.
estimators = [
    model_cls(random_state=37)
    for model_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values shared by every estimator, drawn from the global NumPy RNG.
max_depth = np.random.randint(2, 30, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth": max_depth,
                       "max_features": max_features}

# Search on the feature columns only: the "Not Baseline" preprocessing cell
# leaves 'Y_Class' inside train_x, which would hand the label to the models as
# a feature. errors='ignore' makes the drop a no-op when the column is absent.
search_x = train_x.drop(columns=['Y_Class'], errors='ignore')

# One row per estimator: [class name, best params, best score, cv_results_].
results = []
for estimator in estimators:
    estimator_name = estimator.__class__.__name__

    # Work on a per-estimator copy so the shared dict is never mutated;
    # otherwise "n_estimators" added for an ensemble would still be present
    # when an estimator without that parameter is searched later.
    search_space = dict(param_distributions)
    if hasattr(estimator, "n_estimators"):
        search_space["n_estimators"] = np.random.randint(100, 200, 10)

    clf = RandomizedSearchCV(estimator,
                             search_space,
                             n_iter=10,
                             scoring="accuracy",
                             n_jobs=-1,
                             cv=5,
                             verbose=2,
                             # Same seed as the estimators, so the sampled
                             # candidates do not depend on how much of the
                             # global RNG earlier cells have consumed.
                             random_state=37
                             )

    clf.fit(search_x, train_y)
    results.append([estimator_name,
                    clf.best_params_,
                    clf.best_score_,
                    clf.cv_results_])

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# One row per searched estimator. Note that the "train_score" column holds the
# search's `best_score_`, and "cv_result" holds the full `cv_results_` dict.
summary_columns = ["estimator", "best_params", "train_score", "cv_result"]
df = pd.DataFrame(data=results, columns=summary_columns)
df

In [None]:
# Per-candidate cross-validation table for row 1 of the summary (the second
# entry of `estimators`, i.e. the random forest), best-ranked candidate first.
# cv_results_ ranks candidates under "rank_test_score"; it has no
# "rank_train_score" key, so sorting by that name raises KeyError.
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by="rank_test_score")

## Inference

In [None]:
# Predict the test set with whichever model is currently bound to `RF`
# (several cells above rebind that name to different classifiers).
preds = RF.predict(X=test_x)
print('Done.')

Done.


## Submit

In [None]:
# Load the submission template from the working directory.
submit = pd.read_csv(filepath_or_buffer='./sample_submission.csv')

In [None]:
# Store the model predictions in the 'Y_Class' column of the submission frame.
# NOTE(review): assumes `preds` has one entry per row of `submit`, in the same
# order as test.csv -- confirm.
submit['Y_Class'] = preds

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf='./submission8.csv', index=False)