In [1]:
import numpy as np
import pandas as pd
from awesome_functions import *
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report , accuracy_score ,log_loss
from sklearn.ensemble import RandomForestClassifier , ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV , KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import xgboost as xgb

## 시행할 모델링을 선택해주세요. 원하지 않는 모델은 `False`로 설정해주세요.

In [2]:
# Models that are trained by default.
logistic = True
multi = True
xgboost = True

# Models that are skipped by default because they take a long time.
random = False  # slow: runs a grid search
extra = False  # slow: runs a grid search
svc = False  # slow even without a grid search

## 경로를 포함한 파일이름을 넣어주시면 됩니다.

In [3]:
# Path (directory included) of the training feature-matrix CSV -- edit this value.
insert_training_file_name = (
    'Feature_matrix/'
    'new_df_dd_and_breif_fl_n_dd_n_fl_n_company_n_upc_201808081841.csv'
)

### 아래부터는 건드리지 않으셔도 됩니다.

In [4]:
# Load the training feature matrix from the path configured above.
# str() is kept so that any str-convertible path object is accepted.
train_df = pd.read_csv(filepath_or_buffer=str(insert_training_file_name))

In [5]:
# Show the last five rows as a quick check that the file loaded.
train_df.tail(n=5)

Unnamed: 0,VisitNumber,TripType,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,...,9640.0,7554.0,8304.0,9800.0,9912.0,Scancount_total,N_DD,N_FL,N_COMPANY,N_UPC
95669,191343,25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,9,3,5,6,7
95670,191344,22,0,0,0,0,0,0,4,0,...,0,0,0,0,0,5,2,3,4,5
95671,191345,39,0,0,0,0,0,0,1,0,...,0,0,0,0,0,17,8,12,11,13
95672,191346,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,17,8,16,13,17
95673,191347,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,2,2


## Split DataFrame into X , y

In [6]:
# Separate the feature columns (train_X) from the TripType target (train_y).
# NOTE(review): get_df_to_fit is not defined in this notebook; presumably it is
# provided by the awesome_functions star import -- confirm.
fit_parts = get_df_to_fit(train_df)
train_X, train_y = fit_parts

In [7]:
# Show the last five feature rows.
train_X.tail(n=5)

Unnamed: 0,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,...,9640.0,7554.0,8304.0,9800.0,9912.0,Scancount_total,N_DD,N_FL,N_COMPANY,N_UPC
95669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,9,3,5,6,7
95670,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,5,2,3,4,5
95671,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,17,8,12,11,13
95672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,17,8,16,13,17
95673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,2,2


In [8]:
# Show the last five target values.
train_y.tail(n=5)

95669    25
95670    22
95671    39
95672    39
95673     8
Name: TripType, dtype: int64

## Logistic Regression
- 시간이 좀 걸립니다.

In [9]:
if logistic:
    # Fit a logistic regression with default settings on the full training set.
    logis_model = LogisticRegression()
    logis_model.fit(train_X, train_y)
    # The score is computed on the same rows the model was fitted on,
    # so it is a training accuracy, not a held-out estimate.
    logis_train_pred = logis_model.predict(train_X)
    print("Logistic Regression's accuracy score: ", accuracy_score(train_y, logis_train_pred))

Logistic Regression's accuracy score:  0.6684992788009282


## Multinomial Naive Bayes
- negative_value가 있으면 작동을 하지 않습니다.

In [10]:
if multi:
    # Fit a multinomial naive Bayes model with default settings on the full training set.
    Multi_model = MultinomialNB()
    Multi_model.fit(train_X, train_y)
    # The score is computed on the same rows the model was fitted on (training accuracy).
    multi_train_pred = Multi_model.predict(train_X)
    print("Multinomial Naive's accuracy score: ", accuracy_score(train_y, multi_train_pred))

Multinomial Naive's accuracy score:  0.5899931015740953


## Ensemble
- Random forest, Extra Tree
- 그리드 서치 이후, 값을 반환하기 때문에 시간이 소요됩니다.
- n_jobs = -1 로 설정되어 있기 때문에, 컴퓨터가 느려질 수 있습니다.

## RandomForest

In [11]:
# Hyper-parameter grid used by both tree-ensemble grid searches below:
# n_estimators in {1, 11, ..., 91} and max_depth in {1, 6, 11, 16}.
parameters = dict(
    n_estimators=np.arange(1, 100, 10),
    max_depth=np.arange(1, 20, 5),
)
# 10-fold cross-validation splitter shared by both grid searches.
kfold = KFold(n_splits=10)

In [12]:
if random:
    # Grid-search a random forest over `parameters`, scored by cross-validated accuracy.
    random_model = RandomForestClassifier(random_state=0)
    grid_model1 = GridSearchCV(
        estimator=random_model,
        param_grid=parameters,
        scoring='accuracy',
        cv=kfold,
        n_jobs=-1,
    )
    grid_model1.fit(train_X, train_y)
    # Keep the evaluated parameter combinations and their mean CV scores.
    params_ls1 = grid_model1.cv_results_['params']
    mean_test_score_ls1 = grid_model1.cv_results_["mean_test_score"]
    # NOTE(review): plt is not imported in this notebook; presumably it comes
    # from the awesome_functions star import -- confirm.
    plt.plot(mean_test_score_ls1)
    # Best mean CV accuracy, then the parameters that achieved it.
    print(grid_model1.best_score_, grid_model1.best_params_, sep="\n")

## ExtraTrees (Extremely Randomized Trees)

In [13]:
if extra:
    # Grid-search an extra-trees classifier over `parameters`, scored by cross-validated accuracy.
    extra_model = ExtraTreesClassifier(random_state=0)
    grid_model2 = GridSearchCV(
        estimator=extra_model,
        param_grid=parameters,
        scoring='accuracy',
        cv=kfold,
        n_jobs=-1,
    )
    grid_model2.fit(train_X, train_y)
    # Keep the evaluated parameter combinations and their mean CV scores.
    params_ls2 = grid_model2.cv_results_['params']
    mean_test_score_ls2 = grid_model2.cv_results_["mean_test_score"]
    # NOTE(review): plt is not imported in this notebook; presumably it comes
    # from the awesome_functions star import -- confirm.
    plt.plot(mean_test_score_ls2)
    # Best mean CV accuracy, then the parameters that achieved it.
    print(grid_model2.best_score_, grid_model2.best_params_, sep="\n")

## Support Vector Machine

In [14]:
if svc:
    # RBF-kernel support vector classifier fitted on the full training set.
    svc_model = SVC(kernel='rbf', random_state=0).fit(train_X, train_y)
    # accuracy_score takes (y_true, y_pred). The original passed train_y to
    # print() and only the predictions to accuracy_score, which raises TypeError.
    # The score is computed on the training rows (training accuracy).
    print("Kernel Support Vector Machine's Accuracy score: ",
          accuracy_score(train_y, svc_model.predict(train_X)))

## Xgboost

In [15]:
# Map the raw TripType codes onto consecutive integers starting at 0, which is
# the label format XGBoost's multi-class objectives expect.
# label_enc is kept so the integer labels can be mapped back to TripType codes.
label_enc = LabelEncoder().fit(train_y)
y_labeled = label_enc.transform(train_y)

In [16]:
# This cell rebinds train_X / train_y to the training part of the split, so it
# is only valid once per kernel session: on a second run train_X is already the
# smaller split and no longer lines up with y_labeled. Fail early with an
# actionable message instead of scikit-learn's generic length-mismatch error.
if len(train_X) != len(y_labeled):
    raise ValueError(
        "train_X has %d rows but y_labeled has %d: this cell has probably been "
        "run already. Re-run the notebook from the 'Split DataFrame into X , y' "
        "cell before running it again." % (len(train_X), len(y_labeled))
    )

# Hold out part of the data (scikit-learn's default test fraction) for evaluation.
# After this line train_y holds the encoded labels, not the raw TripType codes.
train_X, test_X, train_y, test_y = train_test_split(train_X, y_labeled, random_state=0)

In [17]:
if xgboost:
    # Features are handed to XGBoost as plain NumPy arrays (.values).
    dtrain = xgb.DMatrix(train_X.values, label=train_y)
    dtest = xgb.DMatrix(test_X.values, label=test_y)
    num_boost_round = 300
    params = {'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              # Derived from the fitted encoder rather than hard-coded to 38, so
              # the cell keeps working if the set of TripType values changes.
              'num_class': len(label_enc.classes_),
              'max_delta_step': 3,
              'eta': 0.2}

    # Both sets are logged every round; the last entry ('eval') is the one
    # early stopping monitors.
    evals = [(dtrain, 'train'), (dtest, 'eval')]

    # Train for at most num_boost_round rounds, stopping once the eval
    # mlogloss has not improved for 10 consecutive rounds.
    bst = xgb.train(params=params,
                    dtrain=dtrain,
                    num_boost_round=num_boost_round,
                    evals=evals,
                    early_stopping_rounds=10,)

[0]	train-mlogloss:3.07134	eval-mlogloss:3.08014
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.56501	eval-mlogloss:2.58607
[2]	train-mlogloss:2.16771	eval-mlogloss:2.20016
[3]	train-mlogloss:1.88095	eval-mlogloss:1.92273
[4]	train-mlogloss:1.68865	eval-mlogloss:1.73809
[5]	train-mlogloss:1.5461	eval-mlogloss:1.60137
[6]	train-mlogloss:1.43463	eval-mlogloss:1.49504
[7]	train-mlogloss:1.34488	eval-mlogloss:1.4105
[8]	train-mlogloss:1.26889	eval-mlogloss:1.33981
[9]	train-mlogloss:1.2059	eval-mlogloss:1.28129
[10]	train-mlogloss:1.15241	eval-mlogloss:1.23183
[11]	train-mlogloss:1.10683	eval-mlogloss:1.19062
[12]	train-mlogloss:1.06769	eval-mlogloss:1.15556
[13]	train-mlogloss:1.03313	eval-mlogloss:1.12506
[14]	train-mlogloss:1.0032	eval-mlogloss:1.09886
[15]	train-mlogloss:0.976721	eval-mlogloss:1.07607
[16]	train-mlogloss:0.953	eval-mlogloss:1.05561
[17]	train-mlo

# dd와 fl 63개 그리고 scan total count(dd기준) 0.87