In [1]:
import numpy as np
import pandas as pd
from awesome_functions import *
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report , accuracy_score ,log_loss
from sklearn.ensemble import RandomForestClassifier , ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV , KFold
from sklearn.svm import SVC
import xgboost as xgb

## 시행할 모델을 선택해 주세요. 원하지 않는 모델은 `False`로 설정해 주세요.

In [2]:
# Model switches: leave a flag as False to skip the corresponding model cell.
logistic = multi = False
# Slow: both of these cells run a grid search.
random = extra = False
# Slow by nature.
svc = False
xgboost = True

## 경로를 포함한 파일이름을 넣어주시면 됩니다.

In [3]:
# Path (directories included) of the training feature-matrix CSV that the next
# cell loads; edit this value to train on a different file.
insert_training_file_name = 'Feature_matrix/new_df_dd_fl_cp_201808082113.csv' #here!!

### 아래부터는 건드리지 않으셔도 됩니다.

In [4]:
# Load the training feature matrix from the CSV path configured above.
train_df = pd.read_csv(str(insert_training_file_name))

In [5]:
# Preview the last rows of the loaded frame as a sanity check.
train_df.tail()

Unnamed: 0,VisitNumber,TripType,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,...,890104,890968,891038,891040,892992,893621,897522,897666,897752,898115
95669,191343,25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95670,191344,22,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
95671,191345,39,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
95672,191346,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95673,191347,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split DataFrame into X , y

In [6]:
# get_df_to_fit is not defined in this notebook; it presumably arrives through
# the star import from awesome_functions. Judging by the previews that follow,
# it separates the TripType target from the feature columns — TODO confirm
# against the helper's source.
train_X , train_y = get_df_to_fit(train_df)

In [7]:
# Preview the last rows of the feature frame.
train_X.tail()

Unnamed: 0,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,...,890104,890968,891038,891040,892992,893621,897522,897666,897752,898115
95669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95670,0,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95671,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Preview the last values of the target series.
train_y.tail()

95669    25
95670    22
95671    39
95672    39
95673     8
Name: TripType, dtype: int64

## Logistic Regression
- 시간이 좀 걸립니다.

In [9]:
# Optional baseline: logistic regression with default settings.
# The reported accuracy is measured on the same data the model was fit on.
if logistic:
    logis_model = LogisticRegression()
    logis_model.fit(train_X, train_y)
    print(
        "Logistic Regression's accuracy score: ",
        accuracy_score(y_true=train_y, y_pred=logis_model.predict(train_X)),
    )

## Multinomial Naive Bayes
- negative_value가 있으면 작동을 하지 않습니다.

In [10]:
# Optional baseline: multinomial naive Bayes with default settings.
# The reported accuracy is measured on the same data the model was fit on.
if multi:
    Multi_model = MultinomialNB()
    Multi_model.fit(train_X, train_y)
    print(
        "Multinomial Naive's accuracy score: ",
        accuracy_score(y_true=train_y, y_pred=Multi_model.predict(train_X)),
    )

## Ensemble
- Random forest, Extra Tree
- 그리드 서치 이후, 값을 반환하기 때문에 시간이 소요됩니다.
- n_jobs = -1 으로 설정되어 있기 때문에, 컴퓨터가 느려질 수 있습니다.

## RandomForest

In [11]:
# Shared grid-search setup used by the RandomForest and ExtraTrees cells.
# 10-fold cross-validation splitter.
kfold = KFold(n_splits=10)
# Search grid: n_estimators in 1, 11, ..., 91 and max_depth in 1, 6, 11, 16.
parameters = {
    'n_estimators': np.arange(start=1, stop=100, step=10),
    'max_depth': np.arange(start=1, stop=20, step=5),
}

In [12]:
# Optional: grid-search a random forest over `parameters`, scoring accuracy
# with the `kfold` splitter on all available cores (n_jobs=-1).
# NOTE(review): `plt` is not imported in this notebook; presumably it is
# exported by the star import from awesome_functions — confirm.
if random:
    random_model = RandomForestClassifier(random_state=0)
    grid_model1 = GridSearchCV(
        estimator=random_model,
        param_grid=parameters,
        scoring='accuracy',
        cv=kfold,
        n_jobs=-1,
    )
    grid_model1.fit(train_X, train_y)
    # Keep the per-candidate parameters and mean CV scores for inspection.
    params_ls1 = grid_model1.cv_results_['params']
    mean_test_score_ls1 = grid_model1.cv_results_["mean_test_score"]
    plt.plot(mean_test_score_ls1)
    # Best mean CV score, then the parameter combination that achieved it.
    print(grid_model1.best_score_, grid_model1.best_params_, sep="\n")

## ExtraTrees (Extremely Randomized Trees)

In [13]:
# Optional: grid-search an extra-trees ensemble over `parameters`, scoring
# accuracy with the `kfold` splitter on all available cores (n_jobs=-1).
# NOTE(review): `plt` is not imported in this notebook; presumably it is
# exported by the star import from awesome_functions — confirm.
if extra:
    extra_model = ExtraTreesClassifier(random_state=0)
    grid_model2 = GridSearchCV(
        estimator=extra_model,
        param_grid=parameters,
        scoring='accuracy',
        cv=kfold,
        n_jobs=-1,
    )
    grid_model2.fit(train_X, train_y)
    # Keep the per-candidate parameters and mean CV scores for inspection.
    params_ls2 = grid_model2.cv_results_['params']
    mean_test_score_ls2 = grid_model2.cv_results_["mean_test_score"]
    plt.plot(mean_test_score_ls2)
    # Best mean CV score, then the parameter combination that achieved it.
    print(grid_model2.best_score_, grid_model2.best_params_, sep="\n")

## Support Vector Machine

In [14]:
# Optional: RBF-kernel support vector classifier.
# The reported accuracy is measured on the same data the model was fit on.
if svc:
    svc_model = SVC(kernel='rbf',random_state=0).fit(train_X,train_y)
    # accuracy_score needs both the true labels and the predictions. The
    # previous version passed train_y to print() and only the predictions to
    # accuracy_score, which raised a TypeError as soon as this cell was enabled.
    print("Kernel Support Vector Machine's Accuracy score: ",
          accuracy_score(train_y, svc_model.predict(train_X)))

## Xgboost

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Encode the target values as consecutive integer class ids for XGBoost.
# The fitted encoder is kept so the ids can be mapped back to the original
# labels later.
label_enc = LabelEncoder()
label_enc.fit(train_y)
y_labeled = label_enc.transform(train_y)

In [16]:
# Hold out part of the data for evaluation; random_state=0 fixes the shuffle
# and the split ratio is left at train_test_split's default.
# NOTE(review): this rebinds train_X / train_y to the training portion (with
# train_y now holding the encoded labels), so the cell is not idempotent —
# re-running it without re-running the earlier cells passes an already-split
# train_X together with the full-length y_labeled.
train_X, test_X, train_y, test_y = train_test_split(train_X, y_labeled, random_state=0)

In [17]:
from scipy.sparse import csr_matrix

In [18]:
# Convert both splits to SciPy CSR matrices before handing them to XGBoost
# (the feature previews earlier in the notebook are almost entirely zeros).
csr_train, csr_test = (csr_matrix(split.values) for split in (train_X, test_X))

In [19]:
# Optional: multi-class gradient boosting with early stopping on the hold-out split.
if xgboost:
    # Wrap the sparse splits in DMatrix objects together with the encoded labels.
    dtrain = xgb.DMatrix(csr_train, label=train_y)
    dtest = xgb.DMatrix(csr_test, label=test_y)
    # Upper bound on boosting rounds; early stopping may end training sooner.
    num_boost_round = 300
    params = {'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              # Derived from the fitted encoder instead of the former hard-coded
              # 38, so the class count always matches the labels actually
              # present in the training file.
              'num_class': len(label_enc.classes_),
              'max_delta_step': 3,
              'eta': 0.2}

    # The last entry of `evals` ('eval') is the one early stopping monitors.
    evals = [(dtrain, 'train'), (dtest, 'eval')]

    # Stop once eval-mlogloss has not improved for 10 consecutive rounds.
    bst = xgb.train(params=params,
                    dtrain=dtrain,
                    num_boost_round=num_boost_round,
                    evals=evals,
                    early_stopping_rounds=10,)

[0]	train-mlogloss:3.06803	eval-mlogloss:3.07842
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.55732	eval-mlogloss:2.58119
[2]	train-mlogloss:2.15374	eval-mlogloss:2.1919
[3]	train-mlogloss:1.86139	eval-mlogloss:1.91069
[4]	train-mlogloss:1.66514	eval-mlogloss:1.7227
[5]	train-mlogloss:1.51844	eval-mlogloss:1.5833
[6]	train-mlogloss:1.40253	eval-mlogloss:1.47421
[7]	train-mlogloss:1.30999	eval-mlogloss:1.38792
[8]	train-mlogloss:1.23242	eval-mlogloss:1.31628
[9]	train-mlogloss:1.16779	eval-mlogloss:1.25694
[10]	train-mlogloss:1.1137	eval-mlogloss:1.20726
[11]	train-mlogloss:1.06687	eval-mlogloss:1.16523
[12]	train-mlogloss:1.02598	eval-mlogloss:1.12903
[13]	train-mlogloss:0.990832	eval-mlogloss:1.09806
[14]	train-mlogloss:0.959172	eval-mlogloss:1.07073
[15]	train-mlogloss:0.931729	eval-mlogloss:1.04704
[16]	train-mlogloss:0.906866	eval-mlogloss:1.0261
[17]	train

[156]	train-mlogloss:0.450428	eval-mlogloss:0.796558
[157]	train-mlogloss:0.449588	eval-mlogloss:0.796488
[158]	train-mlogloss:0.448776	eval-mlogloss:0.796462
[159]	train-mlogloss:0.448018	eval-mlogloss:0.796429
[160]	train-mlogloss:0.447087	eval-mlogloss:0.796431
[161]	train-mlogloss:0.446224	eval-mlogloss:0.796244
[162]	train-mlogloss:0.445412	eval-mlogloss:0.796102
[163]	train-mlogloss:0.444595	eval-mlogloss:0.796053
[164]	train-mlogloss:0.443726	eval-mlogloss:0.795957
[165]	train-mlogloss:0.442794	eval-mlogloss:0.79589
[166]	train-mlogloss:0.441947	eval-mlogloss:0.795888
[167]	train-mlogloss:0.441129	eval-mlogloss:0.795821
[168]	train-mlogloss:0.440264	eval-mlogloss:0.795685
[169]	train-mlogloss:0.439437	eval-mlogloss:0.795729
[170]	train-mlogloss:0.438606	eval-mlogloss:0.795583
[171]	train-mlogloss:0.43777	eval-mlogloss:0.795496
[172]	train-mlogloss:0.436915	eval-mlogloss:0.795431
[173]	train-mlogloss:0.435996	eval-mlogloss:0.79551
[174]	train-mlogloss:0.434956	eval-mlogloss:0.795

# dd와 fl 63개 그리고 scan total count(dd기준) 0.87