In [2]:
import pandas as pd

test_data = pd.read_csv('./data/test.csv')
train_data = pd.read_csv('./data/train.csv')

In [3]:
X = train_data.drop('price_range', axis=1)
X.shape

(2000, 20)

In [4]:
y = train_data['price_range']
y.shape

(2000,)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_val_ensemble, y_train, y_val_ensemble = train_test_split(X, y, test_size=0.25, random_state=1)

In [6]:
X_train_model, X_train_ensemble, y_train_model, y_train_ensemble = train_test_split(X_train, y_train, test_size=0.333, random_state=1)

In [7]:
print(X_train_model.shape)
print(y_train_model.shape)
print('---------------')
print(X_train_ensemble.shape)
print(y_train_ensemble.shape)
print('---------------')
print(X_val_ensemble.shape)
print(y_val_ensemble.shape)


(1000, 20)
(1000,)
---------------
(500, 20)
(500,)
---------------
(500, 20)
(500,)


In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = { 
    'min_samples_leaf': [i for i in range(1, 11)],
    'min_samples_split': [3, 5, 7, 9],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [i for i in range(1,11)]
}

dtc = DecisionTreeClassifier(random_state=1)
grid_dtc = GridSearchCV(n_jobs=-1, estimator=dtc, cv=5, param_grid=param_grid)
grid_dtc.fit(X_train_model,y_train_model)

print("best score: ", grid_dtc.best_score_)
print("best param: ", grid_dtc.best_params_)

best score:  0.836
best param:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 3}


In [9]:
dtc_model = DecisionTreeClassifier( random_state=1,
                                    min_samples_leaf=grid_dtc.best_params_['min_samples_leaf'],
                                    min_samples_split=grid_dtc.best_params_['min_samples_split'],
                                    criterion=grid_dtc.best_params_['criterion'],
                                    max_depth=grid_dtc.best_params_['max_depth'])

dtc_model.fit(X_train_model, y_train_model)

In [10]:
# accuracy recall precision fscore
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

dtc_pred = dtc_model.predict(X_val_ensemble)

dtc_lst = {'name': 'DTC',
          'accuracy': accuracy_score(y_val_ensemble, dtc_pred), 
          'precision': precision_score(y_val_ensemble, dtc_pred, average='macro'),
          'recall': recall_score(y_val_ensemble, dtc_pred, average='macro'),
          'F1': f1_score(y_val_ensemble, dtc_pred, average='macro')}

print('The "Accruacy" of {0} model is {1:.5f}'.format(dtc_lst['name'], dtc_lst['accuracy']))
print('The "Precision" of {0} model is {1:.5f}'.format(dtc_lst['name'], dtc_lst['precision']))
print('The "Recall" of {0} model is {1:.5f}'.format(dtc_lst['name'], dtc_lst['recall'])) 
print('The "F1" of {0} model is {1:.5f}'.format(dtc_lst['name'], dtc_lst['F1']))

The "Accruacy" of DTC model is 0.81200
The "Precision" of DTC model is 0.81712
The "Recall" of DTC model is 0.81546
The "F1" of DTC model is 0.81583


In [11]:
from sklearn.ensemble import RandomForestClassifier

param_grid = { 
    'n_estimators': [10, 20, 30, 40, 50],
    'max_features': [10, 12, 14, 16, 18, 20],
    'max_depth' : [10, 11, 12, 13, 14, 15],
    'criterion' : ['gini', 'entropy']
}

rfc = RandomForestClassifier(random_state=1)
grid_rfc = GridSearchCV(n_jobs=-1, estimator=rfc, cv=5, param_grid=param_grid)
grid_rfc.fit(X_train_model,y_train_model)

print("best score: ", grid_rfc.best_score_)
print("best param: ", grid_rfc.best_params_)

best score:  0.898
best param:  {'criterion': 'entropy', 'max_depth': 10, 'max_features': 12, 'n_estimators': 30}


In [12]:
# Define the model. Set random_state to 1
rfc_model = RandomForestClassifier( random_state=1,
                                    n_estimators=grid_rfc.best_params_['n_estimators'],
                                    max_features=grid_rfc.best_params_['max_features'],
                                    max_depth=grid_rfc.best_params_['max_depth'],
                                    criterion=grid_rfc.best_params_['criterion'])

rfc_model.fit(X_train_model, y_train_model)

In [13]:
rfc_pred = rfc_model.predict(X_val_ensemble)

rfc_lst = {'name': 'RFC',
          'accuracy': accuracy_score(y_val_ensemble, rfc_pred), 
          'precision': precision_score(y_val_ensemble, rfc_pred, average='macro'),
          'recall': recall_score(y_val_ensemble, rfc_pred, average='macro'),
          'F1': f1_score(y_val_ensemble, rfc_pred, average='macro')}

print('The "Accruacy" of {0} model is {1:.5f}'.format(rfc_lst['name'], rfc_lst['accuracy']))
print('The "Precision" of {0} model is {1:.5f}'.format(rfc_lst['name'], rfc_lst['precision']))
print('The "Recall" of {0} model is {1:.5f}'.format(rfc_lst['name'], rfc_lst['recall'])) 
print('The "F1" of {0} model is {1:.5f}'.format(rfc_lst['name'], rfc_lst['F1']))

The "Accruacy" of RFC model is 0.88000
The "Precision" of RFC model is 0.88252
The "Recall" of RFC model is 0.88308
The "F1" of RFC model is 0.88279


In [14]:
print("{:<25}{:<5}".format("特徵", "重要程度"))
for i in range(len(test_data.columns) - 1):
    print("{:<25}{:<5}".format(test_data.columns[i], rfc_model.feature_importances_[i]))

特徵                       重要程度 
id                       0.10250947550907967
battery_power            0.0013699474117007328
blue                     0.006892365236841772
clock_speed              0.0021500397295199985
dual_sim                 0.007490082658590095
fc                       0.0012969360771513648
four_g                   0.012495170892774982
int_memory               0.0064557874317279955
m_dep                    0.014052129186607367
mobile_wt                0.010411877900674552
n_cores                  0.007782655933490201
pc                       0.06206957307571412
px_height                0.05383933713685874
px_width                 0.6857323195456924
ram                      0.004849001419689647
sc_h                     0.007145129296912634
sc_w                     0.010630553691425784
talk_time                0.0005704278968386517
three_g                  0.0013640534499467297
touch_screen             0.0008931365187627293


In [15]:
from sklearn.svm import SVC

param_grid = { 
    'C' : [1, 0.1, 0.25, 0.5, 2, 0.75],
    'kernel' : ["linear", "rbf"],
    'gamma' : ["auto", 0.01, 0.001, 0.0001, 1],
    'decision_function_shape' : ["ovo", "ovr"]
}

svc=SVC(random_state=1)
grid_svc=GridSearchCV(n_jobs=-1, estimator=svc, cv=5, param_grid=param_grid)
grid_svc.fit(X_train_model,y_train_model)

print("best score: ", grid_svc.best_score_)
print("best param: ", grid_svc.best_params_)

best score:  0.9639999999999999
best param:  {'C': 0.1, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'linear'}


In [16]:
# 創建SVC模型
svc_model = SVC( random_state=1,
                 C=grid_svc.best_params_['C'],
                 decision_function_shape=grid_svc.best_params_['decision_function_shape'],
                 gamma=grid_svc.best_params_['gamma'],
                 kernel=grid_svc.best_params_['kernel'])

# 擬合模型（進行訓練）
svc_model.fit(X_train_model, y_train_model)

In [17]:
svc_pred = svc_model.predict(X_val_ensemble)

svc_lst = {'name': 'SVC',
          'accuracy': accuracy_score(y_val_ensemble, svc_pred), 
          'precision': precision_score(y_val_ensemble, svc_pred, average='macro'),
          'recall': recall_score(y_val_ensemble, svc_pred, average='macro'),
          'F1': f1_score(y_val_ensemble, svc_pred, average='macro')}

print('The "Accruacy" of {0} model is {1:.5f}'.format(svc_lst['name'], svc_lst['accuracy']))
print('The "Precision" of {0} model is {1:.5f}'.format(svc_lst['name'], svc_lst['precision']))
print('The "Recall" of {0} model is {1:.5f}'.format(svc_lst['name'], svc_lst['recall'])) 
print('The "F1" of {0} model is {1:.5f}'.format(svc_lst['name'], svc_lst['F1']))

The "Accruacy" of SVC model is 0.97200
The "Precision" of SVC model is 0.97385
The "Recall" of SVC model is 0.97186
The "F1" of SVC model is 0.97265


In [18]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = { 
    'n_neighbors' : [i for i in range(1, 21)],
    'weights' : ['uniform', 'distance'],
    'p' : [i for i in range(1,11)],
    
}

knn=KNeighborsClassifier()
grid_knn=GridSearchCV(n_jobs=-1, estimator=knn, cv=5, param_grid=param_grid)
grid_knn.fit(X_train_model,y_train_model)

print("best score: ", grid_knn.best_score_)
print("best param: ", grid_knn.best_params_)

best score:  0.9260000000000002
best param:  {'n_neighbors': 14, 'p': 9, 'weights': 'distance'}


In [19]:
# Define the model. Set random_state to 1
knn_model = KNeighborsClassifier(   n_neighbors=grid_knn.best_params_['n_neighbors'],
                                    weights=grid_knn.best_params_['weights'],
                                    p=grid_knn.best_params_['p']
                                    )

knn_model.fit(X_train_model, y_train_model)

In [20]:
knn_pred = knn_model.predict(X_val_ensemble)

knn_lst = {'name': 'KNN',
          'accuracy': accuracy_score(y_val_ensemble, knn_pred), 
          'precision': precision_score(y_val_ensemble, knn_pred, average='macro'),
          'recall': recall_score(y_val_ensemble, knn_pred, average='macro'),
          'F1': f1_score(y_val_ensemble, knn_pred, average='macro')}

print('The "Accruacy" of {0} model is {1:.5f}'.format(knn_lst['name'], knn_lst['accuracy']))
print('The "Precision" of {0} model is {1:.5f}'.format(knn_lst['name'], knn_lst['precision']))
print('The "Recall" of {0} model is {1:.5f}'.format(knn_lst['name'], knn_lst['recall'])) 
print('The "F1" of {0} model is {1:.5f}'.format(knn_lst['name'], knn_lst['F1']))

The "Accruacy" of KNN model is 0.91600
The "Precision" of KNN model is 0.91933
The "Recall" of KNN model is 0.91742
The "F1" of KNN model is 0.91810


In [21]:
# Build training data of ensemble model
knn_pred = knn_model.predict(X_train_ensemble)
svc_pred = svc_model.predict(X_train_ensemble)
rfc_pred = rfc_model.predict(X_train_ensemble)
dtc_pred = dtc_model.predict(X_train_ensemble)

X_train_model_pred = pd.DataFrame({'DT': dtc_pred,
                                   'RF': rfc_pred,
                                   'SVC': svc_pred, 
                                   'knn': knn_pred}) 
X_train_model_pred

Unnamed: 0,DT,RF,SVC,knn
0,2,2,2,2
1,2,2,2,2
2,2,3,3,3
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
495,0,0,0,0
496,3,3,3,3
497,1,1,1,1
498,1,1,1,1


In [22]:
# Build val data of ensemble model
dtc_val_pred = dtc_model.predict(X_val_ensemble)
rfc_val_pred = rfc_model.predict(X_val_ensemble)
svc_val_pred = svc_model.predict(X_val_ensemble)
knn_val_pred = knn_model.predict(X_val_ensemble)

X_val_model_pred = pd.DataFrame({'DT': dtc_val_pred,
                                   'RF': rfc_val_pred,
                                   'SVC': svc_val_pred, 
                                   'knn': knn_val_pred}) 

X_val_model_pred

Unnamed: 0,DT,RF,SVC,knn
0,0,0,0,0
1,0,0,0,0
2,1,1,1,1
3,0,0,1,0
4,3,3,2,3
...,...,...,...,...
495,2,2,2,2
496,0,0,0,0
497,0,0,0,0
498,2,2,2,3


In [23]:
# train ensemble model
dtc_ensemble_model = DecisionTreeClassifier(random_state=1)
dtc_ensemble_model.fit(X_train_model_pred, y_train_ensemble)

dtc_ensemble_pre = dtc_ensemble_model.predict(X_val_model_pred)

dtc_ensemble_lst = {'name': 'Ensemble DTC',
          'accuracy': accuracy_score(y_val_ensemble, dtc_ensemble_pre), 
          'precision': precision_score(y_val_ensemble, dtc_ensemble_pre, average='macro'),
          'recall': recall_score(y_val_ensemble, dtc_ensemble_pre, average='macro'),
          'F1': f1_score(y_val_ensemble, dtc_ensemble_pre, average='macro')}

In [24]:
rfc_ensemble_model = RandomForestClassifier(random_state=1)
rfc_ensemble_model.fit(X_train_model_pred, y_train_ensemble)

rfc_ensemble_pre = rfc_ensemble_model.predict(X_val_model_pred)

rfc_ensemble_lst = {'name': 'Ensemble RFC',
          'accuracy': accuracy_score(y_val_ensemble, rfc_ensemble_pre), 
          'precision': precision_score(y_val_ensemble, rfc_ensemble_pre, average='macro'),
          'recall': recall_score(y_val_ensemble, rfc_ensemble_pre, average='macro'),
          'F1': f1_score(y_val_ensemble, rfc_ensemble_pre, average='macro')}

In [25]:
# 創建SVC模型
svc_ensemble_model = SVC(random_state=1)
svc_ensemble_model.fit(X_train_model_pred, y_train_ensemble)

svc_ensemble_pre = svc_ensemble_model.predict(X_val_model_pred)

svc_ensemble_lst = {'name': 'Ensemble SVC',
          'accuracy': accuracy_score(y_val_ensemble, svc_ensemble_pre), 
          'precision': precision_score(y_val_ensemble, svc_ensemble_pre, average='macro'),
          'recall': recall_score(y_val_ensemble, svc_ensemble_pre, average='macro'),
          'F1': f1_score(y_val_ensemble, svc_ensemble_pre, average='macro')}

In [26]:
# Define the model. Set random_state to 1
knn_ensemble_model = KNeighborsClassifier()
knn_ensemble_model.fit(X_train_model_pred, y_train_ensemble)

knn_ensemble_pre = knn_ensemble_model.predict(X_val_model_pred)

knn_ensemble_lst = {'name': 'Ensemble KNN',
          'accuracy': accuracy_score(y_val_ensemble, knn_ensemble_pre), 
          'precision': precision_score(y_val_ensemble, knn_ensemble_pre, average='macro'),
          'recall': recall_score(y_val_ensemble, knn_ensemble_pre, average='macro'),
          'F1': f1_score(y_val_ensemble, knn_ensemble_pre, average='macro')}

In [27]:
# original model
original_output = pd.DataFrame(  {'accuracy':  {svc_lst['name']: svc_lst['accuracy'], 
                                       rfc_lst['name']: rfc_lst['accuracy'],
                                       dtc_lst['name']: dtc_lst['accuracy'],
                                       knn_lst['name']: knn_lst['accuracy']
                                      },

                        'precision':  {svc_lst['name']: svc_lst['precision'], 
                                       rfc_lst['name']: rfc_lst['precision'],
                                       dtc_lst['name']: dtc_lst['precision'],
                                       knn_lst['name']: knn_lst['precision']
                                      },

                        'recall':     {svc_lst['name']: svc_lst['recall'], 
                                       rfc_lst['name']: rfc_lst['recall'],
                                       dtc_lst['name']: dtc_lst['recall'],
                                       knn_lst['name']: knn_lst['recall']
                                      },

                        'F1':         {svc_lst['name']: svc_lst['F1'], 
                                       rfc_lst['name']: rfc_lst['F1'],
                                       dtc_lst['name']: dtc_lst['F1'],
                                       knn_lst['name']: knn_lst['F1']
                                      }}         
                                    
                      )

original_output

Unnamed: 0,accuracy,precision,recall,F1
SVC,0.972,0.973848,0.971864,0.972653
RFC,0.88,0.882522,0.883078,0.882793
DTC,0.812,0.817123,0.815459,0.815827
KNN,0.916,0.919325,0.917425,0.918099


In [28]:
# ensemble model
ensemble_output = pd.DataFrame(  {'accuracy':  {svc_ensemble_lst['name']: svc_ensemble_lst['accuracy'], 
                                       rfc_ensemble_lst['name']: rfc_ensemble_lst['accuracy'],
                                       dtc_ensemble_lst['name']: dtc_ensemble_lst['accuracy'],
                                       knn_ensemble_lst['name']: knn_ensemble_lst['accuracy']
                                      },

                        'precision':  {svc_ensemble_lst['name']: svc_ensemble_lst['precision'], 
                                       rfc_ensemble_lst['name']: rfc_ensemble_lst['precision'],
                                       dtc_ensemble_lst['name']: dtc_ensemble_lst['precision'],
                                       knn_ensemble_lst['name']: knn_ensemble_lst['precision']
                                      },

                        'recall':     {svc_ensemble_lst['name']: svc_ensemble_lst['recall'], 
                                       rfc_ensemble_lst['name']: rfc_ensemble_lst['recall'],
                                       dtc_ensemble_lst['name']: dtc_ensemble_lst['recall'],
                                       knn_ensemble_lst['name']: knn_ensemble_lst['recall']
                                      },

                        'F1':         {svc_ensemble_lst['name']: svc_ensemble_lst['F1'], 
                                       rfc_ensemble_lst['name']: rfc_ensemble_lst['F1'],
                                       dtc_ensemble_lst['name']: dtc_ensemble_lst['F1'],
                                       knn_ensemble_lst['name']: knn_ensemble_lst['F1']
                                      }}         
                                    
                      )

ensemble_output

Unnamed: 0,accuracy,precision,recall,F1
Ensemble SVC,0.972,0.973848,0.971864,0.972653
Ensemble RFC,0.96,0.962724,0.959766,0.960723
Ensemble DTC,0.962,0.964957,0.96175,0.962814
Ensemble KNN,0.95,0.952251,0.95008,0.950963


In [29]:
output = original_output.append(ensemble_output)
output

  output = original_output.append(ensemble_output)


Unnamed: 0,accuracy,precision,recall,F1
SVC,0.972,0.973848,0.971864,0.972653
RFC,0.88,0.882522,0.883078,0.882793
DTC,0.812,0.817123,0.815459,0.815827
KNN,0.916,0.919325,0.917425,0.918099
Ensemble SVC,0.972,0.973848,0.971864,0.972653
Ensemble RFC,0.96,0.962724,0.959766,0.960723
Ensemble DTC,0.962,0.964957,0.96175,0.962814
Ensemble KNN,0.95,0.952251,0.95008,0.950963
