In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('Electronics_train.csv')

In [3]:
df = df.iloc[:,1:]

In [4]:
def hello_outlier(df=None, column=None, weight=1.5) :
    quantile_25 = np.percentile(df[column].values, 25)
    quantile_75 = np.percentile(df[column].values, 75)

    IQR = quantile_75 - quantile_25
    IQR_weight = IQR*weight

    lowest = quantile_25 - IQR_weight
    highest = quantile_75 + IQR_weight

    outlier_idx = df[column][ (df[column] < lowest) | (df[column] > highest) ].index
    return outlier_idx

def bye_outlier(df=None, column=None, weight=1.5) :
    outlier_idx = hello_outlier(df=df, column=column, weight=1.5)
    df.drop(outlier_idx, axis=0, inplace=True)

In [5]:
df.drop(['dual','4G','3G','ts','wifi','blue'], inplace = True, axis=1)
bye_outlier(df=df, column='front_c',weight=1.5)

In [37]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

ss = StandardScaler()
scaled_X_standard = ss.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled_X_standard, y, test_size=0.2, random_state=42)

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore')

tree = DecisionTreeClassifier()
GNB = GaussianNB()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
gb = GradientBoostingClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()

for model in [tree, GNB, rf, ada, gb, xgb, lgbm]:
    model.fit(X_train, y_train)
    pred_model = model.predict(X_test)
    accuracy = accuracy_score(y_test, pred_model)
    
    print('{0}: {1: .4f}'.format(model, accuracy))

DecisionTreeClassifier():  0.7987
GaussianNB():  0.7852
RandomForestClassifier():  0.8859
AdaBoostClassifier():  0.5470
GradientBoostingClassifier():  0.9060
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None):  0.9195
LGBMClassifier():  0.9228


In [39]:
# adaboost

tree_model = DecisionTreeClassifier(max_depth=5)
ada = AdaBoostClassifier(base_estimator=tree_model, n_estimators=50,random_state=0)
ada.fit(X_train, y_train)
pred_ada = ada.predict(X_test)

accuracy_score(y_test, pred_ada)

0.8791946308724832

In [40]:
# randomForest

from sklearn.model_selection import GridSearchCV

params = {'n_estimators':[300],
         'max_depth' : [8,10,12],
         'min_samples_leaf':[2,4,6],
         'min_samples_split':[4,6,8]}

rf = RandomForestClassifier(random_state=0)
grid = GridSearchCV(rf, param_grid = params, cv=2)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 300}
0.8556045908296204


In [13]:
rf = RandomForestClassifier(n_estimators=300, max_depth = 10, min_samples_leaf = 2,
                           min_samples_split=8, random_state=30)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

accuracy_score(y_test, pred_rf)

0.8657718120805369

In [21]:
#Gradient Boost

params = {
    'n_estimators':[100,500],
    'learning_rate':[0.05, 0.1]
}

grid = GridSearchCV(gb, param_grid = params, cv=2)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'learning_rate': 0.1, 'n_estimators': 500}
0.8715286785855282


In [22]:
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1)
gb.fit(X_train, y_train)
pred_gb = gb.predict(X_test)
accuracy = accuracy_score(y_test, pred_gb)

print('정확도: ',accuracy)

정확도:  0.889261744966443


In [23]:
#XGBoost

params = {
    'n_estimators':[100,500,600],
    'learning_rate':[0.05,0.1,0.3]
}

grid = GridSearchCV(xgb, param_grid = params, cv=2)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'learning_rate': 0.3, 'n_estimators': 100}
0.8841323106423777


In [27]:
xgb = XGBClassifier(grid.best_params_, max_depth=6)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
accuracy = accuracy_score(y_test, pred_xgb)

print('정확도: ',accuracy)

정확도:  0.9026845637583892


In [41]:
#ligthgbm

params = {
    'n_estimators':[100,500],
    'learning_rate':[0.01,0.05,0.1]
}

grid = GridSearchCV(lgbm, param_grid = params, cv=2)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'learning_rate': 0.1, 'n_estimators': 500}
0.8749055326828719


In [42]:
grid.fit(X_train, y_train)
pred_lgbm = grid.predict(X_test)
accuracy = accuracy_score(y_test, pred_lgbm)

print('정확도: ',accuracy)

정확도:  0.9228187919463087


In [30]:
#Voting

from sklearn.ensemble import VotingClassifier

voting_s = VotingClassifier(estimators=[('RF',rf), ('XGB',xgb),('LGBM',lgbm),('GB',gb)], voting='soft')

voting_s.fit(X_train, y_train)
pred_s = voting_s.predict(X_test)

accuracy = accuracy_score(y_test, pred_s)

print('정확도: ',accuracy)

정확도:  0.9060402684563759


In [46]:
test =  pd.read_csv('Electronics_testx.csv')

In [47]:
test.head()

Unnamed: 0.1,Unnamed: 0,BP,blue,c_speed,dual,front_c,4G,m_int,m_dep,m_wt,...,prim_c,px_h,px_w,ram,sc_h,sc_w,talk_t,3G,ts,wifi
0,0,1225,0,0.7,1,6,0,60,0.1,107,...,15,10,1567,2423,17,11,6,1,0,0
1,1,1970,1,0.5,1,0,1,15,1.0,132,...,0,1399,1684,1658,15,9,20,1,1,1
2,2,1186,1,0.5,1,2,0,21,0.4,160,...,4,68,584,2361,17,8,7,1,0,0
3,3,1762,0,0.7,0,7,0,60,0.1,157,...,10,643,790,1380,14,5,14,1,0,0
4,4,1731,1,1.4,1,4,1,4,0.5,163,...,18,809,1988,3892,5,1,4,1,1,1


In [48]:
test = test.iloc[:,1:]

In [49]:
test.drop(['dual','4G','3G','ts','wifi','blue'], inplace = True, axis=1)

In [50]:
test.head()

Unnamed: 0,BP,c_speed,front_c,m_int,m_dep,m_wt,n_cores,prim_c,px_h,px_w,ram,sc_h,sc_w,talk_t
0,1225,0.7,6,60,0.1,107,2,15,10,1567,2423,17,11,6
1,1970,0.5,0,15,1.0,132,2,0,1399,1684,1658,15,9,20
2,1186,0.5,2,21,0.4,160,8,4,68,584,2361,17,8,7
3,1762,0.7,7,60,0.1,157,4,10,643,790,1380,14,5,14
4,1731,1.4,4,4,0.5,163,6,18,809,1988,3892,5,1,4


In [51]:
#피처 스케일링

test_ss = ss.transform(test)

In [52]:
y_test_pred = grid.predict(test_ss)

In [53]:
y_pred = pd.DataFrame(y_test_pred)
y_pred.columns = ['target']
y_pred['index'] = y_pred.index
y_pred = y_pred[['index', 'target']]

y_pred.to_csv("kaggle1.csv", header=True, index=False)