In [97]:
import pandas as pd

READ DATASET

In [98]:
df = pd.read_csv('../dataset/train.csv')

In [99]:
df

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159251,159251,40,155,45,69.0,1.5,2.0,1,1,127,...,72,159,14.5,1,0.8,25,26,13,0,0
159252,159252,50,155,75,82.0,1.0,1.0,1,1,120,...,64,108,14.5,1,0.6,21,20,18,0,0
159253,159253,40,160,50,66.0,1.5,1.0,1,1,114,...,87,93,10.9,1,0.6,15,9,12,0,0
159254,159254,50,165,75,92.0,1.2,1.0,1,1,121,...,55,80,14.4,1,1.1,22,17,37,0,1


In [100]:
df.shape

(159256, 24)

DROP ID

In [101]:
df = df.drop(['id'], axis=1)

TRAINING

In [102]:
reduced_df = df.sample(frac=1, random_state=42)

In [103]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(reduced_df.drop(columns=['smoking']))
target = reduced_df['smoking']

In [104]:
from sklearn.model_selection import GridSearchCV, train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_scaled, target, test_size=0.2, random_state=42)

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([
    ('classifier', RandomForestClassifier())
])

parameters = [
    {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [20, 50],
        'classifier__learning_rate': [0.1],
        'classifier__max_depth': [7],
        'classifier__min_samples_split': [15, 20],
        'classifier__min_samples_leaf': [2, 16]
    },
]

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print()

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters found:  {'classifier': GradientBoostingClassifier(), 'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 15, 'classifier__n_estimators': 50}

Accuracy: 77.81%


TEXT DATA

In [106]:
df_test = pd.read_csv('../dataset/test.csv')
df_test.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,159256,40,165,70,84.0,1.2,1.2,1,1,130,...,186,49,115,14.2,1,0.9,19,25,32,0
1,159257,80,160,60,93.0,1.0,1.0,2,2,144,...,158,35,104,13.0,1,1.1,20,12,24,0
2,159258,60,170,70,86.5,0.6,0.7,1,1,117,...,173,39,88,15.4,1,1.4,38,60,36,0
3,159259,40,160,50,67.0,0.3,0.4,1,1,116,...,47,75,128,14.5,1,0.6,25,18,10,1
4,159260,40,170,75,89.4,1.0,0.9,1,1,132,...,100,39,123,16.5,1,1.0,30,39,27,1


In [107]:
df_test_scaled = scaler.fit_transform(df_test.drop('id', axis=1))
df_test_scaled_df = pd.DataFrame(df_test_scaled, columns=df_test.drop('id', axis=1).columns)
df_test_scaled_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,-0.372069,-0.025045,0.22837,0.111787,0.490896,0.519811,-0.157533,-0.156746,0.589449,1.350446,...,0.883073,-0.494432,0.014942,-0.413692,-0.213598,0.041889,-0.690166,-0.087239,-0.132142,-0.492893
1,2.9901,-0.590846,-0.566131,1.117762,-0.01201,0.00134,6.347882,6.379741,1.686156,-0.534518,...,0.461082,-1.494397,-0.393323,-1.250344,-0.213598,1.153454,-0.584952,-0.811711,-0.390807,-0.492893
2,1.309015,0.540757,0.22837,0.391225,-1.017823,-0.776365,-0.157533,-0.156746,-0.428923,-0.201877,...,0.687148,-1.208693,-0.987163,0.422959,-0.213598,2.820802,1.308908,1.863262,-0.002809,-0.492893
3,-0.372069,-0.590846,-1.360633,-1.788388,-1.772182,-1.554071,-0.157533,-0.156746,-0.507259,-1.64332,...,-1.21181,1.362647,0.497438,-0.204529,-0.213598,-1.625459,-0.05888,-0.477339,-0.843472,2.028837
4,-0.372069,0.540757,0.625621,0.715372,-0.01201,-0.257895,-0.157533,-0.156746,0.746121,1.904848,...,-0.413042,-1.208693,0.311862,1.18989,-0.213598,0.597671,0.467193,0.692961,-0.293808,2.028837


In [111]:
test_result = best_model.predict(df_test_scaled_df) 



In [None]:
df_result = pd.DataFrame({
    'id': df_test['id'],
    'smoking': test_result[:,0]
})

In [None]:
df_result.to_csv('../result/submission_gnb.csv', index=False)