In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.metrics import classification_report

### Data Load

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
info = pd.read_csv('data/snp_info.csv')

In [5]:
submit = pd.read_csv('data/sample_submission.csv')

In [6]:
info

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


### 불필요 Column 제거

- 값이 하나뿐이므로 모델링에 불필요 판단

In [7]:
train.drop(['id', 'father', 'mother', 'gender'], axis = 1, inplace = True)

In [8]:
test.drop(['id', 'father', 'mother', 'gender'], axis = 1, inplace = True)

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_x = train.drop('class', axis = 1)
train_y = train['class']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.2, random_state = 32, stratify = train_y)

In [12]:
train_snp = x_train.columns[1:]
test_snp = x_test.columns[1:]
train_snp_data = []
test_snp_data = []

for col in train_snp :
    train_snp_data += list(x_train[col].values)

for col in test_snp :
    test_snp_data += list(x_test[col].values)

### Encoding

#### Label Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
class_le = LabelEncoder()
snp_le = LabelEncoder()

In [15]:
y_train = class_le.fit_transform(y_train)
y_test = class_le.transform(y_test)

In [16]:
snp_le.fit_transform(train_snp_data)

array([1, 5, 0, ..., 4, 4, 0], dtype=int64)

In [17]:
for col in train_x.columns:
    if col in train_snp:
        x_train[col] = snp_le.transform(x_train[col])
        x_test[col] = snp_le.transform(x_test[col])
        test[col] = snp_le.transform(test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[col] = snp_le.transform(x_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[col] = snp_le.transform(x_test[col])


### Modeling

#### Gradient Boosting

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
gbr = GradientBoostingClassifier(random_state = 42, learning_rate = 0.01, n_estimators = 600, min_samples_split = 5)

In [20]:
gbr.fit(x_train, y_train)

GradientBoostingClassifier(learning_rate=0.01, min_samples_split=5,
                           n_estimators=600, random_state=42)

In [21]:
gbr_val_pred = gbr.predict(x_test)

In [22]:
accuracy_score(y_test, gbr_val_pred)

0.9433962264150944

In [23]:
print(classification_report(y_test, gbr_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.95      0.91      0.93        23
           2       0.88      0.94      0.91        16

    accuracy                           0.94        53
   macro avg       0.95      0.95      0.95        53
weighted avg       0.94      0.94      0.94        53



In [24]:
pred = gbr.predict(test)

In [172]:
# submit['class'] = class_le.inverse_transform(pred)

In [173]:
# submit.to_csv('data/submit/submit_gbr.csv', index=False)

#### Logistic

- 최종 제출 모델

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr = LogisticRegression(C = 10, solver = "lbfgs", max_iter = 100, n_jobs = 10)

In [27]:
lr.fit(x_train, y_train)

LogisticRegression(C=10, n_jobs=10)

In [28]:
lr_val_pred = lr.predict(x_test)

In [29]:
print(classification_report(y_test, lr_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.96      0.98        23
           2       0.94      1.00      0.97        16

    accuracy                           0.98        53
   macro avg       0.98      0.99      0.98        53
weighted avg       0.98      0.98      0.98        53



In [102]:
lr_pred = lr.predict(test)

In [25]:
submit['class'] = class_le.inverse_transform(lr_pred)

In [26]:
submit.to_csv('data/submit/submit_logistic.csv', index=False)

### SGD

In [103]:
from sklearn.linear_model import SGDClassifier

In [179]:
sgd = SGDClassifier(loss = 'squared_hinge', max_iter = 1600, alpha = 1)
sgd.fit(x_train, y_train)

SGDClassifier(alpha=1, loss='squared_hinge', max_iter=1600)

In [180]:
sgd_val_pred = sgd.predict(x_test)

In [181]:
accuracy_score(y_test, sgd_val_pred)

0.9245283018867925

In [182]:
print(classification_report(y_test, sgd_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.95      0.87      0.91        23
           2       0.83      0.94      0.88        16

    accuracy                           0.92        53
   macro avg       0.93      0.94      0.93        53
weighted avg       0.93      0.92      0.93        53



### Naive Bayes

In [37]:
from sklearn.naive_bayes import GaussianNB

In [38]:
naive = GaussianNB()

In [39]:
naive.fit(x_train, y_train)

GaussianNB()

In [40]:
naive_val_pred = naive.predict(x_test)

In [41]:
accuracy_score(y_test, naive_val_pred)

0.9056603773584906

In [42]:
print(classification_report(y_test, naive_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.82      1.00      0.90        23
           2       1.00      0.69      0.81        16

    accuracy                           0.91        53
   macro avg       0.94      0.90      0.91        53
weighted avg       0.92      0.91      0.90        53



### MLP

In [43]:
from sklearn.neural_network import MLPClassifier

In [44]:
mlp = MLPClassifier(hidden_layer_sizes = (100, ), activation = 'relu', solver = "adam", learning_rate = 'constant', max_iter = 1000)

In [45]:
mlp.fit(x_train, y_train)

MLPClassifier(max_iter=1000)

In [46]:
mlp_val_pred = mlp.predict(x_test)

In [47]:
accuracy_score(y_test, mlp_val_pred)

0.9245283018867925

In [48]:
print(classification_report(y_test, mlp_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.88      0.96      0.92        23
           2       0.93      0.81      0.87        16

    accuracy                           0.92        53
   macro avg       0.94      0.92      0.93        53
weighted avg       0.93      0.92      0.92        53



### SVM

In [161]:
import sklearn.svm as svm

In [162]:
svm_clf =svm.SVC(kernel = 'rbf', C = 1, gamma = 0.05, probability = True, random_state = 42, max_iter = -1, cache_size = 500)

In [163]:
svm_clf.fit(x_train, y_train)

SVC(C=1, cache_size=500, gamma=0.05, probability=True, random_state=42)

In [164]:
svm_val_pred = svm_clf.predict(x_test)

In [165]:
accuracy_score(y_test, svm_val_pred)

0.9622641509433962

In [166]:
print(classification_report(y_test, svm_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.96      0.96      0.96        23
           2       0.94      0.94      0.94        16

    accuracy                           0.96        53
   macro avg       0.96      0.96      0.96        53
weighted avg       0.96      0.96      0.96        53



In [250]:
from sklearn.model_selection import GridSearchCV

In [139]:
# accuracy_score : 0.943396
param_grid = {'C' : [0.1, 1, 10, 100, 1000, 5, 50, 30, 40],
            'gamma' : [1, 0.1, 0.01, 0.001, 0.0001, 5, 0.5, 0.05, 0.3, 0.4],
            'kernel' : ['rbf', 'linear']}

In [140]:
grid = GridSearchCV(svm_clf, param_grid)

In [141]:
grid.fit(x_train, y_train)

GridSearchCV(estimator=SVC(C=1, gamma=0.05, probability=True, random_state=42),
             param_grid={'C': [0.1, 1, 10, 100, 1000, 5, 50, 30, 40],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 5, 0.5, 0.05,
                                   0.3, 0.4],
                         'kernel': ['rbf', 'linear']})

In [142]:
grid_pred = grid.predict(x_test)

In [143]:
print(grid.best_params_)

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


### Ada

In [192]:
from sklearn.ensemble import AdaBoostClassifier

In [209]:
ada = AdaBoostClassifier(n_estimators = 500, learning_rate = 0.001, random_state = 42)

In [210]:
ada.fit(x_train, y_train)

AdaBoostClassifier(learning_rate=0.001, n_estimators=500, random_state=42)

In [211]:
ada_val = ada.predict(x_test)

In [212]:
accuracy_score(y_test, ada_val)

0.8679245283018868

### XGBoost

In [257]:
from xgboost import XGBClassifier

In [510]:
xgb = XGBClassifier(min_child_weight = 6, max_leaf_nodes = 10, gamma = 0.1, eta = 0.1, random_state = 42,)

In [511]:
xgb.fit(x_train, y_train)

Parameters: { "max_leaf_nodes" } are not used.



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.1,
              eval_metric=None, feature_types=None, gamma=0.1, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.100000001,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaf_nodes=10, max_leaves=0,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, ...)

In [512]:
xgb_pred = xgb.predict(x_test)

In [513]:
accuracy_score(y_test, xgb_pred)

0.9811320754716981

In [614]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.96      0.98        23
           2       0.94      1.00      0.97        16

    accuracy                           0.98        53
   macro avg       0.98      0.99      0.98        53
weighted avg       0.98      0.98      0.98        53



### Voting Ensemble

In [168]:
from sklearn.ensemble import VotingClassifier

In [227]:
votingC = VotingClassifier(estimators = [('logistic', lr), ('svm', svm_clf), ('sgd', sgd), ('gbr', gbr)], voting = 'hard', n_jobs = 3)

In [228]:
votingC.fit(x_train, y_train)

VotingClassifier(estimators=[('logistic', LogisticRegression(C=10, n_jobs=10)),
                             ('svm',
                              SVC(C=1, cache_size=500, gamma=0.05,
                                  probability=True, random_state=42)),
                             ('sgd',
                              SGDClassifier(alpha=1, loss='squared_hinge',
                                            max_iter=1600)),
                             ('gbr',
                              GradientBoostingClassifier(learning_rate=0.01,
                                                         min_samples_split=5,
                                                         n_estimators=600,
                                                         random_state=42))],
                 n_jobs=3)

In [229]:
voting_val_pred = votingC.predict(x_test)

In [230]:
accuracy_score(y_test, voting_val_pred)

0.9622641509433962

In [190]:
voting_pred = votingC.predict(test)

In [191]:
print(classification_report(y_test, voting_val_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.96      0.98        23
           2       0.94      1.00      0.97        16

    accuracy                           0.98        53
   macro avg       0.98      0.99      0.98        53
weighted avg       0.98      0.98      0.98        53



In [77]:
submit = pd.read_csv('data/sample_submission.csv')

In [78]:
submit['class'] = class_le.inverse_transform(voting_pred)

In [79]:
submit.to_csv('data/submit/submit_voting3.csv', index=False)