In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the labelled training set, the unlabelled test set and the SNP marker metadata.
train, test, info = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/snp_info.csv')
)

In [3]:
# Preview the first three training rows to check the column layout.
train.head(3)

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,...,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,TRAIN_000,0,0,0,2,G G,A G,A A,G A,C A,...,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,TRAIN_001,0,0,0,2,A G,A G,C A,A A,A A,...,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,TRAIN_002,0,0,0,2,G G,G G,A A,G A,C C,...,A A,G A,G A,A G,A A,A A,A A,A A,A A,B


In [4]:
# Preview the first three test rows; unlike train, there is no `class` column.
test.head(3)

Unnamed: 0,id,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,TEST_000,0,0,0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A G,A G,G A,G G,C A,G A
1,TEST_001,0,0,0,2,G G,A G,C C,G G,C C,A A,A A,A A,A A,G G,A G,A A,A A,A A,A A
2,TEST_002,0,0,0,2,G G,A G,A A,A A,C A,A G,A A,A A,A A,A G,A A,G A,G G,A A,G G


- chrom : 염색체 정보
- cm : Genetic distance (유전적 거리)
- pos : 각 마커의 유전체상 위치 정보

In [5]:
# Display the SNP metadata table loaded from data/snp_info.csv.
info

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


- train data 살펴보기
    - `father`, `mother`, `gender` column은 값이 하나뿐이므로 무의미하여 삭제해줌 (`id`는 행 식별자일 뿐이므로 함께 삭제)

In [6]:
# Distinct values of `father`; a single value means the column carries no signal.
train['father'].unique()

array([0], dtype=int64)

In [7]:
# Distinct values of `mother`; a single value means the column carries no signal.
train['mother'].unique()

array([0], dtype=int64)

In [8]:
# Distinct values of `gender`; a single value means the column carries no signal.
train['gender'].unique()

array([0], dtype=int64)

In [9]:
# Distinct values of `trait`; it takes more than one value, so it is kept as a feature.
train['trait'].unique()

array([2, 1], dtype=int64)

In [10]:
# Drop the row identifier and the constant columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError on columns
# that were already removed.
train = train.drop(columns=['id', 'father', 'mother', 'gender'], errors='ignore')

- test data 역시 column 정리해주기

In [11]:
# Drop the same identifier / constant columns from the test set so its columns match
# the training features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError on columns
# that were already removed.
test = test.drop(columns=['id', 'father', 'mother', 'gender'], errors='ignore')

### Train, Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Separate the target label column from the predictor columns.
train_y = train['class']
train_x = train.drop(columns='class')

In [14]:
# Hold out 20% of the labelled rows for validation, stratified on the class label.
x_train, x_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    stratify=train_y,
    random_state=32,
)

In [15]:
# Every column after the first one (`trait`) is an SNP genotype column.
train_snp = x_train.columns[1:]
test_snp = x_test.columns[1:]

# Flatten the genotype strings column by column into one flat list per split.
train_snp_data = [genotype for col in train_snp for genotype in x_train[col].values]
test_snp_data = [genotype for col in test_snp for genotype in x_test[col].values]

### Encoding

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# Two independent encoders: one for the target labels, one for the SNP genotype strings.
class_le, snp_le = LabelEncoder(), LabelEncoder()

In [18]:
# Fit the label encoder on `train_y`, which no cell overwrites, instead of on
# `y_train`, which this cell replaces with integer codes. With fit_transform(y_train),
# re-running the cell silently refits the encoder on the integer codes, and a later
# inverse_transform would return integers instead of the original class labels.
# Fitting on `train_y` makes a re-run fail loudly (unseen labels) instead.
# The split is stratified on `train_y`, so the learned classes are expected to be the
# same as when fitting on `y_train`.
class_le.fit(train_y)
y_train = class_le.transform(y_train)
y_test = class_le.transform(y_test)

In [19]:
# Learn the genotype vocabulary from the training split and display the resulting codes.
snp_le.fit(train_snp_data).transform(train_snp_data)

array([1, 5, 0, ..., 4, 4, 0], dtype=int64)

In [20]:
# List the feature columns: `trait` followed by the SNP columns.
train_x.columns

Index(['trait', 'SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06',
       'SNP_07', 'SNP_08', 'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13',
       'SNP_14', 'SNP_15'],
      dtype='object')

In [21]:
# x_train / x_test are slices of train_x produced by train_test_split, so assigning
# columns to them triggers pandas' SettingWithCopyWarning. Take explicit copies first
# so the assignments below write to independent frames.
x_train = x_train.copy()
x_test = x_test.copy()

# `train_snp` already holds exactly the SNP columns, so iterate it directly instead of
# filtering train_x.columns.
# NOTE(review): snp_le was fitted on the training split only; a genotype that appears
# only in x_test or test would make transform() raise ValueError.
for col in train_snp:
    x_train[col] = snp_le.transform(x_train[col])
    x_test[col] = snp_le.transform(x_test[col])
    test[col] = snp_le.transform(test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[col] = snp_le.transform(x_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[col] = snp_le.transform(x_test[col])


### Modeling

#### SVM

In [22]:
import sklearn.svm as svm

- SVM
    - 비선형분리로 하이퍼파라미터 튜닝하지 않았을 때 베이스라인인 RandomForest 모델보다 더 높은 점수 기록
    - 베이스라인 : 0.9622 / **SVM : 0.9814**

##### SVM - rbf

In [23]:
# Support vector classifier with the RBF kernel, other hyperparameters at defaults.
svm_clf = svm.SVC(kernel='rbf')

In [26]:
# Fit the current `svm_clf` on the training split.
# NOTE(review): the saved output of this cell shows SVC(kernel='poly'), so it was last
# executed after the poly cell; restart and run all to get a genuine rbf result.
svm_clf.fit(x_train, y_train)

SVC(kernel='poly')

In [28]:
# NOTE(review): this is accuracy on the same rows the model was fitted on (training
# accuracy); the held-out x_test / y_test split would give a generalisation estimate.
svm_clf.score(x_train, y_train)

0.9904306220095693

##### SVM - poly

In [29]:
# Support vector classifier with the polynomial kernel, other hyperparameters at defaults.
svm_clf = svm.SVC(kernel='poly')

In [31]:
# Fit the polynomial-kernel classifier on the training split.
svm_clf.fit(x_train, y_train)

SVC(kernel='poly')

In [33]:
# NOTE(review): this is accuracy on the same rows the model was fitted on (training
# accuracy); the held-out x_test / y_test split would give a generalisation estimate.
svm_clf.score(x_train, y_train)

0.9904306220095693

##### SVM - sigmoid (가장 낮음)

In [34]:
# Support vector classifier with the sigmoid kernel, other hyperparameters at defaults.
svm_clf = svm.SVC(kernel='sigmoid')

In [35]:
# Fit the sigmoid-kernel classifier on the training split.
svm_clf.fit(x_train, y_train)

SVC(kernel='sigmoid')

In [36]:
# NOTE(review): this is accuracy on the same rows the model was fitted on (training
# accuracy); the held-out x_test / y_test split would give a generalisation estimate.
svm_clf.score(x_train, y_train)

0.7368421052631579

##### SVM - linear

In [37]:
# Support vector classifier with the linear kernel, other hyperparameters at defaults.
svm_clf = svm.SVC(kernel='linear')

In [39]:
# Fit the linear-kernel classifier on the training split.
svm_clf.fit(x_train, y_train)

SVC(kernel='linear')

In [40]:
# NOTE(review): this is accuracy on the same rows the model was fitted on (training
# accuracy); the held-out x_test / y_test split would give a generalisation estimate.
svm_clf.score(x_train, y_train)

0.9808612440191388

In [41]:
# Predict encoded class labels for the competition test set with whichever model
# `svm_clf` currently refers to (the linear-kernel one when cells run in order).
preds = svm_clf.predict(test)

In [38]:
# Load the submission template whose `class` column is filled in below.
submit = pd.read_csv('data/sample_submission.csv')

In [34]:
# Map the integer predictions back to the original class labels.
submit['class'] = class_le.inverse_transform(preds)

In [35]:
# Write the submission file. to_csv does not create missing directories, so make sure
# the output folder exists first; otherwise a fresh checkout fails on this cell.
submit_path = Path('data/submit/submit_svm_linear.csv')
submit_path.parent.mkdir(parents=True, exist_ok=True)
submit.to_csv(submit_path, index=False)

### Grid Search

In [42]:
from sklearn.model_selection import GridSearchCV

In [43]:
# accuracy_score : 0.943396
param_grid = {'C' : [0.1, 1, 10, 100, 1000],
            'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel' : ['rbf', 'linear']}

In [44]:
# Build the grid search with default cross-validation settings.
# NOTE(review): `svm_clf` only serves as a template estimator here; `kernel`, `C` and
# `gamma` are taken from param_grid for each candidate. A fresh svm.SVC() would avoid
# depending on whichever kernel cell ran last.
grid = GridSearchCV(svm_clf, param_grid)

In [45]:
# Run the cross-validated search on the training split only.
grid.fit(x_train, y_train)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']})

In [46]:
# Predict the held-out validation rows with the fitted grid search.
grid_pred = grid.predict(x_test)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
# Accuracy on the held-out validation split.
accuracy_score(y_test, grid_pred)

0.9433962264150944

In [49]:
from sklearn.metrics import classification_report

In [50]:
# Per-class precision / recall / F1 on the held-out split; class ids are the integer
# codes produced by class_le.
print(classification_report(y_test, grid_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.95      0.91      0.93        23
           2       0.88      0.94      0.91        16

    accuracy                           0.94        53
   macro avg       0.95      0.95      0.95        53
weighted avg       0.94      0.94      0.94        53



In [51]:
# Hyperparameter combination selected by the grid search.
print(grid.best_params_)

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
