# Figure 1. Machine Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# RandomForest
from sklearn.ensemble import RandomForestClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# XGBoost
from xgboost import XGBClassifier

## Load Data

In [None]:
# Load the input table; which file is read depends on the biomarker criterion in use.
# Alternative input path: './New_Data/TCGA_GTEX_SLC_.csv'
df = pd.read_csv(filepath_or_buffer='./New_Data/TCGA_GTEX_SLC_103.csv')

* 여러 조건에 맞춰 data 생성

In [None]:
# Build the dataset variants used for training. Every variant is derived from
# the loaded table after dropping the metadata columns and exposing
# `label_GTEx_100` as `label`. Rows with label 100 are the ones treated as GTEX
# below (down-sampled to 500 or removed entirely).

# NOTE(review): `df_bio` is not defined anywhere in the visible notebook, so the
# biomarker frames below raised NameError on a fresh kernel. It is aliased to
# the loaded frame as a stop-gap; point it at the biomarker-specific table if
# that lives in a separate file -- confirm with the notebook owner.
df_bio = df


def _labels_over(frame, min_count=300):
    """Return the labels that occur in more than `min_count` rows of `frame`.

    Labels come back ordered by descending frequency (the `value_counts` order).
    Filtering on the actual count replaces the former hard-coded
    `value_counts().index[:13]` / `[:12]` slices, which only matched the
    "more than 300 samples" rule for one specific class distribution.
    """
    counts = frame['label'].value_counts()
    return counts[counts > min_count].index.tolist()


# 1. 19 cancer types + GTEX + all SLC
df_base = df.drop(columns=['sample', 'TCGA_GTEX_main_category', 'cancer', 'label'])
df_base.rename(columns={'label_GTEx_100':'label'}, inplace=True)

# 2. 19 cancer types + GTEX + SLC bio-marker
df_biomarker = df_bio.drop(columns=['sample','TCGA_GTEX_main_category','cancer','label'])
df_biomarker.rename(columns={'label_GTEx_100':'label'}, inplace=True)

# 3. 19 cancer types + GTEX500 + all SLC
# The sample is seeded so the 500 GTEX rows are the same on every run, in line
# with the random_state=123 used for the split and the models.
df_500 = df_base[df_base['label']==100].sample(n=500, random_state=123).reset_index(drop=True)
df_base_500 = pd.concat([df_base[df_base['label']!=100], df_500])

# 4. 19 cancer types + GTEX500 + SLC bio-marker
df_biomarker_500 = df_biomarker[df_biomarker['label']==100].sample(n=500, random_state=123).reset_index(drop=True)
df_biomarker_500 = pd.concat([df_biomarker[df_biomarker['label']!=100], df_biomarker_500])

# 5. 19 cancer types + all SLC (GTEX rows removed)
df_TCGA = df_base[df_base['label']!=100]

# 6. 19 cancer types + SLC bio-marker (GTEX rows removed)
df_biomarker_TCGA = df_biomarker[df_biomarker['label']!=100]

###########################################################

# 7. Classes with 300 or fewer samples removed + GTEX + all SLC
over_300 = _labels_over(df_base)
df_base_over_300 = df_base[df_base['label'].isin(over_300)]

# 8. Classes with 300 or fewer samples removed + GTEX + SLC bio-marker
bio_over_300 = _labels_over(df_biomarker)
df_biomarker_over_300 = df_biomarker[df_biomarker['label'].isin(bio_over_300)]

# 9. Classes with 300 or fewer samples removed + GTEX500 + all SLC
over_500_300 = _labels_over(df_base_500)
df_base_500_over_300 = df_base_500[df_base_500['label'].isin(over_500_300)]

# 10. Classes with 300 or fewer samples removed + GTEX500 + SLC bio-marker
bio_over_500_300 = _labels_over(df_biomarker_500)
df_biomarker_500_over_300 = df_biomarker_500[df_biomarker_500['label'].isin(bio_over_500_300)]

# 11. Classes with 300 or fewer samples removed + all SLC (no GTEX)
TCGA_over_300 = _labels_over(df_TCGA)
df_TCGA_over_300 = df_TCGA[df_TCGA['label'].isin(TCGA_over_300)]

# 12. Classes with 300 or fewer samples removed + SLC bio-marker (no GTEX)
biomarker_TCGA_over_300 = _labels_over(df_biomarker_TCGA)
df_biomarker_TCGA_over_300 = df_biomarker_TCGA[df_biomarker_TCGA['label'].isin(biomarker_TCGA_over_300)]

## Function

* `Train` / `Score` 함수

In [None]:
# Machine-learning training code
def training_(df):
    """Fit SVM, XGBoost, random-forest and KNN classifiers and collect their predictions.

    The last column of ``df`` is used as the class label and every other
    column as a feature. Rows are split 70/30 (shuffled, ``random_state=123``),
    each model is fitted on the training part, and the predictions for the
    held-out part are gathered side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by a single label column in the last position.

    Returns
    -------
    pandas.DataFrame
        One row per held-out sample with the columns ``test`` (true label),
        ``SVM_pred``, ``XGB_pred``, ``RF_pred`` and ``KNN_pred``.
    """

    # Split X, y positionally. The label is taken as a 1-D Series (not a
    # one-column frame) so the estimators receive the 1-D target they expect,
    # and so the label column does not have to be literally named 'label'.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    label_num = len(y.unique())

    # Make the train / test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=True, random_state=123
    )

    # SVM
    SVM_model = Pipeline([
        ("svm_clf", SVC(C=5, kernel="rbf", degree=3, coef0=1))
    ])
    SVM_model.fit(X_train, y_train)
    y_pred_SVM = SVM_model.predict(X_test)

    # XGB
    # Current XGBoost releases require class labels to be 0..n_classes-1, which
    # the raw labels here are not guaranteed to be. Train on the integer codes
    # and map the predicted codes back to the original label values.
    classes, y_train_codes = np.unique(y_train, return_inverse=True)
    XGB_model = XGBClassifier(random_state=123)
    XGB_model.fit(X_train, y_train_codes)
    y_pred_XGB = classes[np.asarray(XGB_model.predict(X_test)).astype(int)]

    # Random Forest
    RF_model = RandomForestClassifier(random_state=123)
    RF_model.fit(X_train, y_train)
    y_pred_RF = RF_model.predict(X_test)

    # KNN: the neighbour count is set to the number of distinct labels
    KNN_model = KNeighborsClassifier(n_neighbors=label_num)
    KNN_model.fit(X_train, y_train)
    y_pred_KNN = KNN_model.predict(X_test)

    # Collect the true labels and every model's predictions in one frame
    y_df = pd.DataFrame({
        'test': y_test.to_numpy(),
        'SVM_pred': y_pred_SVM,
        'XGB_pred': y_pred_XGB,
        'RF_pred': y_pred_RF,
        'KNN_pred': y_pred_KNN,
    })

    return y_df

In [None]:
# Scores for the training results
def score_(y_df):
    """Compute accuracy, weighted recall and weighted precision for each model.

    Parameters
    ----------
    y_df : pandas.DataFrame
        First column holds the true labels; the following columns hold the
        predictions of the SVM, XGB, RF and KNN models, in that order.

    Returns
    -------
    pandas.DataFrame
        Rows ``accracy``, ``recall``, ``precision`` and columns ``SVM``,
        ``XGB``, ``RF``, ``KNN``. A model without a prediction column in
        ``y_df`` is left as NaN.
    """
    model_names = ['SVM', 'XGB', 'RF', 'KNN']
    # 'accracy' keeps its original spelling: it is a row label of the returned
    # frame, so renaming it would break code that indexes the result by it.
    metric_names = ['accracy', 'recall', 'precision']

    y_true = y_df.iloc[:, 0]

    # Build the table from plain dicts instead of writing cell by cell with
    # chained indexing (`score_df[col][i] = ...`), which relies on deprecated
    # positional Series access and is a no-op under pandas Copy-on-Write.
    scores = {}
    for name, position in zip(model_names, range(1, y_df.shape[1])):
        y_pred = y_df.iloc[:, position]
        scores[name] = {
            'accracy': accuracy_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'precision': precision_score(y_true, y_pred, average='weighted'),
        }

    # Dict-of-dicts: outer keys become columns (models), inner keys rows (metrics).
    score_df = pd.DataFrame(scores, index=metric_names, columns=model_names)

    return score_df

## Run

In [None]:
# Select the dataset variant to train on (swap in any frame built above).
df_ml = df_base.copy()

# Train & score. The selected frame is what gets trained; previously `df_ml`
# was assigned but ignored and `df_biomarker` was passed instead.
y_df = training_(df_ml)
score_df = score_(y_df)