In [1]:
# Import Library
# 제출 파일 생성 관련
import os
import polars as pl
import polars.selectors as cs

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
import datetime

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# 모델 저장
import joblib

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')


In [24]:
input_file = open('variant_summary.txt','r')
output_file = open('output.txt','w')
for lines in range(500):
    line = input_file.readline()
    output_file.write(line)

# EDA

In [2]:
train_all=pd.read_csv('./train.csv').drop(columns="ID")
test_all=pd.read_csv('./test.csv')

In [3]:
train_all

Unnamed: 0,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,SARC,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
4,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,LUAD,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6197,LGG,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
6198,COAD,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,T181S,WT
6199,TGCT,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [5]:
def get_null_info(df):
    null_info = []
    null_columns_count={}
    for index, row in df.iterrows():
        null_columns = row.index[row.isnull()].tolist()
        if null_columns:
            null_info.append({
                'Row': index,
                'Null Columns': ', '.join(null_columns)
            })
            for col in null_columns:
                if col not in null_columns_count:
                    null_columns_count[col]=1
                else:
                    null_columns_count[col]+=1
                    
    return pd.DataFrame(null_info), null_columns_count

train_null_summary, train_null_columns_count = get_null_info(train_all)
test_null_summary, test_null_columns_count = get_null_info(test_all)

print("\nnull 값 정보:")
print(train_null_summary)
print(test_null_summary)


null 값 정보:
Empty DataFrame
Columns: []
Index: []
      Row          Null Columns
0      12                   AK2
1      15               TNFAIP6
2      27        CNOT2, TNFAIP6
3      38                   AK2
4      63        CNOT2, TNFAIP6
..    ...                   ...
123  2441                   AK2
124  2449                   AK2
125  2460        CNOT2, TNFAIP6
126  2499                  GUK1
127  2543  CNOT2, RBM5, TNFAIP6

[128 rows x 2 columns]


In [8]:
# 모든 열의 고유값을 추출합니다.
train_unique_values = set()
for column in train_all.columns:
    train_unique_values.update(train_all[column].unique())
    
# 모든 열의 고유값을 추출합니다.
test_unique_values = set()
for column in test_all.columns:
    test_unique_values.update(test_all[column].unique())
    
test_unique_values.remove(np.nan)

In [18]:

train_tf_length=[len(value.split(" ")) for value in train_unique_values]
test_tf_length=[len(value.split(" ")) for value in test_unique_values]
train_unique_tf=[tf for value in train_unique_values for tf in value.split(" ")]
test_unique_tf=[tf for value in test_unique_values for tf in value.split(" ")]


In [20]:
print("train unique count: ", len(train_unique_values))
print("test unique count: ", len(test_unique_values))
print("max train transformation count: ", max(train_tf_length))
print("max test transformation count: ", max(test_tf_length))
print("unique train transformation count: ", len(train_unique_tf))
print("unique test transformation count: ", len(test_unique_tf))

train unique count:  118873
test unique count:  124942
max train transformation count:  64
max test transformation count:  51
unique train transformation count:  154844
unique test transformation count:  238604


In [22]:
import re

def parse_alphanumeric(text):
    # 첫 번째 패턴: 알파벳/*(선택적) + 숫자 + 알파벳/*(선택적)
    pattern1 = r'^([a-zA-Z*]*)(\d+)([a-zA-Z*]*)$'
    
    # 두 번째 패턴: 숫자_숫자알파벳>알파벳
    pattern2 = r'^(\d+_\d+)([a-zA-Z*]*)(>[a-zA-Z*]*)$'
    
    # 세 번째 패턴: -숫자알파벳
    pattern3 = r'^(-?)(\d+)([a-zA-Z]+)$'
    
    matches1 = re.match(pattern1, text)
    matches2 = re.match(pattern2, text)
    matches3 = re.match(pattern3, text)
    
    if matches1:
        return {
            'prefix': matches1.group(1),
            'number': matches1.group(2),
            'suffix': matches1.group(3)
        }
    elif matches2:
        return {
            'prefix': matches2.group(2),
            'number': matches2.group(1),
            'suffix': matches2.group(3)
        }
    elif matches3:
        return {
            'prefix': matches3.group(1),
            'number': matches3.group(2),
            'suffix': matches3.group(3)
        }
    else:
        return None


In [23]:
tf_dict={}

for tf in train_unique_tf:
    tf_dict[tf]=parse_alphanumeric(tf)
    
tf_dict

{'Y222*': {'prefix': 'Y', 'number': '222', 'suffix': '*'},
 'R1257K': {'prefix': 'R', 'number': '1257', 'suffix': 'K'},
 'Y632D': {'prefix': 'Y', 'number': '632', 'suffix': 'D'},
 'C94C': {'prefix': 'C', 'number': '94', 'suffix': 'C'},
 'G1218*': {'prefix': 'G', 'number': '1218', 'suffix': '*'},
 'MLV74fs': {'prefix': 'MLV', 'number': '74', 'suffix': 'fs'},
 'V122F': {'prefix': 'V', 'number': '122', 'suffix': 'F'},
 'C1116G': {'prefix': 'C', 'number': '1116', 'suffix': 'G'},
 'L883P': {'prefix': 'L', 'number': '883', 'suffix': 'P'},
 'T239M': {'prefix': 'T', 'number': '239', 'suffix': 'M'},
 'V1804I': {'prefix': 'V', 'number': '1804', 'suffix': 'I'},
 'L869L': {'prefix': 'L', 'number': '869', 'suffix': 'L'},
 'P419fs': {'prefix': 'P', 'number': '419', 'suffix': 'fs'},
 'H130L': {'prefix': 'H', 'number': '130', 'suffix': 'L'},
 'E237Q': {'prefix': 'E', 'number': '237', 'suffix': 'Q'},
 'T1363M': {'prefix': 'T', 'number': '1363', 'suffix': 'M'},
 'P848S': {'prefix': 'P', 'number': '848',

In [25]:
train_all['EGFR']

0          WT
1          WT
2          WT
3       L838M
4          WT
        ...  
6196       WT
6197       WT
6198       WT
6199       WT
6200       WT
Name: EGFR, Length: 6201, dtype: object