In [99]:
import random
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [100]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)

seed_everything(42) # Seed 고정

In [117]:
train = pd.read_csv('./train.csv')
train = train.groupby('SMILES').mean().reset_index()
test = pd.read_csv('./test.csv')

  train = train.groupby('SMILES').mean().reset_index()


In [118]:
PandasTools.AddMoleculeColumnToFrame(train,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test,'SMILES','Molecule')
def mol2fp(mol):
    fp = AllChem.GetHashedMorganFingerprint(mol, 8, nBits=1024)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

train["FPs"] = train.Molecule.apply(mol2fp)
test["FPs"] = test.Molecule.apply(mol2fp)

droping = []
tp = pd.DataFrame(train['SMILES'].str.contains('e'))
droping.extend(list(tp[tp['SMILES'] == True].index))
tp = pd.DataFrame(train['SMILES'].str.contains('I'))
droping.extend(list(tp[tp['SMILES'] == True].index))
tp = pd.DataFrame(train['SMILES'].str.contains('6'))
droping.extend(list(tp[tp['SMILES'] == True].index))

train = train.drop(droping)

feed = [list(i) for i in train.FPs]
feed.extend([list(reversed(i)) for i in train.FPs])
feed = np.array(feed)
feed_test = np.array([list(i) for i in test.FPs])

In [119]:
feed.shape

(6912, 1024)

In [84]:
tp = [[j for j in i] for i in train['SMILES']]
ans = []
for i in tp:
    ans.extend(i)
    
import collections
data = pd.DataFrame([collections.Counter(ans).keys(),collections.Counter(ans).values()]).transpose()
data.columns = ['atom','counter']
data.sort_values('counter',ascending=False).reset_index(drop=True)

Unnamed: 0,atom,counter
0,c,39264
1,C,25700
2,(,14618
3,),14618
4,1,10186
5,O,8400
6,2,8150
7,n,6743
8,=,5689
9,N,4742


In [85]:
test['SMILES'] = [i.replace('P','') for i in test['SMILES']]
tp = [[j for j in i] for i in test['SMILES']]
ans = []
for i in tp:
    ans.extend(i)
    
import collections
data = pd.DataFrame([collections.Counter(ans).keys(),collections.Counter(ans).values()]).transpose()
data.columns = ['atom','counter']
data.sort_values('counter',ascending=False).reset_index(drop=True)

Unnamed: 0,atom,counter
0,c,5169
1,C,3720
2,),1926
3,(,1926
4,1,1382
5,2,1164
6,O,1143
7,n,965
8,=,751
9,N,683


In [86]:
tp = [[j for j in i] for i in test['SMILES']]
ans = []
for i in tp:
    ans.extend(i)
    
import collections
data_t = pd.DataFrame([collections.Counter(ans).keys(),collections.Counter(ans).values()]).transpose()
data_t.columns = ['atom','counter']
data_t.sort_values('counter',ascending=False).reset_index(drop=True)

vectorizer_rule = pd.merge(data,data_t,how='left',on='atom').sort_values('counter_x',ascending=False).reset_index(drop=True)
vectorizer_rule

Unnamed: 0,atom,counter_x,counter_y
0,c,5169,5169
1,C,3720,3720
2,),1926,1926
3,(,1926,1926
4,1,1382,1382
5,2,1164,1164
6,O,1143,1143
7,n,965,965
8,=,751,751
9,N,683,683


In [87]:
tp = [[j for j in i] for i in train['SMILES']]

In [88]:
vectorizer_rule = {vectorizer_rule.loc[i,'atom']:i for i in range(len(vectorizer_rule))}

In [89]:
max([len([vectorizer_rule[j] for j in i]) for i in tp])

174

In [90]:
vector = [[vectorizer_rule[j] for j in i] for i in tp]
# max_len = max([len([vectorizer_rule[j] for j in i]) for i in tp])
# for i in vector:
#     i.extend([0 for _ in range(max_len-len(i))])
    
train['SMILES'] = vector
train['lenS'] = [len(i) for i in vector]
train.describe()

Unnamed: 0,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,lenS
count,3456.0,3456.0,3454.0,3456.0,3456.0,3456.0,3456.0,3456.0,3456.0,3456.0
mean,37.306141,53.156214,2.837014,363.396068,4.011285,1.101852,4.563368,2.677176,79.71555,44.364005
std,35.653558,36.035416,1.606464,79.276628,1.510461,0.977629,2.359335,1.666328,28.883174,10.836799
min,0.0,0.0,-4.137,172.18,0.0,0.0,0.0,-4.92,3.24,18.0
25%,2.448,16.125,1.82725,304.36225,3.0,0.0,3.0,1.606,60.0025,37.0
50%,26.085,58.11,2.879,349.438,4.0,1.0,4.0,2.7055,77.455,43.0
75%,70.70225,87.719,3.95625,409.284,5.0,2.0,6.0,3.858,96.5325,50.0
max,131.72,135.336,9.19,1360.467,15.0,10.0,38.0,9.19,296.43,174.0


In [14]:
vector = [[vectorizer_rule[j] for j in i] for i in tp]
max_len = max([len([vectorizer_rule[j] for j in i]) for i in tp])
for i in vector:
    i.extend([0 for _ in range(max_len-len(i))])
    
from sklearn.decomposition import PCA
pca = PCA(n_components=64)
train_vec = pca.fit_transform(vector)
train_vec

array([[-7.35201636, 10.97352362, 11.26505711, ..., -0.36834302,
        -0.19286098,  0.11016194],
       [-2.0580382 ,  1.57701787,  9.40202196, ..., -0.33950158,
         0.06261559,  0.0326336 ],
       [-0.61270958,  3.06538666,  7.75639635, ..., -0.25672986,
        -0.56959269,  0.27005752],
       ...,
       [14.30919053, -6.75374513, -5.73552399, ...,  0.05104296,
        -0.11570895,  0.40040758],
       [-4.25409022, -5.93884742, -2.87805926, ...,  0.13518571,
         0.21903181, -0.04915645],
       [ 7.6199656 ,  0.16055819, -2.66457005, ...,  0.39800065,
         0.07270499, -0.030626  ]])

In [122]:
feed.shape

(6912, 1024)

In [128]:
np.array(list(train['MLM'])*2).shape

(6912,)

In [134]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Normalization
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.optimizers import Adam


X_numeric = feed
y_regression = np.array(list(train['MLM'])*2)

X_train, X_test, y_train, y_test = train_test_split(X_numeric, y_regression, test_size=0.1, random_state=219842194)

model = Sequential()
# model.add(Dense(units=256, activation='relu', input_dim=X_numeric.shape[1]))
model.add(Conv1D(filters=16, kernel_size=3, activation='relu', input_shape=(X_numeric.shape[1], 1)))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dropout(0.5))
                                                    
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Normalization())
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Normalization())
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Normalization())
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Normalization())
model.add(Dropout(0.5))

model.add(Dense(1))

custom_learning_rate = 0.01
adam_optimizer = Adam(learning_rate=custom_learning_rate)
model.compile(optimizer=adam_optimizer, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])

model.summary()
model.fit(X_train, y_train, epochs=10, batch_size=2, validation_split=0.2)

# 모델 평가 (RMSE는 평가 함수에 RMSE 지표가 포함되어 있어야 함)
loss, rmse = model.evaluate(X_test, y_test)
print(f"평가 손실 (MSE): {loss}")
print(f"평가 RMSE: {rmse}")



Model: "sequential_49"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_36 (Conv1D)          (None, 1022, 16)          64        
                                                                 
 max_pooling1d_36 (MaxPoolin  (None, 255, 16)          0         
 g1D)                                                            
                                                                 
 flatten_35 (Flatten)        (None, 4080)              0         
                                                                 
 dropout_105 (Dropout)       (None, 4080)              0         
                                                                 
 dense_258 (Dense)           (None, 128)               522368    
                                                                 
 dense_259 (Dense)           (None, 64)                8256      
                                                     

KeyboardInterrupt: 

In [132]:
loss, rmse = model.evaluate(X_test, y_test)
print(f"평가 손실 (MSE): {loss}")
print(f"평가 RMSE: {rmse}")


평가 손실 (MSE): 1299.7803955078125
평가 RMSE: 36.052467346191406


In [75]:
# tp = [[j for j in i] for i in test['SMILES']]
# vector = [[vectorizer_rule[j] for j in i] for i in tp]
# max_len = max([len([vectorizer_rule[j] for j in i]) for i in tp])
# for i in vector:
#     i.extend([0 for _ in range(max_len-len(i))])
    
# from sklearn.decomposition import PCA
# pca = PCA(n_components=64)
# test_vec = pca.fit_transform(vector)
# test_vec

mlm = model.predict(feed_test)



In [76]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.optimizers import Adam


# 가상의 정형 데이터 생성
X_numeric = feed  # 100개의 샘플과 5개의 특성
y_regression = train['HLM'] # 회귀 타겟 값

# 데이터를 학습용과 테스트용으로 분리
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y_regression, test_size=0.01, random_state=42)

# 모델 생성
model = Sequential()

# 입력 레이어와 첫 번째 Dense 레이어 추가
# model.add(Dense(units=64, activation='sigmoid', input_dim=X_numeric.shape[1]))
# 입력 레이어와 1D Convolutional 레이어 추가
model.add(Conv1D(filters=16, kernel_size=3, activation='relu', input_shape=(X_numeric.shape[1], 1)))
model.add(Dropout(0.5))
# MaxPooling 레이어 추가 (옵션)
model.add(MaxPooling1D(pool_size=2))

# Flatten 레이어 추가
model.add(Flatten())

# 추가적인 Dense 레이어 추가 가능
model.add(Dense(units=128, activation='sigmoid'))
model.add(Dense(units=64, activation='sigmoid'))
model.add(Dropout(0.5))

model.add(Dense(units=128, activation='sigmoid'))
model.add(Dense(units=64, activation='sigmoid'))
model.add(Dropout(0.5))

model.add(Dense(units=16, activation='sigmoid'))

# 출력 레이어 추가 (회귀 문제이므로 활성화 함수 없이 단일 유닛 사용)
model.add(Dense(units=1))

custom_learning_rate = 0.01 # 원하는 학습률로 수정
adam_optimizer = Adam(learning_rate=custom_learning_rate)

# 모델 컴파일 (회귀 손실 함수로 MSE 설정 및 RMSE 평가 지표 추가)
model.compile(optimizer=adam_optimizer, loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])

# 모델 요약 출력
model.summary()

# 모델 학습
model.fit(X_train, y_train, epochs=10, batch_size=3, validation_split=0.2)

# 모델 평가 (RMSE는 평가 함수에 RMSE 지표가 포함되어 있어야 함)
# loss, rmse = model.evaluate(X_test, y_test)
# print(f"평가 손실 (MSE): {loss}")
# print(f"평가 RMSE: {rmse}")


Model: "sequential_44"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_32 (Conv1D)          (None, 1022, 16)          64        
                                                                 
 dropout_87 (Dropout)        (None, 1022, 16)          0         
                                                                 
 max_pooling1d_32 (MaxPoolin  (None, 511, 16)          0         
 g1D)                                                            
                                                                 
 flatten_31 (Flatten)        (None, 8176)              0         
                                                                 
 dense_223 (Dense)           (None, 128)               1046656   
                                                                 
 dense_224 (Dense)           (None, 64)                8256      
                                                     

<keras.callbacks.History at 0x17820545b20>

In [47]:
# tp = [[j for j in i] for i in test['SMILES']]
# vector = [[vectorizer_rule[j] for j in i] for i in tp]
# max_len = max([len([vectorizer_rule[j] for j in i]) for i in tp])
# for i in vector:
#     i.extend([0 for _ in range(max_len-len(i))])
    
# from sklearn.decomposition import PCA
# pca = PCA(n_components=64)
# test_vec = pca.fit_transform(vector)
# test_vec

hlm = model.predict(feed_test)



In [48]:
sub = pd.read_csv('./sample_submission.csv')
sub['MLM'] = mlm
sub['HLM'] = hlm
sub = sub.set_index('id')
sub.to_csv('./submission.csv')