In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [None]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np

# --- 1. QSAR model training (ChEMBL experimental data) ---

chembl_file = 'DOWNLOAD-8dRaz_qDYU-J_i5rS_2Bc9QINR090J70prRwzmvqaiY_eq_.csv'

# 1. Load the semicolon-separated export; malformed rows are skipped, not fatal.
df_chembl = pd.read_csv(chembl_file, sep=';', encoding='utf-8', on_bad_lines='skip')

# 2. Keep only quantitative IC50 rows (Standard Relation is the quoted string "'='")
#    that carry both a pChEMBL value and a SMILES string.
is_ic50 = df_chembl['Standard Type'].eq('IC50')
is_exact = df_chembl['Standard Relation'].eq("'='")
has_pchembl = df_chembl['pChEMBL Value'].notna()
has_smiles = df_chembl['Smiles'].notna()

df_filtered = (
    df_chembl.loc[is_ic50 & is_exact & has_pchembl & has_smiles]
    # pIC50 is the pChEMBL value as a float, appended as a new last column.
    .assign(pIC50=lambda frame: frame['pChEMBL Value'].astype(float))
    .rename(columns={'Smiles': 'smi'})
    .reset_index(drop=True)
)

# 3. Molecular descriptor and fingerprint helpers
features = ['MW', 'LogP', 'HBD', 'HBA', 'RotatableBonds']

# One Morgan generator per fingerprint size; building it once avoids re-creating
# the generator object for every molecule.
_MORGAN_GENERATORS = {}


def _get_morgan_generator(nbits):
    """Return a cached radius-2 Morgan fingerprint generator with `nbits` bits."""
    generator = _MORGAN_GENERATORS.get(nbits)
    if generator is None:
        # Imported lazily so the helper is self-contained within this cell.
        from rdkit.Chem import rdFingerprintGenerator
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=nbits)
        _MORGAN_GENERATORS[nbits] = generator
    return generator


def calculate_descriptors_and_fp(smiles, nbits=2048):
    """Compute basic descriptors and a Morgan bit fingerprint for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. NaN from a CSV
        with missing cells), blank strings and unparsable SMILES are all
        treated as invalid input.
    nbits : int, default 2048
        Length of the folded Morgan fingerprint.

    Returns
    -------
    (dict, numpy.ndarray or None)
        The dict maps every name in ``features`` to its descriptor value. The
        array is a 1-D 0/1 integer vector of length ``nbits``. For invalid
        input the dict holds NaN for every feature and the second element is
        None, so callers can drop those rows with ``dropna``.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string arguments
    # with an exception, and an empty string parses to an empty molecule.
    if not isinstance(smiles, str) or not smiles.strip():
        return {f: np.nan for f in features}, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {f: np.nan for f in features}, None

    desc = {
        'MW': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'HBD': Descriptors.NumHDonors(mol),
        'HBA': Descriptors.NumHAcceptors(mol),
        'RotatableBonds': Descriptors.NumRotatableBonds(mol)
    }
    # Morgan fingerprint, radius 2 (ECFP4 equivalent). The generator API replaces
    # the deprecated AllChem.GetMorganFingerprintAsBitVect, which logs a
    # deprecation warning on each call in recent RDKit releases.
    fp = _get_morgan_generator(nbits).GetFingerprintAsNumPy(mol)
    # Cast to the default integer dtype, matching what np.array(bitvect) produced.
    return desc, fp.astype(int)

# 4. Compute descriptors/fingerprints and assemble the QSAR training table
fp_size = 2048
results = df_filtered['smi'].apply(lambda s: calculate_descriptors_and_fp(s, nbits=fp_size))
result_pairs = results.tolist()  # materialise once; reused for both columns below
# `columns=features` keeps the descriptor columns present even if no row survives.
desc_df = pd.DataFrame([pair[0] for pair in result_pairs], columns=features)
fp_list = [pair[1] for pair in result_pairs]

df_qsar_train = pd.concat([df_filtered, desc_df], axis=1)
df_qsar_train['FP'] = fp_list  # one fingerprint array (or None) per row
df_qsar_cleaned = df_qsar_train.dropna(subset=['pIC50'] + features + ['FP']).copy()

if df_qsar_cleaned.empty:
    # np.stack on an empty sequence fails with an unhelpful message; be explicit.
    raise ValueError("No usable training rows remain after filtering and SMILES parsing.")

# Model inputs: fingerprint matrix and pIC50 target
X_fp = np.stack(df_qsar_cleaned['FP'].values)
Y = df_qsar_cleaned['pIC50']

# Train/test split by compound rather than by row. The table may hold several
# measurements for the same SMILES; a plain row-wise split would then place an
# identical fingerprint in both sets and inflate the reported R^2.
from sklearn.model_selection import GroupShuffleSplit

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    group_splitter.split(X_fp, Y, groups=df_qsar_cleaned['smi'])
)
X_train, X_test = X_fp[train_idx], X_fp[test_idx]
Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

# 5. Build and fit the model (Random Forest Regressor)
qsar_model_exp = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
qsar_model_exp.fit(X_train, Y_train)

# Hold-out evaluation (R^2 on compounds not seen during training)
r2_qsar = r2_score(Y_test, qsar_model_exp.predict(X_test))

# --- 6. Score the Vina docking compounds and build the final table ---

df_vina = pd.read_csv('top4700_filtered.csv').reset_index(drop=True)

# Descriptors and fingerprints for every docked compound
vina_results = df_vina['smi'].apply(lambda s: calculate_descriptors_and_fp(s, nbits=fp_size))
vina_desc_df = pd.DataFrame([desc for desc, _fp in vina_results.tolist()])
vina_fp_list = [fp for _desc, fp in vina_results.tolist()]

# Attach descriptors and fingerprints, then drop rows where either is missing
df_vina_full = (
    pd.concat([df_vina, vina_desc_df], axis=1)
    .assign(FP=vina_fp_list)
    .dropna(subset=[*features, 'FP'])
    .copy()
)

# Lipinski's Rule of Five: count how many of the four limits each compound exceeds
lipinski_limits = {'MW': 500, 'LogP': 5, 'HBD': 5, 'HBA': 10}
df_vina_full['Lipinski_Failures'] = sum(
    (df_vina_full[column] > limit).astype('int64')
    for column, limit in lipinski_limits.items()
)

# Predict pIC50 from the stacked fingerprint matrix
X_predict_fp = np.stack(df_vina_full['FP'].to_numpy())
df_vina_full['Predicted_pIC50'] = qsar_model_exp.predict(X_predict_fp)

# --- 7. Sort and export to CSV ---

output_file = 'ptp1b_final_candidates_by_pic50_fp.csv'

# Columns written to the export
columns_to_keep = [
    'ID', 'smi', 'Affinity_kcal/mol', 'Predicted_pIC50',
    'NR_AhR_Toxicity_Prob', 'MW', 'LogP', 'HBD', 'HBA',
    'RotatableBonds', 'Lipinski_Failures',
]

# Best predicted potency first
df_final_save = df_vina_full.sort_values(by='Predicted_pIC50', ascending=False)

# Write the selected columns without the index
df_final_save.loc[:, columns_to_keep].to_csv(output_file, index=False)

print(f"\n--- Morgan Fingerprint QSAR 모델 성능 (R-제곱): {r2_qsar:.4f} ---")
print("\n✅ 전체 후보 물질 데이터를 Predicted_pIC50 순으로 정렬하여 CSV 파일로 저장했습니다.")
print(f"저장된 파일: {output_file}")

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m



--- Morgan Fingerprint QSAR 모델 성능 (R-제곱): 0.6993 ---

✅ 전체 후보 물질 데이터를 Predicted_pIC50 순으로 정렬하여 CSV 파일로 저장했습니다.
저장된 파일: ptp1b_final_candidates_by_pic50_fp.csv
