Datensatz laden und verstehen

In [1]:
import pandas as pd

In [2]:
# Lade den Datensatz
df = pd.read_csv("cox2.csv")

# Zeige die Dimensionen des Datensatzes
print("Shape of dataset:", df.shape)

# Zeige die ersten 5 Zeilen
print("First 5 rows:")
print(df.head())

# Übersicht über die Spalten
print("\nColumn info:")
print(df.info())

# Grundlegende Statistik
print("\nDescriptive statistics:")
print(df.describe(include='all'))

Shape of dataset: (462, 257)
First 5 rows:
   QikProp_.stars  QikProp_.amine  QikProp_.acid  QikProp_.rotor  \
0               1               0              0               1   
1               0               0              0               2   
2               0               0              0               3   
3               1               0              0               2   
4               1               0              0               2   

   QikProp_.rctvFG  QikProp_CNS  QikProp_MW  QikProp_dipole  QikProp_SASA  \
0                0            0     358.882           7.645       635.022   
1                0           -1     359.870           7.959       622.548   
2                0           -1     355.451           7.500       610.972   
3                0           -1     393.423           9.342       651.852   
4                0           -1     394.315           9.409       637.688   

   QikProp_FOSA  ...  moe2D_vsa_acc  moe2D_vsa_don  moe2D_vsa_hyd  \
0       273.363 

Datensatz in Trainings- und Testdaten splitten

4_4 letzter Punkt: accuracy, wie viele label richtig, genauigkeit. Capa?

In [3]:
from sklearn.model_selection import train_test_split

In [5]:
#Zielvariable ermitteln
print(df.columns)             
print(df.nunique())           
print(df.head())             

Index(['QikProp_.stars', 'QikProp_.amine', 'QikProp_.acid', 'QikProp_.rotor',
       'QikProp_.rctvFG', 'QikProp_CNS', 'QikProp_MW', 'QikProp_dipole',
       'QikProp_SASA', 'QikProp_FOSA',
       ...
       'moe2D_vsa_acc', 'moe2D_vsa_don', 'moe2D_vsa_hyd', 'moe2D_vsa_other',
       'moe2D_vsa_pol', 'moe2D_weinerPath', 'moe2D_weinerPol', 'moe2D_zagreb',
       'IC50', 'cox2Class'],
      dtype='object', length=257)
QikProp_.stars        6
QikProp_.amine        2
QikProp_.acid         2
QikProp_.rotor        8
QikProp_.rctvFG       2
                   ... 
moe2D_weinerPath    169
moe2D_weinerPol      25
moe2D_zagreb         32
IC50                196
cox2Class             2
Length: 257, dtype: int64
   QikProp_.stars  QikProp_.amine  QikProp_.acid  QikProp_.rotor  \
0               1               0              0               1   
1               0               0              0               2   
2               0               0              0               3   
3               1 

In [6]:
# Features und Ziel definieren
X = df.drop(columns=['cox2Class', 'IC50'])  # IC50 ebenfalls entfernen, da nicht als Feature
y = df['cox2Class']

# In numerische Klassen umwandeln (optional, je nach Modell)
y = y.map({'Inactive': 0, 'Active': 1})

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (346, 255)
Test shape: (116, 255)


Klassifikation

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [8]:
#Fehlende Werte füllen
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Random Forest Modell erstellen und trainieren
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Vorhersage auf Testdaten
y_pred = rf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8362068965517241

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90        93
           1       0.62      0.43      0.51        23

    accuracy                           0.84       116
   macro avg       0.75      0.69      0.71       116
weighted avg       0.82      0.84      0.82       116



Hyperparameter optimieren

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Parameter-Grid definieren
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['sqrt', 'log2', None]  # entspricht mtry in R
}

# Random Forest Modell
rf = RandomForestClassifier(random_state=42)

# GridSearchCV Setup
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                 
    scoring='accuracy',  
    n_jobs=-1,            
    verbose=2
)

# Fit auf Trainingsdaten
grid_search.fit(X_train, y_train)

# Beste Parameter
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Bestes Modell anwenden
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, accuracy_score
print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, max_features=log2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=log2, n_e