# Ryhmä-190

## Python-paketit

In [34]:
# Jupyter-notebookissa voi asentaa paketit samaan tapaan kuin komentoriviltä.
# Tähän käytetään "magic commandeja", jotka alkavat %-merkillä.
# Näiden asentamisessa voi mennä muutama minuutti, mutta sen jälkeen niitä ei tarvitse asentaa uudestaan. 
#%pip install seaborn
#%pip install mlflow azureml azureml-core azureml-mlflow azure-identity
%pip install optuna





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score
#from azureml.core import Workspace
#from azure.identity import InteractiveBrowserCredential
#import mlflow

## MLFlow-seuranta (ei tällä hetkellä käytössä)

Tarkistetaan ajoympäristö (Azure vai oma kone) ja autentikoidutaan sen mukaan.

In [36]:
def is_running_in_azure():
    return 'AZUREML_RUN_ID' in os.environ

def get_workspace():
    if is_running_in_azure():
        return Workspace.from_config()
    else:
        return Workspace(subscription_id='1c0e26b6-0fcb-4b6d-911c-2a0836275ea4',
                         resource_group='rg-AML',
                         workspace_name='aml-data_science_masters')
#ws = get_workspace()

In [37]:
#mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
#experiment_name = 'group-190-tracking'
#mlflow.set_experiment(experiment_name)
#mlflow.autolog()

In [38]:
def evaluate_model(model, X_train, y_train, random_state=42):
    ### Train loss
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)

    ### 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mse_scorer = make_scorer(mean_squared_error)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
    cv_loss_mean = cv_scores.mean()

    r2_train = r2_score(y_train, y_train_pred)
    r2_cv = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2').mean()

    print('Train loss:', train_loss)
    print('CV loss mean:', cv_loss_mean)
    print('Train R^2:', r2_train)
    print('CV R^2:', r2_cv)

def transform_gbr(df):
    X_train_gbr = df.copy()
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Add meaningful features
    X_train_gbr['AtomFraction'] = (X_train_gbr['NumOfC'] + X_train_gbr['NumOfO'] + X_train_gbr['NumOfN']) / X_train_gbr['NumOfAtoms']
    X_train_gbr['Polarity'] = X_train_gbr['NumHBondDonors'] / X_train_gbr['MW']
    X_train_gbr['HBondDensity'] = df['NumHBondDonors'] / X_train_gbr['NumOfAtoms']
    X_train_gbr['GroupDensity_CarboxylicAcid'] = X_train_gbr['carboxylic acid'] / X_train_gbr['MW']
    X_train_gbr['Unsaturation'] = X_train_gbr['C=C (non-aromatic)'] + X_train_gbr['C=C-C=O in non-aromatic ring']
    X_train_gbr['ConfigurationalComplexity'] = X_train_gbr['NumOfConf'] / X_train_gbr['MW']
    X_train_gbr['NumOfConf'] = np.log(X_train_gbr['NumOfConf'])
    # X_train_gbr['NumOfConfUsed'] = (X_train_gbr['NumOfConfUsed'] == 40).astype(int)

    return X_train_gbr

## Tietoaineistojen lataaminen

In [39]:
df_train = pd.read_csv('../data/train.csv', encoding='utf-8', header=0)
df_test = pd.read_csv('../data/test.csv', encoding='utf-8', header=0)

## Esiprosessointi

In [40]:
df_train['parentspecies'] = df_train['parentspecies'].astype('category')
df_train['parentspecies'] = df_train['parentspecies'].cat.codes
df_test['parentspecies'] = df_test['parentspecies'].astype('category')
df_test['parentspecies'] = df_test['parentspecies'].cat.codes

## Mallien kouluttaminen

In [41]:
X_train, y_train = df_train.drop(columns=['log_pSat_Pa', 'ID']), df_train['log_pSat_Pa']
X_test = df_test.drop(columns=['ID'])
X_train_gbr = transform_gbr(X_train)
X_test_gbr = transform_gbr(X_test)
X_test_gbr.head()

Unnamed: 0,MW,NumOfAtoms,NumOfC,NumOfO,NumOfN,NumHBondDonors,NumOfConf,NumOfConfUsed,parentspecies,C=C (non-aromatic),...,peroxide,hydroperoxide,carbonylperoxyacid,nitroester,AtomFraction,Polarity,HBondDensity,GroupDensity_CarboxylicAcid,Unsaturation,ConfigurationalComplexity
0,327.966253,26,6,14,2,2,5.57973,39.0,5,0,...,1,0,1,0,0.846154,0.006098,0.076923,0.003049,0,0.80801
1,361.971732,30,6,16,2,2,5.68358,40.0,5,0,...,1,2,0,0,0.8,0.005525,0.066667,0.0,0,0.812218
2,237.012081,23,6,9,1,2,4.234107,12.0,5,0,...,0,0,1,0,0.695652,0.008438,0.086957,0.0,0,0.291124
3,342.054674,37,9,12,2,2,5.605802,16.0,0,0,...,0,1,1,0,0.621622,0.005847,0.054054,0.0,0,0.795195
4,311.971338,25,6,13,2,1,4.369448,34.0,5,0,...,0,0,0,0,0.84,0.003205,0.04,0.0,0,0.253228


Erilaisia malleja alla. Kaikki mallit käyttävät samaa nimeä `model`, eli vain viimeisenä koulutettu menee testeihin ja tallentuu.

### 1. Dummy

In [42]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train)

df_test['log_pSat_Pa'] = model.predict(X_test)

### 2. Yksinkertainen regressio

In [43]:
model = LinearRegression()
model.fit(X_train_gbr, y_train)

evaluate_model(model, X_train_gbr, y_train)

Train loss: 2.800060105661606
CV loss mean: 2.8078076279671977
Train R^2: 0.7123785997619239
CV R^2: 0.7114073009200169


### 3. Random Forest

In [44]:
#model = RandomForestRegressor(random_state=190)
#model.fit(X_train_gbr, y_train)

#evaluate_model(model, X_train_gbr, y_train)

### 4. Gradient Boosting Regressor

In [None]:
import optuna

#from sklearn.model_selection import train_test_split
#X_train_gbr, X_vali_gbr, y_train, y_vali = train_test_split(X_train_gbr, y_train, test_size=0.2, random_state=190)

from sklearn.metrics import mean_absolute_error

def objective(trial):
    gbr_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.2),
        "loss": "squared_error",
        "random_state": 190,
        "subsample": trial.suggest_float("subsample", 0.3, 0.9),
    }
    
    model = GradientBoostingRegressor(**gbr_params)
    #model.fit(X_train_gbr, y_train)

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    return score
#evaluate_model(model, X_train_gbr, y_train)

study_name = "group-190c"
storage = "sqlite:///optuna_190.sqlite3"

study = optuna.create_study(
    direction="maximize",
    #sampler=optuna.samplers.TPESampler(seed=190),
    study_name=study_name,
    storage=storage,
    load_if_exists=True
)

study.optimize(objective, n_trials=10)

loaded_study = optuna.load_study(study_name=study_name, storage=storage)

print(f"The best score: {loaded_study.best_value}")
print(f"The best hyperparameter combination: {loaded_study.best_params}")



[I 2024-11-22 18:40:41,853] A new study created in RDB with name: group-190c
[I 2024-11-22 18:41:30,244] Trial 0 finished with value: -0.7311020025347774 and parameters: {'n_estimators': 124, 'max_depth': 8, 'min_samples_split': 10, 'learning_rate': 0.1243080499411806, 'subsample': 0.64256128912516}. Best is trial 0 with value: -0.7311020025347774.
[I 2024-11-22 18:41:40,212] Trial 1 finished with value: -0.7067989932813308 and parameters: {'n_estimators': 61, 'max_depth': 3, 'min_samples_split': 10, 'learning_rate': 0.0923874525700649, 'subsample': 0.6550459300646078}. Best is trial 0 with value: -0.7311020025347774.
[I 2024-11-22 18:42:28,065] Trial 2 finished with value: -0.740590154988417 and parameters: {'n_estimators': 112, 'max_depth': 8, 'min_samples_split': 15, 'learning_rate': 0.08254101759152674, 'subsample': 0.670273508154}. Best is trial 2 with value: -0.740590154988417.
[I 2024-11-22 18:43:20,078] Trial 3 finished with value: -0.7084566991186383 and parameters: {'n_estima

In [None]:
import plotly.io as pio
pio.renderers.default = "notebook"

## Ennustuksen tallentaminen

In [None]:
df_test['log_pSat_Pa'] = model.predict(X_test_gbr.drop(columns=['log_pSat_Pa'], axis=1))
df_test[['ID', 'log_pSat_Pa']].to_csv('../submission/submission.csv', index=False)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- log_pSat_Pa
