# Ryhmä-190

## Python-paketit

In [633]:
# Jupyter-notebookissa voi asentaa paketit samaan tapaan kuin komentoriviltä.
# Tähän käytetään "magic commandeja", jotka alkavat %-merkillä.
# Näiden asentamisessa voi mennä muutama minuutti, mutta sen jälkeen niitä ei tarvitse asentaa uudestaan. 
#%pip install seaborn
#%pip install mlflow azureml azureml-core azureml-mlflow azure-identity

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score
#from azureml.core import Workspace
#from azure.identity import InteractiveBrowserCredential
#import mlflow

## MLFlow-seuranta (ei tällä hetkellä käytössä)

Tarkistetaan ajoympäristö (Azure vai oma kone) ja autentikoidutaan sen mukaan.

In [635]:
def is_running_in_azure():
    return 'AZUREML_RUN_ID' in os.environ

def get_workspace():
    if is_running_in_azure():
        return Workspace.from_config()
    else:
        return Workspace(subscription_id='1c0e26b6-0fcb-4b6d-911c-2a0836275ea4',
                         resource_group='rg-AML',
                         workspace_name='aml-data_science_masters')
#ws = get_workspace()

In [636]:
#mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
#experiment_name = 'group-190-tracking'
#mlflow.set_experiment(experiment_name)
#mlflow.autolog()

In [2]:
def evaluate_model(model, X_train, y_train, random_state=42):
    ### Train loss
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)

    ### 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mse_scorer = make_scorer(mean_squared_error)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
    cv_loss_mean = cv_scores.mean()

    r2_train = r2_score(y_train, y_train_pred)
    r2_cv = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2').mean()

    print('Train loss:', train_loss)
    print('CV loss mean:', cv_loss_mean)
    print('Train R^2:', r2_train)
    print('CV R^2:', r2_cv)


## Tietoaineistojen lataaminen

In [3]:
df_train = pd.read_csv('../data/train.csv', encoding='utf-8', header=0)
df_test = pd.read_csv('../data/test.csv', encoding='utf-8', header=0)

## Esiprosessointi

In [639]:
#df_train['parentspecies'] = df_train['parentspecies'].astype('category')
#df_train['parentspecies'] = df_train['parentspecies'].cat.codes
#df_test['parentspecies'] = df_test['parentspecies'].astype('category')
#df_test['parentspecies'] = df_test['parentspecies'].cat.codes

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26637 entries, 0 to 26636
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            26637 non-null  int64  
 1   log_pSat_Pa                   26637 non-null  float64
 2   MW                            26637 non-null  float64
 3   NumOfAtoms                    26637 non-null  int64  
 4   NumOfC                        26637 non-null  int64  
 5   NumOfO                        26637 non-null  int64  
 6   NumOfN                        26637 non-null  int64  
 7   NumHBondDonors                26637 non-null  int64  
 8   NumOfConf                     26637 non-null  float64
 9   NumOfConfUsed                 26637 non-null  float64
 10  parentspecies                 26427 non-null  object 
 11  C=C (non-aromatic)            26637 non-null  int64  
 12  C=C-C=O in non-aromatic ring  26637 non-null  int64  
 13  h

# Feature engineering

Seuraavassa taustaa. Ajatuksena on luoda "meaningful features". Bayesian Data Analysis kurssilla puhuttiin Generalized Linear Regression yhteydessä, että on kannattavaa olla tarkkana datasetin kanssa. Esimerkissä käytettiin puun lehtien painon arviointia. Siinä yksittäiset mitat kuten puun korkeus leveys yms ei antanut kovin hyvää mallia. Näistä kuitenkin pystyttiin luomaan uusia predictoreita, kuten puun muoto, lehtiosan volyymi jne.

Koska me emme ole alan asiantuntijoita, tulkitsen tekoälyn [käyttösääntöjä](https://studies.helsinki.fi/kurssit/toteutus/hy-opt-cur-2425-b5ccfa1b-ac12-4a9a-bef8-b46c0e808555/DATA11002) kurssilla, että voidaan kysyä mitä näistä voidaan laskea. Joten näin tehtiin ja saatiin seuraavia kaavoja.

BDA kurssilla myös poistettiin tämän jälkeen alkuperäiset arvot sotkemasta!

In [4]:
def transform_gbr(orig_df):
    df = orig_df.copy()
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Add meaningful features
    df['AtomFraction'] = (df['NumOfC'] + df['NumOfO'] + df['NumOfN']) / df['NumOfAtoms']
    df['Polarity'] = df['NumHBondDonors'] / df['MW']
    df['HBondDensity'] = df['NumHBondDonors'] / df['NumOfAtoms']
    df['GroupDensity_CarboxylicAcid'] = df['carboxylic acid'] / df['MW']
    df['Unsaturation'] = df['C=C (non-aromatic)'] + df['C=C-C=O in non-aromatic ring']
    df['ConfigurationalComplexity'] = df['NumOfConf'] / df['MW']

    # More derived features
    df = pd.get_dummies(df, columns=['parentspecies'], drop_first=True)
    df['HydrogenBondPotential'] = df['NumHBondDonors'] + (df['NumOfO'] + df['NumOfN'])
    polar_groups = ['hydroxyl (alkyl)', 'aldehyde', 'ketone', 'carboxylic acid', 'ester', 'nitro']
    df['PolarGroupCount'] = df[polar_groups].sum(axis=1)
    df['AromaticGroupFraction'] = df['aromatic hydroxyl'] / (df[polar_groups + ['aromatic hydroxyl']].sum(axis=1) + 1e-9)
    df['MolecularSize'] = df['MW'] + df['NumOfAtoms']
    df['DegreeOfUnsaturation'] = df['NumOfC'] + 1 - (df['NumOfAtoms'] - df['NumOfC']) / 2
    df['FlexibilityRatio'] = df['NumOfConfUsed'] / (df['NumOfConf'] + 1e-9)
    df['PolarityIndex'] = (df['NumOfO'] + df['NumOfN'] + df[polar_groups].sum(axis=1)) / (df['NumOfC'] + df['C=C (non-aromatic)'] + 1e-9)
    df['Hydrophobicity'] = df['NumOfC'] / (df['NumOfAtoms'] + 1e-9)
    functional_groups = [
        'C=C (non-aromatic)', 'hydroxyl (alkyl)', 'aldehyde', 'ketone',
        'carboxylic acid', 'ester', 'ether (alicyclic)', 'nitrate',
        'nitro', 'aromatic hydroxyl', 'carbonylperoxynitrate', 'peroxide',
        'hydroperoxide', 'carbonylperoxyacid', 'nitroester'
    ]
    for group in functional_groups:
        df[f'{group}_Indicator'] = (df[group] > 0).astype(int)

    symmetric_groups = ['ester', 'ether (alicyclic)', 'ketone']
    asymmetric_groups = ['aldehyde', 'carboxylic acid']
    df['SymmetryIndex'] = df[symmetric_groups].sum(axis=1) / (df[symmetric_groups + asymmetric_groups].sum(axis=1) + 1e-9)
    df['ShapeCompactness'] = df['NumOfAtoms'] / (df['NumOfConfUsed'] + 1e-9)
    df['HydrogenBondDensity'] = df['NumHBondDonors'] / (df['MW'] + 1e-9)
    df['VolatilityIndex'] = (
        df['NumOfC'] / (df[polar_groups].sum(axis=1) + 1e-9) *
        1 / (df['MW'] + 1e-9)
    )
    df['OxygenToCarbonRatio'] = df['NumOfO'] / (df['NumOfC'] + 1e-9)
    #df = df.drop(columns=['NumOfAtoms', 'NumOfC', 'NumOfO', 'NumOfN', 'C=C (non-aromatic)', 'hydroxyl (alkyl)', 'aldehyde', 'ketone', 'carboxylic acid', 'ester', 'ether (alicyclic)', 'nitrate', 'nitro', 'aromatic hydroxyl', 'carbonylperoxynitrate', 'peroxide', 'hydroperoxide', 'carbonylperoxyacid', 'nitroester', 'NumOfConf', 'NumOfConfUsed'], axis=1)


    # df['C:O_ratio'] = df['NumOfC'] / df['NumOfO']
    # df['C:N_ratio'] = df['NumOfC'] / df['NumOfN']
    # df['Percent_C'] = (df['NumOfC'] / df['NumOfAtoms'])
    # df['Percent_O'] = (df['NumOfO'] / df['NumOfAtoms'])
    # df['Percent_N'] = (df['NumOfN'] / df['NumOfAtoms'])
    # df['HBD_fraction'] = df['NumHBondDonors'] / df['NumOfAtoms']
    # df['FractionOfConfsUsed'] = df['NumOfConfUsed'] / df['NumOfConf']
    # df['Has_Conjugated_System'] = np.where((df['C=C-C=O in non-aromatic ring'] > 0) | (df['C=C (non-aromatic)'] > 0), 1, 0)

    # X_train_gbr['NumOfConf'] = np.log(X_train_gbr['NumOfConf'])
    # X_train_gbr['NumOfConfUsed'] = (X_train_gbr['NumOfConfUsed'] == 40).astype(int)
    drop_cols = [
        # 'NumOfC',
        # 'NumOfO',
        # 'NumOfN',
        # 'NumOfAtoms',
        # 'NumHBondDonors',
        # 'carboxylic acid',
        # 'MW',
        # 'C=C-C=O in non-aromatic ring',
        # 'C=C (non-aromatic)',
        #'NumOfConf',
        'NumOfConfUsed',
        'parentspecies',
    ]
    #df = df.drop(columns=drop_cols, axis=1)

    return df

## Mallien kouluttaminen

In [5]:
X_train, y_train = df_train.drop(columns=['log_pSat_Pa', 'ID']), df_train['log_pSat_Pa']
X_test = df_test.drop(columns=['ID'])
X_train_gbr = transform_gbr(X_train)
X_test_gbr = transform_gbr(X_test)
X_train_gbr.info()

#selected_features = select_features(X_train_gbr, y_train)
#X_train_gbr = X_train_gbr[selected_features]
#X_test_gbr = X_test_gbr[selected_features]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26637 entries, 0 to 26636
Data columns (total 65 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   MW                                 26637 non-null  float64
 1   NumOfAtoms                         26637 non-null  int64  
 2   NumOfC                             26637 non-null  int64  
 3   NumOfO                             26637 non-null  int64  
 4   NumOfN                             26637 non-null  int64  
 5   NumHBondDonors                     26637 non-null  int64  
 6   NumOfConf                          26637 non-null  float64
 7   NumOfConfUsed                      26637 non-null  float64
 8   C=C (non-aromatic)                 26637 non-null  int64  
 9   C=C-C=O in non-aromatic ring       26637 non-null  int64  
 10  hydroxyl (alkyl)                   26637 non-null  int64  
 11  aldehyde                           26637 non-null  int

Erilaisia malleja alla. Kaikki mallit käyttävät samaa nimeä `model`, eli vain viimeisenä koulutettu vaikuttaa tiedostoon `submission.csv`.

### 1. Dummy

In [644]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train)

df_test['log_pSat_Pa'] = model.predict(X_test)

### 2. Yksinkertainen regressio

In [None]:
import itertools
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from scipy.stats import boxcox, yeojohnson
import sys



# Assuming evaluate_model is already defined
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    return scores.mean()

always_include = ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'ketone_Indicator', 'nitroester_Indicator', 'hydroxyl (alkyl)', 'carbonylperoxyacid', 'NumOfConfUsed', 'aldehyde', 'SymmetryIndex', 'NumOfConf', 'NumOfC', 'nitro', 'C=C (non-aromatic)', 'C=C (non-aromatic)_Indicator', 'nitrate', 'carbonylperoxyacid_Indicator', 'hydroperoxide_Indicator', 'nitroester', 'carboxylic acid_Indicator', 'aromatic hydroxyl_Indicator', 'NumOfO', 'ether (alicyclic)_Indicator']

remaining_cols = [col for col in X_train_gbr.columns if col not in always_include]
print(remaining_cols)
#X_train_gbr = auto_transform(X_train_gbr, y_train)
#X_train_gbr['DegreeOfUnsaturation'] = X_train_gbr['DegreeOfUnsaturation'].fillna(0)

#model = LinearRegression()
model = make_pipeline(StandardScaler(), SVR(C=3.3, epsilon=0.205, kernel='rbf'))

# print head of each always_include column's value counts
#for col in always_include:
#    print(X_train_gbr[col].value_counts().head())

#sys.exit()

best_score = -np.inf
best_combination = None

mode = 'remove'
approved_transformations = [
    ('AtomFraction', 'cube'),
    ('GroupDensity_CarboxylicAcid', 'sqrt'),
    ('ConfigurationalComplexity', 'yeojohnson'),
    ('ShapeCompactness', 'log'),
    ('VolatilityIndex', 'reciprocal'),
    ('FlexibilityRatio', 'sqrt'),
    ('PolarityIndex', 'sqrt'),
    ('NumOfConfUsed', 'boxcox'),
    ('DegreeOfUnsaturation', 'yeojohnson'),
    ('Hydrophobicity', 'yeojohnson'),
    ('HydrogenBondPotential', 'none'),
    ('HBondDensity', 'none'),
    ('SymmetryIndex', 'sqrt'),
    ('NumOfConf', 'sqrt'),
    ('NumOfC', 'cube'),
    ('NumOfO', 'sqrt'),

]
approved_transformations_OLR = [
    ('AtomFraction', 'reciprocal'),
    ('FlexibilityRatio', 'log'),
    ('Hydrophobicity', 'reciprocal'),
    ('PolarGroupCount', 'sqrt'),
    ('VolatilityIndex', 'boxcox'),
    ('HBondDensity', 'square'),
    ('DegreeOfUnsaturation', 'yeojohnson'),
    ('GroupDensity_CarboxylicAcid', 'yeojohnson'),
    ('carbonylperoxynitrate_Indicator', 'reciprocal'),
    ('PolarityIndex', 'yeojohnson'),
    ('carbonylperoxyacid', 'log'),
    ]

transformations = {
    #'none': lambda x: x,
    'log': lambda x: np.log(x + 1e-9),
    'sqrt': lambda x: np.sqrt(x),
    'square': lambda x: x**2,
    'cube': lambda x: x**3,
    'exp': lambda x: np.exp(x),
    'reciprocal': lambda x: 1 / (x + 1e-9),
    'boxcox': lambda x: boxcox(x + 1e-9)[0] if (x > 0).all() else x,
    'yeojohnson': lambda x: yeojohnson(x)[0]
}


if mode == 'add':
    for L in range(0, len(remaining_cols) + 1):
        for subset in itertools.combinations(remaining_cols, L):
            current_combination = list(always_include) + list(subset)
            X_subset = X_train_gbr[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == 'remove':
    X_transformed = X_train_gbr.copy()
    for col, trans_name in approved_transformations:
        if trans_name in transformations:
            X_transformed[col] = transformations[trans_name](X_transformed[col].astype(float))
    for L in range(0, len(always_include) + 1):
        for subset in itertools.combinations(always_include, L):
            current_combination = [col for col in always_include if col not in subset]
            X_subset = X_transformed[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == "replace":
    for included_col in always_include:
        for excluded_col in remaining_cols:
            current_combination = [col for col in always_include if col != included_col] + [excluded_col]
            X_subset = X_train_gbr[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == "transform":
    X_transformed = X_train_gbr.copy()
    for col, trans_name in approved_transformations:
        if trans_name in transformations:
            X_transformed[col] = transformations[trans_name](X_transformed[col].astype(float))
    
    ### Otetaan baseline ilman muunnoksia ensin
    X_subset = X_transformed[always_include]
    model.fit(X_subset, y_train)
    baseline_score = evaluate_model(model, X_subset, y_train)
    print(f'Baseline score with no transformation: {baseline_score:.10f}')
    
    ### Käydään kaikki transformaatiot läpi
    for col in always_include:
        if any(col == approved_col for approved_col, _ in approved_transformations):
            continue  # Skip columns with approved transformations
        if X_train_gbr[col].nunique() < 10:
            continue
        for name, transform in transformations.items():
            try:
                X_temp = X_transformed.copy()
                X_temp[col] = transform(X_temp[col].astype(float))
                
                if np.any(np.isinf(X_temp[col])) or np.any(np.abs(X_temp[col]) > 1e10):
                    raise ValueError(f"Transformation {name} for {col} resulted in infinities or excessively large values.")
                
                X_subset = X_temp[always_include]
                model.fit(X_subset, y_train)
                score = evaluate_model(model, X_subset, y_train)
                print(f'Transformation: {col} with {name} transformation, score: {score:.10f}')
                if score > best_score:
                    best_score = score
                    best_combination = (col, name)
                    #print(f'New best transformation: {col} with {name} transformation, score: {best_score:.10f}')
            except Exception as e:
                print(f"Skipping transformation {name} for {col} due to error: {e}")
elif mode == "transform_replace":
    for included_col in always_include:
        for excluded_col in remaining_cols:
            for name, transform in transformations.items():
                try:
                    X_temp = X_train_gbr.copy()
                    X_temp[excluded_col] = transform(X_temp[excluded_col].astype(float))
                    
                    # Check for infinities or excessively large values
                    if np.any(np.isinf(X_temp[excluded_col])) or np.any(np.abs(X_temp[excluded_col]) > 1e10):
                        raise ValueError(f"Transformation {name} for {excluded_col} resulted in infinities or excessively large values.")
                    
                    current_combination = [col for col in always_include if col != included_col] + [excluded_col]
                    X_subset = X_temp[current_combination]
                    model.fit(X_subset, y_train)
                    score = evaluate_model(model, X_subset, y_train)
                    
                    if score > best_score:
                        best_score = score
                        best_combination = (included_col, excluded_col, name)
                        print(f'New best transformation and replacement: Replace {included_col} with {excluded_col} using {name} transformation, score: {best_score:.10f}')
                except Exception as e:
                    print(f"Skipping transformation {name} for {excluded_col} due to error: {e}")
elif mode == "transform_add":
    for excluded_col in remaining_cols:
        for name, transform in transformations.items():
            try:
                X_temp = X_train_gbr.copy()
                X_temp[excluded_col] = transform(X_temp[excluded_col].astype(float))
                
                # Check for infinities or excessively large values
                if np.any(np.isinf(X_temp[excluded_col])) or np.any(np.abs(X_temp[excluded_col]) > 1e10):
                    raise ValueError(f"Transformation {name} for {excluded_col} resulted in infinities or excessively large values.")
                
                current_combination = list(always_include) + [excluded_col]
                X_subset = X_temp[current_combination]
                model.fit(X_subset, y_train)
                score = evaluate_model(model, X_subset, y_train)
                
                if score > best_score:
                    best_score = score
                    best_combination = (excluded_col, name)
                    print(f'New best transformation and addition: Add {excluded_col} using {name} transformation, score: {best_score:.10f}')
            except Exception as e:
                print(f"Skipping transformation {name} for {excluded_col} due to error: {e}")

print(f'Best combination: {best_combination} with score: {best_score:.10f}')


['MW', 'NumOfAtoms', 'NumOfN', 'NumHBondDonors', 'C=C-C=O in non-aromatic ring', 'ketone', 'carboxylic acid', 'ester', 'ether (alicyclic)', 'aromatic hydroxyl', 'carbonylperoxynitrate', 'peroxide', 'hydroperoxide', 'Polarity', 'Unsaturation', 'parentspecies_apin_decane', 'parentspecies_apin_decane_toluene', 'parentspecies_apin_toluene', 'parentspecies_decane', 'parentspecies_decane_toluene', 'AromaticGroupFraction', 'MolecularSize', 'nitrate_Indicator', 'HydrogenBondDensity', 'OxygenToCarbonRatio']
New best combination: ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'hydroxyl (alkyl)_Indicator', 'ketone_Indicator', 'nitr

KeyboardInterrupt: 

In [9]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox, yeojohnson

def apply_approved_transformations(df, approved_transformations):
    transformations = {
        'none': lambda x: x,
        'log': lambda x: np.log(x + 1e-9),  # Adding a small constant to avoid log(0)
        'sqrt': lambda x: np.sqrt(x),
        'square': lambda x: x**2,
        'cube': lambda x: x**3,
        'exp': lambda x: np.exp(x),
        'reciprocal': lambda x: 1 / (x + 1e-9),  # Adding a small constant to avoid division by zero
        'boxcox': lambda x: boxcox(x + 1e-9)[0] if (x > 0).all() else x,  # Box-Cox requires positive values
        'yeojohnson': lambda x: yeojohnson(x)[0]  # Yeo-Johnson can handle zero and negative values
    }
    
    df_transformed = df.copy()
    for col, trans_name in approved_transformations:
        if trans_name in transformations:
            try:
                df_transformed[col] = transformations[trans_name](df_transformed[col].astype(float))
            except Exception as e:
                print(f"Skipping transformation {trans_name} for {col} due to error: {e}")
    
    return df_transformed

### 3. Random Forest

In [None]:

approved_transformations = [
    ('AtomFraction', 'reciprocal'),
    ('FlexibilityRatio', 'log'),
    ('Hydrophobicity', 'reciprocal'),
    ('PolarGroupCount', 'sqrt'),
    ('VolatilityIndex', 'boxcox'),
    ('HBondDensity', 'square'),
    ('DegreeOfUnsaturation', 'yeojohnson'),
    ('GroupDensity_CarboxylicAcid', 'yeojohnson'),
    ('carbonylperoxynitrate_Indicator', 'reciprocal'),
    ('PolarityIndex', 'yeojohnson'),
    ('carbonylperoxyacid', 'log'),
    ]
included = ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'hydroxyl (alkyl)_Indicator', 'ketone_Indicator', 'nitroester_Indicator', 'hydroxyl (alkyl)', 'carbonylperoxyacid', 'NumOfConfUsed', 'aldehyde', 'SymmetryIndex', 'NumOfConf', 'NumOfC', 'nitro', 'C=C (non-aromatic)', 'C=C (non-aromatic)_Indicator', 'nitrate', 'carbonylperoxyacid_Indicator', 'hydroperoxide_Indicator', 'nitroester', 'carboxylic acid_Indicator', 'aromatic hydroxyl_Indicator', 'NumOfO', 'ether (alicyclic)_Indicator']
X_train_sel = X_train_gbr[included]
X_test_sel = X_test_gbr[included]

X_train_sel = apply_approved_transformations(X_train_sel, approved_transformations)
X_test_sel = apply_approved_transformations(X_test_sel, approved_transformations)

model = RandomForestRegressor(random_state=190)
model.fit(X_train_sel, y_train)

evaluate_model(model, X_train_sel, y_train)

np.float64(0.714071177562855)

### 4. Gradient Boosting Regressor

In [None]:
# OPTUNA

import optuna
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR


def objective(trial):
    gbr_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.2),
        "loss": "squared_error",
        "random_state": 190,
        "subsample": trial.suggest_float("subsample", 0.3, 0.9),
    }
        
    model = GradientBoostingRegressor(**gbr_params)

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    return score

study_name = "group-190-669"
storage = "sqlite:///optuna_190.sqlite3"

study = optuna.create_study(
    direction="maximize",
    #sampler=optuna.samplers.TPESampler(seed=190),
    study_name=study_name,
    storage=storage,
    load_if_exists=True
)

#study.optimize(objective, n_trials=5)

#loaded_study = optuna.load_study(study_name=study_name, storage=storage)

#print(f"The best score: {loaded_study.best_value}")
#print(f"The best hyperparameter combination: {loaded_study.best_params}")



[I 2024-11-24 17:55:41,161] Using an existing study with name 'group-190-669' instead of creating a new one.


In [None]:
gbr_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "learning_rate": 0.1,
    "loss": "squared_error",
    "random_state": 42,
    # "subsample": 0.6,
}

approved_transformations = [
    ('AtomFraction', 'reciprocal'),
    ('FlexibilityRatio', 'log'),
    ('Hydrophobicity', 'reciprocal'),
    ('PolarGroupCount', 'sqrt'),
    ('VolatilityIndex', 'boxcox'),
    ('HBondDensity', 'square'),
    ('DegreeOfUnsaturation', 'yeojohnson'),
    ('GroupDensity_CarboxylicAcid', 'yeojohnson'),
    ('carbonylperoxynitrate_Indicator', 'reciprocal'),
    ('PolarityIndex', 'yeojohnson'),
    ('carbonylperoxyacid', 'log'),
    ]
included = ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'hydroxyl (alkyl)_Indicator', 'ketone_Indicator', 'nitroester_Indicator', 'hydroxyl (alkyl)', 'carbonylperoxyacid', 'NumOfConfUsed', 'aldehyde', 'SymmetryIndex', 'NumOfConf', 'NumOfC', 'nitro', 'C=C (non-aromatic)', 'C=C (non-aromatic)_Indicator', 'nitrate', 'carbonylperoxyacid_Indicator', 'hydroperoxide_Indicator', 'nitroester', 'carboxylic acid_Indicator', 'aromatic hydroxyl_Indicator', 'NumOfO', 'ether (alicyclic)_Indicator']
X_train_sel = X_train_gbr[included]
X_test_sel = X_test_gbr[included]

X_train_sel = apply_approved_transformations(X_train_sel, approved_transformations)
X_test_sel = apply_approved_transformations(X_test_sel, approved_transformations)

    
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train_sel, y_train)

evaluate_model(model, X_train_sel, y_train)



np.float64(0.7393820305668634)

# 5. SVR

In [None]:
# OPTUNA

import optuna
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR


def objective(trial):
    # C=1.0, epsilon=0.2, kernel='rbf'
    svr_params = {        
        "C": trial.suggest_float("C", 3.2, 3.8),
        "epsilon": trial.suggest_float("epsilon", 0.35, 0.45),
        "kernel": 'rbf',
        "degree": trial.suggest_int("degree", 45, 160),
        "gamma": trial.suggest_float("gamma", 0.014, 0.020),
        "coef0": trial.suggest_float("coef0", 0.032, 0.038),
        #"shrinking": trial.suggest_categorical("shrinking", [True, False]),
        "tol": trial.suggest_float("tol", 0.5e-5, 1.5e-5),
        #"cache_size": 200,
        #"verbose": False,
        #"max_iter": -1,
    }

    approved_transformations = [
        ('AtomFraction', 'cube'),
        ('GroupDensity_CarboxylicAcid', 'sqrt'),
        ('ConfigurationalComplexity', 'yeojohnson'),
        ('ShapeCompactness', 'log'),
        ('VolatilityIndex', 'reciprocal'),
        ('FlexibilityRatio', 'sqrt'),
        ('PolarityIndex', 'sqrt'),
        ('NumOfConfUsed', 'boxcox'),
        ('DegreeOfUnsaturation', 'yeojohnson'),
        ('Hydrophobicity', 'yeojohnson'),
        ('HydrogenBondPotential', 'none'),
        ('HBondDensity', 'none'),
        ('SymmetryIndex', 'sqrt'),
        ('NumOfConf', 'sqrt'),
        ('NumOfC', 'cube'),
        ('NumOfO', 'sqrt'),

    ]
    included = ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'ketone_Indicator', 'nitroester_Indicator', 'hydroxyl (alkyl)', 'carbonylperoxyacid', 'NumOfConfUsed', 'aldehyde', 'SymmetryIndex', 'NumOfConf', 'NumOfC', 'nitro', 'C=C (non-aromatic)', 'C=C (non-aromatic)_Indicator', 'nitrate', 'carbonylperoxyacid_Indicator', 'hydroperoxide_Indicator', 'nitroester', 'carboxylic acid_Indicator', 'aromatic hydroxyl_Indicator', 'NumOfO', 'ether (alicyclic)_Indicator']
    X_train_sel = X_train_gbr[included]
    X_test_sel = X_test_gbr[included]

    X_train_sel = apply_approved_transformations(X_train_sel, approved_transformations)
    X_test_sel = apply_approved_transformations(X_test_sel, approved_transformations)

    model = make_pipeline(StandardScaler(), SVR(**svr_params))
    score = cross_val_score(model, X_train_sel, y_train, cv=5, scoring='r2').mean()
    return score

study_name = "group-190-667f"
storage = "sqlite:///optuna_190.sqlite3"

study = optuna.create_study(
    direction="maximize",
    #sampler=optuna.samplers.TPESampler(seed=190),
    study_name=study_name,
    storage=storage,
    load_if_exists=True
)

study.optimize(objective, n_trials=100)

loaded_study = optuna.load_study(study_name=study_name, storage=storage)

print(f"The best score: {loaded_study.best_value}")
print(f"The best hyperparameter combination: {loaded_study.best_params}")

# [I 2024-11-28 03:56:55,052] Trial 162 finished with value: 0.7550233244230112 and parameters: {'C': 3.3889499180212046, 'epsilon': 0.3949724292326939, 'gamma': 0.01629368746690572, 'coef0': 0.035253379505719544}. Best is trial 162 with value: 0.7550233244230112.
# [I 2024-11-28 10:28:52,385] Trial 211 finished with value: 0.7550375967452467 and parameters: {'C': 3.495788287556519, 'epsilon': 0.4100394880570758, 'gamma': 0.017128803431606398, 'coef0': 0.033820308817272056, 'tol': 1.5237910149381403e-05}. Best is trial 211 with value: 0.7550375967452467.
# [I 2024-11-28 10:51:24,245] Trial 214 finished with value: 0.7550381054851701 and parameters: {'C': 3.5008365564570987, 'epsilon': 0.4102253260252876, 'gamma': 0.017129461016578463, 'coef0': 0.03382417774707513, 'tol': 1.2799003456300463e-05}. Best is trial 214 with value: 0.7550381054851701.
# [I 2024-11-28 11:57:47,695] Trial 230 finished with value: 0.7550386104917828 and parameters: {'C': 3.525966730359343, 'epsilon': 0.4161806381425711, 'degree': 50, 'gamma': 0.017044909846706827, 'coef0': 0.03355524800380342, 'tol': 1.4992408322713575e-05}. Best is trial 230 with value: 0.7550386104917828.
# Trial 245 finished with value: 0.755060795916482 and parameters: {'C': 3.562419636487071, 'epsilon': 0.41491928287107616, 'degree': 49, 'gamma': 0.016956921892014226, 'coef0': 0.033605859008015805, 'tol': 1.2410612671358723e-05}. Best is trial 245 with value: 0.755060795916482.
# Trial 266 finished with value: 0.7550800203964114 and parameters: {'C': 3.5932643801855946, 'epsilon': 0.4198403843342681, 'degree': 50, 'gamma': 0.016636022497975342, 'coef0': 0.03323365359851923, 'tol': 1.3296053068995646e-05}. Best is trial 266 with value: 0.7550800203964114.
#Trial 325 finished with value: 0.7551044756099635 and parameters: {'C': 3.6799102291229433, 'epsilon': 0.4223672555813418, 'degree': 78, 'gamma': 0.016533970458680023, 'coef0': 0.0331150871485373, 'tol': 1.4266551897047913e-05}. Best is trial 325 with value: 0.7551044756099635.

[I 2024-11-28 11:52:06,903] Using an existing study with name 'group-190-667f' instead of creating a new one.
[I 2024-11-28 11:54:58,125] Trial 229 finished with value: 0.7550380842626584 and parameters: {'C': 3.5169525044878216, 'epsilon': 0.41593364547566913, 'degree': 49, 'gamma': 0.017043034574977915, 'coef0': 0.03352070905873489, 'tol': 1.4991888934634567e-05}. Best is trial 214 with value: 0.7550381054851701.
[I 2024-11-28 11:57:47,695] Trial 230 finished with value: 0.7550386104917828 and parameters: {'C': 3.525966730359343, 'epsilon': 0.4161806381425711, 'degree': 50, 'gamma': 0.017044909846706827, 'coef0': 0.03355524800380342, 'tol': 1.4992408322713575e-05}. Best is trial 230 with value: 0.7550386104917828.
[I 2024-11-28 12:00:41,919] Trial 231 finished with value: 0.7550361393604244 and parameters: {'C': 3.51640460861277, 'epsilon': 0.4160091653583347, 'degree': 50, 'gamma': 0.01705582623969709, 'coef0': 0.03339665548067246, 'tol': 1.4996984906908675e-05}. Best is trial 230 w

In [None]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

#C': 5268845153126, 'epsilon': 0.16816582344166467, 'degree': 33, 'coef0': 0.03616869628689967
# 'C': , 'epsilon': 0.20653558438803182, 'degree': 60, 'coef0': 0.03747367910141413}. Best is trial 15 with value: 0.7499176794994977.
# use apply_approved_transformations to apply transformations
approved_transformations = [
    ('AtomFraction', 'reciprocal'),
    ('FlexibilityRatio', 'log'),
    ('Hydrophobicity', 'reciprocal'),
    ('PolarGroupCount', 'sqrt'),
    ('VolatilityIndex', 'boxcox'),
    ('HBondDensity', 'square'),
    ('DegreeOfUnsaturation', 'yeojohnson'),
    ('GroupDensity_CarboxylicAcid', 'yeojohnson'),
    ('carbonylperoxynitrate_Indicator', 'reciprocal'),
    ('PolarityIndex', 'yeojohnson'),
    ('carbonylperoxyacid', 'log'),
    ]
included = ['AtomFraction', 'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity', 'carbonylperoxynitrate_Indicator', 'PolarityIndex', 'parentspecies_apin', 'aldehyde_Indicator', 'peroxide_Indicator', 'ShapeCompactness', 'FlexibilityRatio', 'VolatilityIndex', 'PolarGroupCount', 'HBondDensity', 'HydrogenBondPotential', 'nitro_Indicator', 'ester_Indicator', 'parentspecies_toluene', 'DegreeOfUnsaturation', 'Hydrophobicity', 'hydroxyl (alkyl)_Indicator', 'ketone_Indicator', 'nitroester_Indicator', 'hydroxyl (alkyl)', 'carbonylperoxyacid', 'NumOfConfUsed', 'aldehyde', 'SymmetryIndex', 'NumOfConf', 'NumOfC', 'nitro', 'C=C (non-aromatic)', 'C=C (non-aromatic)_Indicator', 'nitrate', 'carbonylperoxyacid_Indicator', 'hydroperoxide_Indicator', 'nitroester', 'carboxylic acid_Indicator', 'aromatic hydroxyl_Indicator', 'NumOfO', 'ether (alicyclic)_Indicator']
X_train_sel = X_train_gbr[included]
X_test_sel = X_test_gbr[included]

X_train_sel = apply_approved_transformations(X_train_sel, approved_transformations)
X_test_sel = apply_approved_transformations(X_test_sel, approved_transformations)
model = make_pipeline(StandardScaler(), SVR(C=3.3, epsilon=0.205, kernel='rbf'))
model.fit(X_train_sel, y_train)

evaluate_model(model, X_train_sel, y_train)
# degree=100 np.float64(0.7505216318650372)
# np.float64(0.7505328719492336)
#model = make_pipeline(StandardScaler(), SVR(C=3.30, epsilon=0.20653558438803182, kernel='rbf', degree=100, coef0=0.03747367910141413))
# np.float64(0.7505287163472338)
# model = make_pipeline(StandardScaler(), SVR(C=3, epsilon=0.205, kernel='rbf'))



np.float64(0.7494715461746723)

## Ennustuksen tallentaminen

In [None]:
df_test['log_pSat_Pa'] = model.predict(X_test_gbr.drop(columns=['log_pSat_Pa'], axis=1))
df_test[['ID', 'log_pSat_Pa']].to_csv('../submission/submission.csv', index=False)