# Ryhmä-190

## Python-paketit

In [12]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox, yeojohnson
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate

## Hyödylliset funktiot

In [13]:
def add_meaningful_features(orig_df):
    """Return a copy of ``orig_df`` extended with engineered descriptor columns.

    The input frame is not modified. Infinite and missing values in the input
    are replaced by 0 before any feature is derived, ``parentspecies`` is
    one-hot encoded (first level dropped), and a set of ratio, sum and
    indicator features is appended.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Must contain the columns read below (``MW``, ``NumOfAtoms``,
        ``NumOfC``, ``NumOfO``, ``NumOfN``, ``NumHBondDonors``, ``NumOfConf``,
        ``NumOfConfUsed``, ``parentspecies``, ``C=C-C=O in non-aromatic ring``
        and every functional-group count column listed in
        ``functional_groups``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (minus ``parentspecies``, which
        is replaced by its dummy columns) plus the engineered features.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    df = orig_df.copy()
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Ratios with an unguarded denominator.
    df['AtomFraction'] = (df['NumOfC'] + df['NumOfO'] + df['NumOfN']) / df['NumOfAtoms']
    df['Polarity'] = df['NumHBondDonors'] / df['MW']
    df['HBondDensity'] = df['NumHBondDonors'] / df['NumOfAtoms']
    df['GroupDensity_CarboxylicAcid'] = df['carboxylic acid'] / df['MW']
    df['Unsaturation'] = df['C=C (non-aromatic)'] + df['C=C-C=O in non-aromatic ring']
    df['ConfigurationalComplexity'] = df['NumOfConf'] / df['MW']

    # The scrub above ran before these ratios existed, so a zero MW or
    # NumOfAtoms (including a missing value that was just filled with 0) would
    # leave inf/NaN in the output. Scrub the unguarded ratios the same way the
    # inputs were scrubbed; rows with non-zero denominators are unaffected.
    unguarded_ratios = [
        'AtomFraction', 'Polarity', 'HBondDensity',
        'GroupDensity_CarboxylicAcid', 'ConfigurationalComplexity',
    ]
    df[unguarded_ratios] = (
        df[unguarded_ratios].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    # NOTE(review): the dummy columns depend on the levels present in this
    # particular frame, so train and test are only guaranteed to get the same
    # columns if both contain the same parentspecies levels.
    df = pd.get_dummies(df, columns=['parentspecies'], drop_first=True)
    df['HydrogenBondPotential'] = df['NumHBondDonors'] + (df['NumOfO'] + df['NumOfN'])
    polar_groups = ['hydroxyl (alkyl)', 'aldehyde', 'ketone', 'carboxylic acid', 'ester', 'nitro']
    df['PolarGroupCount'] = df[polar_groups].sum(axis=1)
    # From here on the 1e-9 offsets keep the denominators away from exactly zero.
    df['AromaticGroupFraction'] = df['aromatic hydroxyl'] / (df[polar_groups + ['aromatic hydroxyl']].sum(axis=1) + 1e-9)
    df['MolecularSize'] = df['MW'] + df['NumOfAtoms']
    df['DegreeOfUnsaturation'] = df['NumOfC'] + 1 - (df['NumOfAtoms'] - df['NumOfC']) / 2
    df['FlexibilityRatio'] = df['NumOfConfUsed'] / (df['NumOfConf'] + 1e-9)
    df['PolarityIndex'] = (df['NumOfO'] + df['NumOfN'] + df[polar_groups].sum(axis=1)) / (df['NumOfC'] + df['C=C (non-aromatic)'] + 1e-9)
    df['Hydrophobicity'] = df['NumOfC'] / (df['NumOfAtoms'] + 1e-9)

    # One 0/1 presence flag per functional-group count column.
    functional_groups = [
        'C=C (non-aromatic)', 'hydroxyl (alkyl)', 'aldehyde', 'ketone',
        'carboxylic acid', 'ester', 'ether (alicyclic)', 'nitrate',
        'nitro', 'aromatic hydroxyl', 'carbonylperoxynitrate', 'peroxide',
        'hydroperoxide', 'carbonylperoxyacid', 'nitroester'
    ]
    for group in functional_groups:
        df[f'{group}_Indicator'] = (df[group] > 0).astype(int)

    symmetric_groups = ['ester', 'ether (alicyclic)', 'ketone']
    asymmetric_groups = ['aldehyde', 'carboxylic acid']
    df['SymmetryIndex'] = df[symmetric_groups].sum(axis=1) / (df[symmetric_groups + asymmetric_groups].sum(axis=1) + 1e-9)
    df['ShapeCompactness'] = df['NumOfAtoms'] / (df['NumOfConfUsed'] + 1e-9)
    # Same quantity as 'Polarity' above, but with a guarded denominator.
    df['HydrogenBondDensity'] = df['NumHBondDonors'] / (df['MW'] + 1e-9)
    # Evaluates left to right: NumOfC / polar-group count / MW.
    df['VolatilityIndex'] = (
        df['NumOfC'] / (df[polar_groups].sum(axis=1) + 1e-9) *
        1 / (df['MW'] + 1e-9)
    )
    df['OxygenToCarbonRatio'] = df['NumOfO'] / (df['NumOfC'] + 1e-9)

    return df

In [14]:
def apply_transformations(df, selected_transformations, fitted_params=None):
    """Apply named per-column transformations and return a transformed copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    selected_transformations : iterable of (column, name)
        ``name`` is one of ``'none'``, ``'log'``, ``'sqrt'``, ``'square'``,
        ``'cube'``, ``'exp'``, ``'reciprocal'``, ``'boxcox'``,
        ``'yeojohnson'``. Pairs with any other name are skipped and the
        column is left untouched.
    fitted_params : dict or None, optional
        Shared store for the lambdas of the two data-dependent transforms
        (``'boxcox'`` and ``'yeojohnson'``), keyed by ``(column, name)``.
        With the default ``None`` every call estimates its own lambda. When
        a dict is passed, a lambda is estimated and stored the first time a
        key is seen and reused afterwards, so fitting on the training frame
        and then passing the same dict for the test frame applies the
        identical mapping to both.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns cast to float and transformed.
    """
    elementwise = {
        'none': lambda x: x,
        'log': lambda x: np.log(x + 1e-9),
        'sqrt': lambda x: np.sqrt(x),
        'square': lambda x: x**2,
        'cube': lambda x: x**3,
        'exp': lambda x: np.exp(x),
        'reciprocal': lambda x: 1 / (x + 1e-9),
    }
    power_transforms = {'boxcox': boxcox, 'yeojohnson': yeojohnson}

    def _power_transform(col, name, values):
        # Reuse a stored lambda when available, otherwise estimate one
        # (and remember it if the caller supplied a store).
        fit = power_transforms[name]
        key = (col, name)
        if fitted_params is not None and key in fitted_params:
            return fit(values, lmbda=fitted_params[key])
        transformed, lmbda = fit(values)
        if fitted_params is not None:
            fitted_params[key] = lmbda
        return transformed

    df_transformed = df.copy()
    for col, func in selected_transformations:
        if func not in elementwise and func not in power_transforms:
            continue  # unknown name: leave the column as it is

        values = df_transformed[col].astype(float)
        if func in elementwise:
            df_transformed[col] = elementwise[func](values)
        elif func == 'boxcox':
            # Box-Cox needs strictly positive data; otherwise the column is
            # only cast to float and left untransformed.
            if (values > 0).all():
                df_transformed[col] = _power_transform(col, func, (values + 1e-9).to_numpy())
            else:
                df_transformed[col] = values
        else:
            df_transformed[col] = _power_transform(col, func, values.to_numpy())

    return df_transformed

In [15]:
def evaluate_model(model, X_train, y_train, random_state=42):
    """Report training-set and 5-fold cross-validated MSE and R^2 for a model.

    Parameters
    ----------
    model : estimator
        Must already be fitted on ``X_train``: the training metrics come from
        calling ``model.predict`` directly, without fitting here.
    X_train, y_train
        Training features and targets.
    random_state : int, default 42
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    tuple
        ``(train_loss, cv_loss_mean, r2_train, r2_cv)`` -- training MSE, mean
        cross-validated MSE, training R^2, mean cross-validated R^2.
    """
    # Metrics of the already fitted model on its own training data.
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    # Both cross-validated metrics come from a single pass over the folds;
    # scoring each metric with its own cross_val_score call would refit the
    # model on every fold twice.
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_results = cross_validate(
        model, X_train, y_train, cv=kf,
        scoring={'mse': make_scorer(mean_squared_error), 'r2': 'r2'},
    )
    cv_loss_mean = cv_results['test_mse'].mean()
    r2_cv = cv_results['test_r2'].mean()

    return train_loss, cv_loss_mean, r2_train, r2_cv

In [16]:
# Selected (column, transformation-name) pairs, in application order.
best_transformations = list({
    'AtomFraction': 'cube',
    'GroupDensity_CarboxylicAcid': 'sqrt',
    'ConfigurationalComplexity': 'yeojohnson',
    'ShapeCompactness': 'log',
    'VolatilityIndex': 'reciprocal',
    'FlexibilityRatio': 'sqrt',
    'PolarityIndex': 'sqrt',
    'NumOfConfUsed': 'boxcox',
    'DegreeOfUnsaturation': 'yeojohnson',
    'Hydrophobicity': 'yeojohnson',
    'HydrogenBondPotential': 'none',
    'HBondDensity': 'none',
    'SymmetryIndex': 'sqrt',
    'NumOfConf': 'sqrt',
    'NumOfC': 'cube',
    'NumOfO': 'sqrt',
}.items())

# Selected model input columns; the order is kept exactly as selected.
best_features = [
    'AtomFraction',
    'GroupDensity_CarboxylicAcid',
    'ConfigurationalComplexity',
    'carbonylperoxynitrate_Indicator',
    'PolarityIndex',
    'parentspecies_apin',
    'aldehyde_Indicator',
    'peroxide_Indicator',
    'ShapeCompactness',
    'FlexibilityRatio',
    'VolatilityIndex',
    'PolarGroupCount',
    'HBondDensity',
    'HydrogenBondPotential',
    'nitro_Indicator',
    'ester_Indicator',
    'parentspecies_toluene',
    'DegreeOfUnsaturation',
    'Hydrophobicity',
    'ketone_Indicator',
    'nitroester_Indicator',
    'hydroxyl (alkyl)',
    'carbonylperoxyacid',
    'NumOfConfUsed',
    'aldehyde',
    'SymmetryIndex',
    'NumOfConf',
    'NumOfC',
    'nitro',
    'C=C (non-aromatic)',
    'C=C (non-aromatic)_Indicator',
    'nitrate',
    'carbonylperoxyacid_Indicator',
    'hydroperoxide_Indicator',
    'nitroester',
    'carboxylic acid_Indicator',
    'aromatic hydroxyl_Indicator',
    'NumOfO',
    'ether (alicyclic)_Indicator',
]

## Tietoaineistojen lataaminen

In [17]:
# Read the train and test CSV files (UTF-8, first row holds the column names).
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8', header=0)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## Datan myllääminen

In [18]:
# Separate the target from the predictors; 'ID' is dropped from both splits
# so that it is not used as a feature.
y_train = df_train['log_pSat_Pa']
X_train = df_train.drop(columns=['log_pSat_Pa', 'ID'])
X_test = df_test.drop(columns=['ID'])

# Same three steps for both splits: add the engineered features, keep only
# the selected columns, then apply the selected per-column transformations.
X_train_trans = apply_transformations(
    add_meaningful_features(X_train)[best_features],
    selected_transformations=best_transformations,
)
X_test_trans = apply_transformations(
    add_meaningful_features(X_test)[best_features],
    selected_transformations=best_transformations,
)


## Hyperparametrien optimointi

In [None]:
import optuna
from sklearn.pipeline import make_pipeline
# StandardScaler lives in sklearn.preprocessing; sklearn.discriminant_analysis
# only happens to import it internally and is not its public home.
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

def objective(trial):
    """Optuna objective: mean 5-fold R^2 of a StandardScaler + RBF-SVR pipeline.

    Samples the SVR hyperparameters from narrow ranges, builds the pipeline
    and scores it on the module-level ``X_train_trans`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated R^2 (the study maximises this value).
    """
    svr_params = {
        "C": trial.suggest_float("C", 3.2, 3.8),
        "epsilon": trial.suggest_float("epsilon", 0.35, 0.45),
        "kernel": 'rbf',
        # NOTE(review): per the scikit-learn SVR documentation, `degree` is
        # only used by the 'poly' kernel and `coef0` only by 'poly' and
        # 'sigmoid', so with kernel='rbf' these two samples should not affect
        # the score. They are kept so the search space stays compatible with
        # the trials already stored in the persisted study.
        "degree": trial.suggest_int("degree", 100, 160),
        "gamma": trial.suggest_float("gamma", 0.014, 0.020),
        "coef0": trial.suggest_float("coef0", 0.032, 0.038),
        "tol": trial.suggest_float("tol", 0.5e-5, 1.5e-5),
        # Left at their SVR defaults: shrinking, cache_size, verbose, max_iter.
    }

    model = make_pipeline(StandardScaler(), SVR(**svr_params))
    # cv=5 is passed as a plain integer here, whereas evaluate_model builds an
    # explicitly shuffled KFold; scores from the two are therefore computed on
    # different splits.
    score = cross_val_score(model, X_train_trans, y_train, cv=5, scoring='r2').mean()
    return score

# Trials are stored in a local SQLite file under a fixed study name, so
# re-running this cell continues the existing study instead of starting over.
storage = "sqlite:///optuna_190.sqlite3"
study_name = "group-190-667f"

study = optuna.create_study(
    study_name=study_name,
    storage=storage,
    direction="maximize",
    load_if_exists=True,
    # sampler=optuna.samplers.TPESampler(seed=190),  # optional seeded sampler
)

# Each run of the cell adds another 100 trials to the study.
study.optimize(objective, n_trials=100)

# Report the best result as read back from storage.
loaded_study = optuna.load_study(study_name=study_name, storage=storage)

print(f"The best score: {loaded_study.best_value}")
print(f"The best hyperparameter combination: {loaded_study.best_params}")

# Best result recorded so far -- trial 367, value 0.7551662254972231, with
# parameters: {'C': 3.746384763392366, 'epsilon': 0.4178386759465985, 'degree': 109, 'gamma': 0.015928184517496148, 'coef0': 0.03219101131725596, 'tol': 1.4843329842875477e-05}

[I 2024-11-28 21:52:54,943] Using an existing study with name 'group-190-667f' instead of creating a new one.
[I 2024-11-28 21:55:53,944] Trial 365 finished with value: 0.7551616528174083 and parameters: {'C': 3.743714072771044, 'epsilon': 0.4181396083593376, 'degree': 109, 'gamma': 0.016054894026586383, 'coef0': 0.03224675604183521, 'tol': 1.488194100684841e-05}. Best is trial 363 with value: 0.7551643425213456.
[I 2024-11-28 21:59:39,434] Trial 366 finished with value: 0.7551647012847951 and parameters: {'C': 3.7440393529083638, 'epsilon': 0.4180445845887296, 'degree': 109, 'gamma': 0.015934893436197604, 'coef0': 0.03225762864511606, 'tol': 1.486922198548235e-05}. Best is trial 366 with value: 0.7551647012847951.
[I 2024-11-28 22:03:05,391] Trial 367 finished with value: 0.7551662254972231 and parameters: {'C': 3.746384763392366, 'epsilon': 0.4178386759465985, 'degree': 109, 'gamma': 0.015928184517496148, 'coef0': 0.03219101131725596, 'tol': 1.4843329842875477e-05}. Best is trial 367

In [None]:
# Final model: standardised inputs feeding an RBF-kernel SVR with fixed
# hyperparameters. It is fitted on the full training set first because
# evaluate_model predicts with the model it is given.
model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=3.3, epsilon=0.205),
)
model.fit(X_train_trans, y_train)

score = evaluate_model(model, X_train_trans, y_train)
print(score)

np.float64(0.7494715461746723)

## Ennustuksen tallentaminen

In [None]:
# Pair each test ID with its prediction and write the submission file.
# The IDs are taken from df_test (X_test_trans had 'ID' dropped before
# feature engineering) and X_test_trans is used as is: it never contains a
# 'log_pSat_Pa' column, so nothing has to be removed before predicting.
submission = pd.DataFrame({
    'ID': df_test['ID'],
    'log_pSat_Pa': model.predict(X_test_trans),
})
submission.to_csv('../submission/submission.csv', index=False)

## Lisää työkaluja

In [None]:
import itertools
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
import sys

# Guard: sys.exit() raises SystemExit, so nothing below this line in the cell
# is executed. NOTE(review): the code that follows reads X_train_gbr,
# transformations and approved_transformations, none of which are defined in
# the visible part of the notebook -- presumably leftover scratch code.
sys.exit()

def evaluate_model(model, X, y):
    """Return the mean R^2 of ``model`` over 5-fold cross-validation on (X, y).

    NOTE(review): this reuses the name of the earlier ``evaluate_model``,
    which returns four values; once this definition runs, it replaces that
    one for every later call.
    """
    return cross_val_score(model, X, y, cv=5, scoring='r2').mean()

# Fixed feature set that every search mode below starts from.
always_include = [
    'AtomFraction',
    'GroupDensity_CarboxylicAcid',
    'ConfigurationalComplexity',
    'carbonylperoxynitrate_Indicator',
    'PolarityIndex',
    'parentspecies_apin',
    'aldehyde_Indicator',
    'peroxide_Indicator',
    'ShapeCompactness',
    'FlexibilityRatio',
    'VolatilityIndex',
    'PolarGroupCount',
    'HBondDensity',
    'HydrogenBondPotential',
    'nitro_Indicator',
    'ester_Indicator',
    'parentspecies_toluene',
    'DegreeOfUnsaturation',
    'Hydrophobicity',
    'ketone_Indicator',
    'nitroester_Indicator',
    'hydroxyl (alkyl)',
    'carbonylperoxyacid',
    'NumOfConfUsed',
    'aldehyde',
    'SymmetryIndex',
    'NumOfConf',
    'NumOfC',
    'nitro',
    'C=C (non-aromatic)',
    'C=C (non-aromatic)_Indicator',
    'nitrate',
    'carbonylperoxyacid_Indicator',
    'hydroperoxide_Indicator',
    'nitroester',
    'carboxylic acid_Indicator',
    'aromatic hydroxyl_Indicator',
    'NumOfO',
    'ether (alicyclic)_Indicator',
]

# Candidate columns: everything in X_train_gbr that is not in the fixed set.
remaining_cols = [name for name in X_train_gbr.columns if name not in always_include]
print(remaining_cols)
# Disabled preprocessing experiments:
#   X_train_gbr = auto_transform(X_train_gbr, y_train)
#   X_train_gbr['DegreeOfUnsaturation'] = X_train_gbr['DegreeOfUnsaturation'].fillna(0)

model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=3.3, epsilon=0.205),
)

# Running best over whichever search mode is selected.
best_score, best_combination = -np.inf, None

# Selects the branch of the search chain below; 'none' matches no branch.
mode = 'none'

# Feature / transformation search, selected by `mode`. Every branch scores a
# candidate with evaluate_model and keeps the running best in best_score /
# best_combination. If `mode` matches none of the branches, only the final
# summary print runs.
# NOTE(review): X_train_gbr, transformations and approved_transformations are
# read below but are not defined in the visible part of the notebook.
# NOTE(review): each branch calls model.fit right before evaluate_model; if
# evaluate_model only cross-validates, that fit looks redundant -- confirm.
if mode == 'add':
    # Exhaustive: tries the fixed set plus every subset of remaining_cols
    # (all subset sizes from 0 up to len(remaining_cols)).
    for L in range(0, len(remaining_cols) + 1):
        for subset in itertools.combinations(remaining_cols, L):
            current_combination = list(always_include) + list(subset)
            X_subset = X_train_gbr[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == 'remove':
    # Apply the already approved transformations first, then try dropping
    # every subset of the fixed set.
    X_transformed = X_train_gbr.copy()
    for col, trans_name in approved_transformations:
        if trans_name in transformations:
            X_transformed[col] = transformations[trans_name](X_transformed[col].astype(float))
    # NOTE(review): this enumerates every subset of always_include, which
    # grows as 2**len(always_include); the last iteration removes all columns
    # and leaves an empty feature list.
    for L in range(0, len(always_include) + 1):
        for subset in itertools.combinations(always_include, L):
            current_combination = [col for col in always_include if col not in subset]
            X_subset = X_transformed[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == "replace":
    # Swap exactly one fixed column for one remaining column, all pairs.
    for included_col in always_include:
        for excluded_col in remaining_cols:
            current_combination = [col for col in always_include if col != included_col] + [excluded_col]
            X_subset = X_train_gbr[current_combination]
            model.fit(X_subset, y_train)
            score = evaluate_model(model, X_subset, y_train)
            
            if score > best_score:
                best_score = score
                best_combination = current_combination
                print(f'New best combination: {best_combination} with score: {best_score:.10f}')
elif mode == "transform":
    # Start from the approved transformations, then try one additional
    # transformation on one column at a time.
    X_transformed = X_train_gbr.copy()
    for col, trans_name in approved_transformations:
        if trans_name in transformations:
            X_transformed[col] = transformations[trans_name](X_transformed[col].astype(float))
    
    ### First take a baseline without any additional transformation
    X_subset = X_transformed[always_include]
    model.fit(X_subset, y_train)
    baseline_score = evaluate_model(model, X_subset, y_train)
    print(f'Baseline score with no transformation: {baseline_score:.10f}')
    
    ### Go through all the transformations
    for col in always_include:
        if any(col == approved_col for approved_col, _ in approved_transformations):
            continue  # Skip columns with approved transformations
        # Skip columns with fewer than 10 distinct values.
        if X_train_gbr[col].nunique() < 10:
            continue
        for name, transform in transformations.items():
            try:
                X_temp = X_transformed.copy()
                X_temp[col] = transform(X_temp[col].astype(float))
                
                # Reject results containing infinities or magnitudes above 1e10;
                # the raise is caught by the except below and reported as a skip.
                if np.any(np.isinf(X_temp[col])) or np.any(np.abs(X_temp[col]) > 1e10):
                    raise ValueError(f"Transformation {name} for {col} resulted in infinities or excessively large values.")
                
                X_subset = X_temp[always_include]
                model.fit(X_subset, y_train)
                score = evaluate_model(model, X_subset, y_train)
                print(f'Transformation: {col} with {name} transformation, score: {score:.10f}')
                if score > best_score:
                    best_score = score
                    best_combination = (col, name)
                    #print(f'New best transformation: {col} with {name} transformation, score: {best_score:.10f}')
            except Exception as e:
                # Best effort: any failure in transform, fit or scoring skips
                # this candidate and the search continues.
                print(f"Skipping transformation {name} for {col} due to error: {e}")
elif mode == "transform_replace":
    # Replace one fixed column with one transformed remaining column, for
    # every (fixed column, remaining column, transformation) triple.
    for included_col in always_include:
        for excluded_col in remaining_cols:
            for name, transform in transformations.items():
                try:
                    X_temp = X_train_gbr.copy()
                    X_temp[excluded_col] = transform(X_temp[excluded_col].astype(float))
                    
                    # Check for infinities or excessively large values
                    if np.any(np.isinf(X_temp[excluded_col])) or np.any(np.abs(X_temp[excluded_col]) > 1e10):
                        raise ValueError(f"Transformation {name} for {excluded_col} resulted in infinities or excessively large values.")
                    
                    current_combination = [col for col in always_include if col != included_col] + [excluded_col]
                    X_subset = X_temp[current_combination]
                    model.fit(X_subset, y_train)
                    score = evaluate_model(model, X_subset, y_train)
                    
                    if score > best_score:
                        best_score = score
                        best_combination = (included_col, excluded_col, name)
                        print(f'New best transformation and replacement: Replace {included_col} with {excluded_col} using {name} transformation, score: {best_score:.10f}')
                except Exception as e:
                    # Best effort: skip the failing candidate and continue.
                    print(f"Skipping transformation {name} for {excluded_col} due to error: {e}")
elif mode == "transform_add":
    # Add one transformed remaining column to the fixed set, for every
    # (remaining column, transformation) pair.
    for excluded_col in remaining_cols:
        for name, transform in transformations.items():
            try:
                X_temp = X_train_gbr.copy()
                X_temp[excluded_col] = transform(X_temp[excluded_col].astype(float))
                
                # Check for infinities or excessively large values
                if np.any(np.isinf(X_temp[excluded_col])) or np.any(np.abs(X_temp[excluded_col]) > 1e10):
                    raise ValueError(f"Transformation {name} for {excluded_col} resulted in infinities or excessively large values.")
                
                current_combination = list(always_include) + [excluded_col]
                X_subset = X_temp[current_combination]
                model.fit(X_subset, y_train)
                score = evaluate_model(model, X_subset, y_train)
                
                if score > best_score:
                    best_score = score
                    best_combination = (excluded_col, name)
                    print(f'New best transformation and addition: Add {excluded_col} using {name} transformation, score: {best_score:.10f}')
            except Exception as e:
                # Best effort: skip the failing candidate and continue.
                print(f"Skipping transformation {name} for {excluded_col} due to error: {e}")

# Summary of whichever search ran; best_combination is a column list in the
# add/remove/replace modes and a tuple in the transform* modes.
print(f'Best combination: {best_combination} with score: {best_score:.10f}')
