# Ryhmä-190

## Python-paketit

In [37]:
# Jupyter-notebookissa voi asentaa paketit samaan tapaan kuin komentoriviltä.
# Tähän käytetään "magic commandeja", jotka alkavat %-merkillä.
# Näiden asentamisessa voi mennä muutama minuutti, mutta sen jälkeen niitä ei tarvitse asentaa uudestaan. 
#%pip install seaborn
#%pip install mlflow azureml azureml-core azureml-mlflow azure-identity

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score
#from azureml.core import Workspace
#from azure.identity import InteractiveBrowserCredential
#import mlflow

## MLFlow-seuranta (ei tällä hetkellä käytössä)

Tarkistetaan ajoympäristö (Azure vai oma kone) ja autentikoidutaan sen mukaan.

In [39]:
def is_running_in_azure():
    return 'AZUREML_RUN_ID' in os.environ

def get_workspace():
    if is_running_in_azure():
        return Workspace.from_config()
    else:
        return Workspace(subscription_id='1c0e26b6-0fcb-4b6d-911c-2a0836275ea4',
                         resource_group='rg-AML',
                         workspace_name='aml-data_science_masters')
#ws = get_workspace()

In [40]:
#mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
#experiment_name = 'group-190-tracking'
#mlflow.set_experiment(experiment_name)
#mlflow.autolog()

In [None]:
def evaluate_model(model, X_train, y_train, random_state=42):
    ### Train loss
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)

    ### 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mse_scorer = make_scorer(mean_squared_error)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
    cv_loss_mean = cv_scores.mean()

    r2_train = r2_score(y_train, y_train_pred)
    r2_cv = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2').mean()

    print('Train loss:', train_loss)
    print('CV loss mean:', cv_loss_mean)
    print('Train R^2:', r2_train)
    print('CV R^2:', r2_cv)


## Tietoaineistojen lataaminen

In [87]:
df_train = pd.read_csv('../data/train.csv', encoding='utf-8', header=0)
df_test = pd.read_csv('../data/test.csv', encoding='utf-8', header=0)

## Esiprosessointi

In [92]:
df_train['parentspecies'] = df_train['parentspecies'].astype('category')
df_train['parentspecies'] = df_train['parentspecies'].cat.codes
df_test['parentspecies'] = df_test['parentspecies'].astype('category')
df_test['parentspecies'] = df_test['parentspecies'].cat.codes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26637 entries, 0 to 26636
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            26637 non-null  int64  
 1   log_pSat_Pa                   26637 non-null  float64
 2   MW                            26637 non-null  float64
 3   NumOfAtoms                    26637 non-null  int64  
 4   NumOfC                        26637 non-null  int64  
 5   NumOfO                        26637 non-null  int64  
 6   NumOfN                        26637 non-null  int64  
 7   NumHBondDonors                26637 non-null  int64  
 8   NumOfConf                     26637 non-null  float64
 9   NumOfConfUsed                 26637 non-null  float64
 10  parentspecies                 26637 non-null  int8   
 11  C=C (non-aromatic)            26637 non-null  int64  
 12  C=C-C=O in non-aromatic ring  26637 non-null  int64  
 13  h

# Feature engineering

Seuraavassa taustaa. Ajatuksena on luoda "meaningful features". Bayesian Data Analysis kurssilla puhuttiin Generalized Linear Regression yhteydessä, että on kannattavaa olla tarkkana datasetin kanssa. Esimerkissä käytettiin puun lehtien painon arviointia. Siinä yksittäiset mitat kuten puun korkeus leveys yms ei antanut kovin hyvää mallia. Näistä kuitenkin pystyttiin luomaan uusia predictoreita, kuten puun muoto, lehtiosan volyymi jne.

Koska me emme ole alan asiantuntijoita, tulkitsen tekoälyn [käyttösääntöjä](https://studies.helsinki.fi/kurssit/toteutus/hy-opt-cur-2425-b5ccfa1b-ac12-4a9a-bef8-b46c0e808555/DATA11002) kurssilla, että voidaan kysyä mitä näistä voidaan laskea. Joten näin tehtiin ja saatiin seuraavia kaavoja.

BDA kurssilla myös poistettiin tämän jälkeen alkuperäiset arvot sotkemasta!

In [125]:
def transform_gbr(orig_df):
    df = orig_df.copy()
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Add meaningful features
    df['AtomFraction'] = (df['NumOfC'] + df['NumOfO'] + df['NumOfN']) / df['NumOfAtoms']
    df['Polarity'] = df['NumHBondDonors'] / df['MW']
    df['HBondDensity'] = df['NumHBondDonors'] / df['NumOfAtoms']
    df['GroupDensity_CarboxylicAcid'] = df['carboxylic acid'] / df['MW']
    df['Unsaturation'] = df['C=C (non-aromatic)'] + df['C=C-C=O in non-aromatic ring']
    df['ConfigurationalComplexity'] = df['NumOfConf'] / df['MW']

    # df['C:O_ratio'] = df['NumOfC'] / df['NumOfO']
    # df['C:N_ratio'] = df['NumOfC'] / df['NumOfN']
    # df['Percent_C'] = (df['NumOfC'] / df['NumOfAtoms'])
    # df['Percent_O'] = (df['NumOfO'] / df['NumOfAtoms'])
    # df['Percent_N'] = (df['NumOfN'] / df['NumOfAtoms'])
    # df['HBD_fraction'] = df['NumHBondDonors'] / df['NumOfAtoms']
    # df['FractionOfConfsUsed'] = df['NumOfConfUsed'] / df['NumOfConf']
    # df['Has_Conjugated_System'] = np.where((df['C=C-C=O in non-aromatic ring'] > 0) | (df['C=C (non-aromatic)'] > 0), 1, 0)

    # X_train_gbr['NumOfConf'] = np.log(X_train_gbr['NumOfConf'])
    # X_train_gbr['NumOfConfUsed'] = (X_train_gbr['NumOfConfUsed'] == 40).astype(int)
    drop_cols = [
        # 'NumOfC',
        # 'NumOfO',
        # 'NumOfN',
        # 'NumOfAtoms',
        # 'NumHBondDonors',
        # 'carboxylic acid',
        # 'MW',
        # 'C=C-C=O in non-aromatic ring',
        # 'C=C (non-aromatic)',
        # 'NumOfConf',
        # 'NumOfConfUsed',
        'parentspecies',
    ]
    df = df.drop(columns=drop_cols, axis=1)

    return df

## Mallien kouluttaminen

In [123]:
X_train, y_train = df_train.drop(columns=['log_pSat_Pa', 'ID']), df_train['log_pSat_Pa']
X_test = df_test.drop(columns=['ID'])
X_train_gbr = transform_gbr(X_train)
X_test_gbr = transform_gbr(X_test)
X_train_gbr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26637 entries, 0 to 26636
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MW                            26637 non-null  float64
 1   NumOfAtoms                    26637 non-null  int64  
 2   NumOfC                        26637 non-null  int64  
 3   NumOfO                        26637 non-null  int64  
 4   NumOfN                        26637 non-null  int64  
 5   NumHBondDonors                26637 non-null  int64  
 6   NumOfConf                     26637 non-null  float64
 7   NumOfConfUsed                 26637 non-null  float64
 8   parentspecies                 26637 non-null  int8   
 9   C=C (non-aromatic)            26637 non-null  int64  
 10  C=C-C=O in non-aromatic ring  26637 non-null  int64  
 11  hydroxyl (alkyl)              26637 non-null  int64  
 12  aldehyde                      26637 non-null  int64  
 13  k

Erilaisia malleja alla. Kaikki mallit käyttävät samaa nimeä `model`, eli vain viimeisenä koulutettu menee testeihin ja tallentuu.

### 1. Dummy

In [None]:
model = DummyRegressor(strategy='mean')
model.fit(X_train, y_train)

df_test['log_pSat_Pa'] = model.predict(X_test)

### 2. Yksinkertainen regressio

In [116]:
model = LinearRegression()
model.fit(X_train_gbr, y_train)

evaluate_model(model, X_train_gbr, y_train)

Train loss: 3.3887127891789484
CV loss mean: 3.396174128182229
Train R^2: 0.6519123587891593
CV R^2: 0.6509291141100213


### 3. Random Forest

In [61]:
model = RandomForestRegressor(random_state=190)
model.fit(X_train_gbr, y_train)

evaluate_model(model, X_train_gbr, y_train)

Train loss: 0.3863467117691719
CV loss mean: 2.7737210268490045
Train R^2: 0.9603145725365886
CV R^2: 0.714920252540877


### 4. Gradient Boosting Regressor

In [126]:
gbr_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "learning_rate": 0.1,
    "loss": "squared_error",
    "random_state": 42,
    # "subsample": 0.6,
}
    
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train_gbr, y_train)

evaluate_model(model, X_train_gbr, y_train)



Train loss: 2.0323132250606983
CV loss mean: 2.506041208220943
Train R^2: 0.7912413471652237
CV R^2: 0.7424163443507199


# 5. SVR

In [80]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2, kernel='rbf'))
model.fit(X_train_gbr, y_train)

evaluate_model(model, X_train_gbr, y_train)

Train loss: 2.295314805640857
CV loss mean: 2.47441384480745
Train R^2: 0.7642258974902892
CV R^2: 0.7456636248982956


## Mallien arviointi

In [81]:
### Train loss
y_train_pred = model.predict(X_train)
train_loss = mean_squared_error(y_train, y_train_pred)

### 5-fold cross-validation
results = []
kf = KFold(n_splits=5, shuffle=True, random_state=190)
mse_scorer = make_scorer(mean_squared_error)
cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
cv_loss_mean = cv_scores.mean()

r2_train = r2_score(y_train, y_train_pred)
r2_cv = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2').mean()

print('Train loss:', train_loss)
print('CV loss mean:', cv_loss_mean)
print('Train R^2:', r2_train)
print('CV R^2:', r2_cv)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- AtomFraction
- ConfigurationalComplexity
- GroupDensity_CarboxylicAcid
- HBondDensity
- Polarity
- ...


## Ennustuksen tallentaminen

In [82]:
df_test['log_pSat_Pa'] = model.predict(X_test_gbr.drop(columns=['log_pSat_Pa'], axis=1))
df_test[['ID', 'log_pSat_Pa']].to_csv('../submission/submission.csv', index=False)