# Ryhmä-190

## Python-paketit

In [17]:
# Jupyter-notebookissa voi asentaa paketit samaan tapaan kuin komentoriviltä.
# Tähän käytetään "magic commandeja", jotka alkavat %-merkillä.
# Näiden asentamisessa voi mennä muutama minuutti, mutta sen jälkeen niitä ei tarvitse asentaa uudestaan. 
#%pip install seaborn
#%pip install mlflow azureml azureml-core azureml-mlflow azure-identity

In [18]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.svm import SVR
#from azureml.core import Workspace
#from azure.identity import InteractiveBrowserCredential
#import mlflow

## MLflow-seuranta (ei tällä hetkellä käytössä)

Tarkistetaan ajoympäristö (Azure vai oma kone) ja autentikoidutaan sen mukaan.

In [19]:
def is_running_in_azure():
    """Report whether the AZUREML_RUN_ID environment variable is set."""
    run_id = os.environ.get('AZUREML_RUN_ID')
    return run_id is not None

def get_workspace():
    """Return the Azure ML workspace object for the current environment.

    When is_running_in_azure() is true the workspace comes from
    Workspace.from_config(); otherwise it is constructed from the fixed
    subscription id, resource group and workspace name below.

    Raises:
        ImportError: if the azureml-core package is not installed.
    """
    # Imported here because the notebook-level azureml import is commented out;
    # without a binding for Workspace this function would fail with NameError.
    from azureml.core import Workspace

    if is_running_in_azure():
        return Workspace.from_config()
    return Workspace(subscription_id='1c0e26b6-0fcb-4b6d-911c-2a0836275ea4',
                     resource_group='rg-AML',
                     workspace_name='aml-data_science_masters')
#ws = get_workspace()

In [20]:
#mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
#experiment_name = 'group-190-tracking'
#mlflow.set_experiment(experiment_name)
#mlflow.autolog()

## Tietoaineistojen lataaminen

In [21]:
# Read the training and test CSV files with identical parser settings.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8', header=0)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## Esiprosessointi

In [22]:
# Encode 'parentspecies' as integer codes using ONE category list shared by
# both frames. Encoding each frame on its own assigns codes from that frame's
# own set of values, so the same species could receive different codes in
# train and test whenever the two sets differ.
parentspecies_categories = (
    pd.concat([df_train['parentspecies'], df_test['parentspecies']], ignore_index=True)
    .astype('category')
    .cat.categories
)
parentspecies_dtype = pd.CategoricalDtype(categories=parentspecies_categories)

# Missing values are not categories; .cat.codes maps them to -1 in both frames.
df_train['parentspecies'] = df_train['parentspecies'].astype(parentspecies_dtype).cat.codes
df_test['parentspecies'] = df_test['parentspecies'].astype(parentspecies_dtype).cat.codes

## Mallien kouluttaminen

In [23]:
# Features are every training column except the row identifier and the target.
X_train, y_train = df_train.drop(columns=['log_pSat_Pa', 'ID']), df_train['log_pSat_Pa']

# The model cells write their predictions into df_test['log_pSat_Pa']. Drop that
# column when it is present so that re-running this cell after a model cell does
# not feed old predictions back in as a feature (which would also give X_test a
# different column set than X_train).
X_test = df_test.drop(columns=['ID']).drop(columns=['log_pSat_Pa'], errors='ignore')

Erilaisia malleja alla. Jokaisella mallilla on oma muuttujansa, mutta kaikki kirjoittavat ennusteensa samaan sarakkeeseen `df_test['log_pSat_Pa']`, eli vain viimeisenä ajetun mallin ennusteet päätyvät tallennettavaan tiedostoon.

### 1. Dummy

In [24]:
# Baseline: always predicts the mean of the training target.
dummy_model = DummyRegressor(strategy='mean').fit(X_train, y_train)

# Store the baseline predictions in the submission column.
df_test['log_pSat_Pa'] = dummy_model.predict(X_test)

### 2. Yksinkertainen regressio

In [25]:
# Ordinary least-squares regression on all features.
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Overwrite the submission column with this model's predictions.
df_test['log_pSat_Pa'] = linear_regression_model.predict(X_test)

### 3. Random Forest

In [26]:
# Random forest with a fixed seed so repeated runs give the same model.
random_forest_model = RandomForestRegressor(random_state=190).fit(X_train, y_train)

# Overwrite the submission column with this model's predictions.
df_test['log_pSat_Pa'] = random_forest_model.predict(X_test)

### 4. Gradient Boosting Regressor

In [27]:
# Gradient boosting with a fixed seed so repeated runs give the same model.
gradient_boost_model = GradientBoostingRegressor(random_state=190).fit(X_train, y_train)

# Overwrite the submission column with this model's predictions.
df_test['log_pSat_Pa'] = gradient_boost_model.predict(X_test)

## Mallien arviointi

In [None]:
def evaluate(model):
    """Score a fitted regressor on the training data and with 5-fold cross-validation.

    Reads the notebook-level X_train and y_train.

    Args:
        model: An already fitted regressor; predict() is called on it directly
            for the training-set metrics.

    Returns:
        dict with the keys "Model" (class name of the estimator), "Train Loss"
        (training MSE), "CV Loss Mean" (mean MSE over the folds), "Train R^2"
        and "CV R^2" (mean R^2 over the folds).
    """
    # Training-set metrics: scored on the same rows the model was fitted on.
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    # A single cross-validation pass computes both metrics, so each fold is
    # fitted once instead of once per metric.
    kf = KFold(n_splits=5, shuffle=True, random_state=190)
    cv_scores = cross_validate(
        model,
        X_train,
        y_train,
        cv=kf,
        scoring={'mse': make_scorer(mean_squared_error), 'r2': 'r2'},
    )

    return {
        "Model": model.__class__.__name__,
        "Train Loss": train_loss,
        "CV Loss Mean": cv_scores['test_mse'].mean(),
        "Train R^2": r2_train,
        "CV R^2": cv_scores['test_r2'].mean(),
    }

# Every fitted model, in the order the rows should appear in the summary table.
models = [dummy_model, linear_regression_model, random_forest_model, gradient_boost_model]

# One metrics dict per model, collected into a single comparison table.
results = [evaluate(candidate) for candidate in models]
results_df = pd.DataFrame(results)

print(results_df)



Train loss: 9.735228683762335
CV loss mean: 9.73650114690938
Train R^2: 0.0
CV R^2: -0.00036408103842227034
Train loss: 2.892565558900267
CV loss mean: 2.8984225451995593
Train R^2: 0.7028764651697541
CV R^2: 0.7021942588637746
Train loss: 0.38877270805996705
CV loss mean: 2.77370594268536
Train R^2: 0.9600653748680386
CV R^2: 0.7150132769504335
Train loss: 2.6116312914299806
CV loss mean: 2.6794213727758844
Train R^2: 0.7317339554862234
CV R^2: 0.7246891200107113
                       Model  Train Loss  CV Loss Mean  Train R^2    CV R^2
0             DummyRegressor    9.735229      9.736501   0.000000 -0.000364
1           LinearRegression    2.892566      2.898423   0.702876  0.702194
2      RandomForestRegressor    0.388773      2.773706   0.960065  0.715013
3  GradientBoostingRegressor    2.611631      2.679421   0.731734  0.724689


## Ennustuksen tallentaminen

In [None]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout without it.
os.makedirs('../submission', exist_ok=True)
# Only the row identifier and the most recently written predictions are saved.
df_test[['ID', 'log_pSat_Pa']].to_csv('../submission/submission.csv', index=False)
# If you want to save the model stats:
#csv_name = 'something.csv'
#path = '../docs/'
#results_df.to_csv(path+csv_name, index=False)