<a href="https://colab.research.google.com/github/leonardoub/SCRIPT_PALERMO/blob/master/pipeline_2_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

In [0]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
# Global reproducibility setup: one seed value is pushed into every random
# number generator used by this notebook (Python, NumPy, TensorFlow).
# The same value is used for every stage here; different seeds per stage
# would also work.
seed_value= 0

# 1. Export `PYTHONHASHSEED` with the fixed seed.
# NOTE(review): Python reads this variable at interpreter start-up, so setting
# it here presumably only affects child processes, not the running kernel --
# confirm whether hash randomisation actually matters for this analysis.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in `random` module.
import random
random.seed(seed_value)

# 3. Seed NumPy's legacy global generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed TensorFlow's graph-level generator.
# NOTE(review): `tf.set_random_seed`, `tf.ConfigProto`, `tf.Session` and
# `tf.get_default_graph` are TensorFlow 1.x top-level names -- verify the
# installed TensorFlow version before re-running this cell.
import tensorflow as tf
tf.set_random_seed(seed_value)

# 5. Build a session limited to one intra-op and one inter-op thread and
#    register it as the session used by the Keras backend.
from keras import backend as K
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Using TensorFlow backend.


# Load data

In [4]:
# Mount Google Drive at /gdrive (Colab-only helper) and make it the working
# directory, so the CSV paths used by the following cells resolve.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
# Both CSV files live in the same Drive folder; keep that folder in one place
# so only a single line changes if the data is moved.
DATA_DIR = '/gdrive/My Drive/AIM_PA'

train_dataset_path = f'{DATA_DIR}/database_training2.csv'
test_dataset_path = f'{DATA_DIR}/database_nostro_without_nan.csv'

In [0]:
# Read the two CSV files (training path first, test path second) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (train_dataset_path, test_dataset_path)
)

In [0]:
# Give the survival-time column an identifier-friendly name. Reassigning the
# result (instead of `inplace=True`) avoids mutating a frame that earlier
# cells may already have displayed.
df_train = df_train.rename(columns={'Survival.time (months)': 'Surv_time_months'})

In [0]:
# Same survival-time rename for the second dataset; the renamed frame is
# rebound to `df_test` rather than modified in place.
df_test = df_test.rename(columns={'Survival.time (months)': 'Surv_time_months'})

In [0]:
# Replace the dot in 'Overall.Stage' with an underscore in both datasets.
# The frames are rebound to the renamed copies instead of using `inplace=True`.
df_train = df_train.rename(columns={'Overall.Stage': 'Overall_Stage'})
df_test = df_test.rename(columns={'Overall.Stage': 'Overall_Stage'})

In [0]:
# Feature matrix for the training dataset: everything except these five
# columns, which include the regression target 'Surv_time_months'.
public_data = df_train.drop(
    columns=['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'],
)

In [0]:
# Feature matrix for the second dataset, built by dropping the same five
# columns as for the training features.
PA_data = df_test.drop(
    columns=['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'],
)

In [0]:
# Regression target for the training dataset: the survival-time column.
public_labels = df_train['Surv_time_months']

In [0]:
# Regression target for the second dataset: the survival-time column.
PA_labels = df_test['Surv_time_months']

# Train Test Split

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 30% of the training dataset for the final evaluation; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    public_data,
    public_labels,
    test_size=0.3,
    random_state=1,
)

# Scalers

In [0]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer

In [0]:
# Candidate scalers offered to the hyper-parameter searches below: one fresh
# instance of each class, in this order.
scalers_to_test = [scaler_cls() for scaler_cls in (StandardScaler, RobustScaler)]

# LinearRegression

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Three-stage pipeline: scale -> reduce dimensionality -> regress.
# The 'scaler' and 'red_dim' estimators given here are only defaults; the
# parameter grid below replaces both steps for every candidate.
# `normalize=False` used to be passed explicitly; False was the default and the
# argument was removed in scikit-learn 1.2, so it is omitted to keep the cell
# runnable on both old and current releases.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', LinearDiscriminantAnalysis()),
    ('reg', LinearRegression()),
]
pipeline = Pipeline(steps)

# Numbers of PCA components to try (1..10).
n_features_to_test = np.arange(1, 11)

# Two sub-grids, one per dimensionality-reduction method.
# NOTE(review): LinearDiscriminantAnalysis expects class labels as `y`, while
# this notebook fits on a survival-time regression target -- confirm that the
# LDA branch is meaningful (or even fits) for this data.
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
        'reg__fit_intercept': [True, False],
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
        'reg__fit_intercept': [True, False],
    },
]


In [0]:
# Exhaustive 5-fold search over the parameter grid. scikit-learn only exposes
# the negated mean absolute error as a scorer, so scores are <= 0 and the best
# candidate is the one closest to zero.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameteres,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)

In [40]:
# Run the search on the training split; as the last expression of the cell,
# the fitted search object is also what the cell displays.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 44 candidates, totalling 220 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 220 out of 220 | elapsed:    4.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('red_dim',
                                        LinearDiscriminantAnalysis(n_components=None,
                                                                   priors=None,
                                                                   shrinkage=None,
                                                                   solver='svd',
                                                                   store_covariance=False,
                                                                   tol=0.0001)),
                                       ('reg',
                                        LinearRegression(cop

In [30]:
# Score of the search's best estimator on the held-out split (computed with
# the scorer the search was configured with), followed by the winning settings.
holdout_score = grid.score(X_test, y_test)
print(f'score = {holdout_score}')
print(grid.best_params_)

score = -12.701529716293098
{'red_dim': PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'red_dim__n_components': 2, 'reg__fit_intercept': True, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


# RandomForestRegressor


In [43]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Same scale -> reduce -> regress layout as before, with a seeded random
# forest as the final step. The 'scaler' and 'red_dim' defaults are replaced
# by the parameter grid for every candidate.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', LinearDiscriminantAnalysis()),
    ('reg', RandomForestRegressor(random_state=0)),
]
pipeline = Pipeline(steps)

# Numbers of PCA components (1..10) and forest sizes (10, 20, ..., 110) to try.
n_features_to_test = np.arange(1, 11)
n_tree = np.arange(10, 120, 10)

# scikit-learn 1.0 renamed the split criteria 'mse'/'mae' to
# 'squared_error'/'absolute_error' and 1.2 removed the old spellings, so pick
# whichever pair the installed release understands.
if int(sklearn.__version__.split('.')[0]) >= 1:
    rf_criteria = ['squared_error', 'absolute_error']
else:
    rf_criteria = ['mse', 'mae']

# One sub-grid per dimensionality-reduction method.
# NOTE(review): LinearDiscriminantAnalysis expects class labels as `y`; confirm
# the LDA branch is meaningful for this regression target.
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
        'reg__n_estimators': n_tree,
        'reg__criterion': rf_criteria,
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
        'reg__n_estimators': n_tree,
        'reg__criterion': rf_criteria,
    },
]

# Exhaustive 5-fold search scored with the negated mean absolute error
# (scikit-learn only offers the negated form, so higher is better).
grid_GS = GridSearchCV(
    pipeline,
    param_grid=parameteres,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)
grid_GS.fit(X_train, y_train)


Fitting 5 folds for each of 484 candidates, totalling 2420 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2420 out of 2420 | elapsed:  5.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('red_dim',
                                        LinearDiscriminantAnalysis(n_components=None,
                                                                   priors=None,
                                                                   shrinkage=None,
                                                                   solver='svd',
                                                                   store_covariance=False,
                                                                   tol=0.0001)),
                                       ('reg',
                                        RandomForestRegresso

In [44]:
# Held-out score of the best pipeline found by the exhaustive search, then
# the hyper-parameters that produced it.
gs_holdout_score = grid_GS.score(X_test, y_test)
print(f'score = {gs_holdout_score}')
print(grid_GS.best_params_)

score = -11.992500000000001
{'red_dim': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'red_dim__n_components': 5, 'reg__criterion': 'mae', 'reg__n_estimators': 10, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [47]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized counterpart of the exhaustive search: 30 candidates are sampled
# from the currently defined `parameteres` and evaluated with 5-fold CV on the
# currently defined `pipeline`.
# `random_state` is tied to the notebook-wide `seed_value` so the sampled
# candidates no longer depend on how much of the global NumPy random stream
# earlier cells have consumed.
grid_RS = RandomizedSearchCV(
    pipeline,
    param_distributions=parameteres,
    scoring='neg_mean_absolute_error',
    n_iter=30,
    cv=5,
    verbose=1,
    random_state=seed_value,
)
grid_RS.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   21.1s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('red_dim',
                                              LinearDiscriminantAnalysis(n_components=None,
                                                                         priors=None,
                                                                         shrinkage=None,
                                                                         solver='svd',
                                                                         store_covariance=False,
                                                                         tol=0.0001)),
                             

In [48]:
# Held-out score of the best pipeline found by the randomized search, then
# the hyper-parameters that produced it.
rs_holdout_score = grid_RS.score(X_test, y_test)
print(f'score = {rs_holdout_score}')
print(grid_RS.best_params_)

score = -12.614375
{'scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'reg__n_estimators': 40, 'reg__criterion': 'mae', 'red_dim__n_components': 5, 'red_dim': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)}


# SVR

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# scale -> reduce -> regress pipeline with a support-vector regressor last.
# The 'scaler' and 'red_dim' defaults are replaced by the grid per candidate.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', LinearDiscriminantAnalysis()),
    ('reg', SVR()),
]
pipeline = Pipeline(steps)

# Numbers of PCA components to try (1..10).
n_features_to_test = np.arange(1, 11)

# Two sub-grids: the LDA branch tries the linear/rbf/sigmoid kernels, while
# the PCA branch tries only the polynomial kernel at degrees 1-4.
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
        'reg__C': [0.001, 0.1, 10],
        'reg__kernel': ['linear', 'rbf', 'sigmoid'],
        'reg__gamma': [0.1, 0.01],
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
        'reg__C': [0.001, 0.1, 10],
        'reg__kernel': ['poly'],
        'reg__degree': [1, 2, 3, 4],
        'reg__gamma': [0.1, 0.01],
    },
]

# Exhaustive 5-fold search; the scorer is the negated mean absolute error
# because scikit-learn only provides the "higher is better" form.
grid_GS = GridSearchCV(
    pipeline,
    param_grid=parameteres,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
)
grid_GS.fit(X_train, y_train)


Fitting 5 folds for each of 516 candidates, totalling 2580 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [0]:
# Evaluate the exhaustive search's best pipeline on the held-out split and
# show the hyper-parameters it selected.
svr_gs_score = grid_GS.score(X_test, y_test)
print(f'score = {svr_gs_score}')
print(grid_GS.best_params_)

In [0]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# scale -> reduce -> regress pipeline with a support-vector regressor last.
# The 'scaler' and 'red_dim' defaults are replaced by the sampled parameters.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', LinearDiscriminantAnalysis()),
    ('reg', SVR()),
]
pipeline = Pipeline(steps)

# Numbers of PCA components to try (1..10).
n_features_to_test = np.arange(1, 11)

# Two sub-spaces: the LDA branch tries the linear/rbf/sigmoid kernels, while
# the PCA branch tries only the polynomial kernel at degrees 1-4.
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
        'reg__C': [0.001, 0.1, 10],
        'reg__kernel': ['linear', 'rbf', 'sigmoid'],
        'reg__gamma': [0.1, 0.01],
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
        'reg__C': [0.001, 0.1, 10],
        'reg__kernel': ['poly'],
        'reg__degree': [1, 2, 3, 4],
        'reg__gamma': [0.1, 0.01],
    },
]

# Sample 30 candidates and evaluate each with 5-fold CV on the negated MAE.
# `random_state` is tied to the notebook-wide `seed_value` so the same
# candidates are drawn regardless of what earlier cells did to the global
# NumPy random stream.
grid_RS = RandomizedSearchCV(
    pipeline,
    param_distributions=parameteres,
    scoring='neg_mean_absolute_error',
    n_iter=30,
    cv=5,
    verbose=1,
    random_state=seed_value,
)
grid_RS.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [0]:
# Evaluate the randomized search's best pipeline on the held-out split and
# show the hyper-parameters it selected.
svr_rs_score = grid_RS.score(X_test, y_test)
print(f'score = {svr_rs_score}')
print(grid_RS.best_params_)