<a href="https://colab.research.google.com/github/leonardoub/SCRIPT_PALERMO/blob/master/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [2]:
# Reproducibility setup: pin every pseudo-random source once, before any model
# code runs. A single shared seed is used; a different seed per stage would
# work just as well.
seed_value = 0

# 1. Export `PYTHONHASHSEED`.
#    NOTE: Python reads this variable at interpreter start-up, so setting it
#    here only affects processes launched from this kernel; the hash
#    randomisation of the already-running kernel is not changed.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed NumPy's legacy global generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed TensorFlow. `tf.set_random_seed` only exists in TensorFlow 1.x;
#    TensorFlow 2.x exposes the same functionality as `tf.random.set_seed`.
import tensorflow as tf
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(seed_value)
else:
    tf.random.set_seed(seed_value)

# 5. Restrict TensorFlow to one intra-op and one inter-op thread.
from keras import backend as K
if hasattr(tf, 'ConfigProto'):
    # TensorFlow 1.x: build a single-threaded session and hand it to Keras.
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)
else:
    # TensorFlow 2.x: there is no global session; thread limits are set through
    # tf.config.threading. The two names are still defined (as None) so that
    # any later cell referring to them does not hit a NameError.
    tf.config.threading.set_intra_op_parallelism_threads(1)
    tf.config.threading.set_inter_op_parallelism_threads(1)
    session_conf = None
    sess = None

Using TensorFlow backend.


# Load data

In [3]:
# Mount Google Drive at /gdrive (requires the Colab runtime) and make it the
# working directory. The CSV paths used later are absolute paths under /gdrive.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
# Absolute locations of the two CSV files on the mounted Drive.
# train_dataset_path: data later split into train / held-out test sets.
# test_dataset_path: second cohort ("nostro" is Italian for "ours" — presumably
# the in-house dataset with NaN rows already removed; TODO confirm).
train_dataset_path = '/gdrive/My Drive/AIM_PA/database_training2.csv'
test_dataset_path = '/gdrive/My Drive/AIM_PA/database_nostro_without_nan.csv'

In [0]:
# Read both CSV files into DataFrames with pandas' default parsing options.
df_train, df_test = map(pd.read_csv, (train_dataset_path, test_dataset_path))

In [0]:
# Give the survival-time column an identifier-friendly name (no dots/spaces).
df_train = df_train.rename(columns={'Survival.time (months)': 'Surv_time_months'})

In [0]:
# Same survival-time rename for the second cohort, so both frames share column names.
df_test = df_test.rename(columns={'Survival.time (months)': 'Surv_time_months'})

In [0]:
# Replace the dot in the staging column name with an underscore in both frames.
df_train = df_train.rename(columns={'Overall.Stage': 'Overall_Stage'})
df_test = df_test.rename(columns={'Overall.Stage': 'Overall_Stage'})

In [0]:
# Feature matrix of the training file: everything except the label column
# ('Histology') and the four outcome/staging columns listed here.
public_data = df_train.drop(
    columns=['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage']
)

In [0]:
# Feature matrix of the second cohort, built by dropping the same five columns
# that are removed from the training file.
PA_data = df_test.drop(
    columns=['Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage']
)

In [0]:
# Classification target for the training file.
public_labels = df_train['Histology']

In [0]:
# Classification target for the second cohort.
PA_labels = df_test['Histology']

# Train Test Split

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Stratified hold-out split of the training file.
# An integer `test_size` is an absolute sample count, so exactly 13 rows are
# held out (the remaining 118 rows form the training set, as shown by
# X_train.shape). random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    public_data,
    public_labels,
    test_size=13,
    random_state=1,
    stratify=public_labels,
)

In [15]:
# Sanity check: (n_samples, n_features) of the training split.
X_train.shape

(118, 107)

# Encode the labels

In [0]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
# NOTE(review): the estimators in the cells shown here are fitted on
# y_train / y_test directly; the encoded arrays are not referenced there.
encoder = LabelEncoder().fit(y_train)
train_labels_encoded = encoder.transform(y_train)
test_labels_encoded = encoder.transform(y_test)

# Pipeline

In [0]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC

In [0]:
# Candidate scalers for the 'scaler' pipeline step, each with default settings.
# NOTE(review): QuantileTransformer keeps its default n_quantiles (1000 in the
# repr printed with best_params_), which exceeds the 118 training rows; this is
# presumably what triggers the repeated n_quantiles warnings in the fit log.
scalers_to_test = [
    scaler_cls() for scaler_cls in (StandardScaler, RobustScaler, QuantileTransformer)
]

In [0]:
# Default pipeline layout: scaling -> dimensionality reduction -> SVM.
# The step names are the prefixes used as keys in the grid-search parameters.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', PCA()),
    ('SVM', SVC()),
]

In [0]:
from sklearn.pipeline import Pipeline

# Chain the steps defined above into a single estimator.
pipeline = Pipeline(steps=steps)

# Parameters of grid search

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
# Candidate numbers of retained components / selected features: 1 through 10.
n_features_to_test = np.arange(10) + 1

In [0]:
# Three sub-grids explored by the grid search (keys are '<step>__<param>', or a
# bare step name to swap the whole step):
#   1. LDA with 2 components         -> 3 scalers * 5 C * 2 gamma      =  30 candidates
#   2. PCA with 1..10 components     -> 3 scalers * 10 * 5 C * 2 gamma = 300 candidates
#   3. 'red_dim' not overridden      -> 3 scalers * 5 C * 2 gamma      =  30 candidates
# In sub-grid 3 the pipeline's own 'red_dim' step (a default PCA()) is used.
# NOTE(review): 10e5 evaluates to 1e6 — confirm that is the intended upper bound for C.
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
        'SVM__C': [0.001, 0.1, 10, 100, 10e5],
        'SVM__gamma': [0.1, 0.01],
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
        'SVM__C': [0.001, 0.1, 10, 100, 10e5],
        'SVM__gamma': [0.1, 0.01],
    },
    {
        'scaler': scalers_to_test,
        'SVM__C': [0.001, 0.1, 10, 100, 10e5],
        'SVM__gamma': [0.1, 0.01],
    },
]

In [0]:
#{'scaler':scalers_to_test, 'red_dim':[PCA()], 'red_dim__n_components':n_features_to_test,'SVM__C':[0.001,0.1,10,100,10e5],'SVM__gamma':[0.1,0.01]}
#{'scaler':scalers_to_test, 'red_dim':[LinearDiscriminantAnalysis()], 'red_dim__n_components':[2],'SVM__C':[0.001,0.1,10,100,10e5],'SVM__gamma':[0.1,0.01]}
#{'scaler':scalers_to_test, 'red_dim':[SelectKBest()], 'red_dim__k':n_features_to_test,'SVM__C':[0.001,0.1,10,100,10e5],'SVM__gamma':[0.1,0.01]}

In [0]:
# Exhaustive search over the sub-grids with 5-fold cross-validation.
# No `scoring` is passed, so the pipeline's default score is used.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameteres,
    cv=5,
    verbose=1,
)

In [37]:
# Run the search on the training split only (360 candidates x 5 folds per the
# log below); the held-out rows in X_test are not used here.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('red_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('SVM',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                       

In [38]:
# Score of the search's best pipeline on the 13 held-out rows. No `scoring` was
# given to GridSearchCV, so this is the estimator's default score (accuracy for
# scikit-learn classifiers).
print(f'score = {grid.score(X_test, y_test)}')

score = 0.46153846153846156


In [39]:
# Parameter combination selected by the grid search.
print(grid.best_params_)

{'SVM__C': 10, 'SVM__gamma': 0.01, 'red_dim': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'red_dim__n_components': 5, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


# Model selection trial

In [48]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Re-created here, but the comparison loop below always uses StandardScaler.
scalers_to_test = [
    scaler_cls() for scaler_cls in (StandardScaler, RobustScaler, QuantileTransformer)
]

# Off-the-shelf classifiers to compare. Apart from the arguments shown, all
# hyper-parameters are library defaults and no random_state is set.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    SVC(C=0.025, kernel="rbf", probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Fit each classifier behind a StandardScaler on the training split and report
# its score on the held-out rows.
for classifier in classifiers:
    pipe = Pipeline([('scaler', StandardScaler()), ('classifier', classifier)])
    pipe.fit(X_train, y_train)

    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
model score: 0.385
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
model score: 0.462
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
model score: 0.308
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
     

## GradientBoostingClassifier optimization trial

In [0]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC

In [0]:
# Candidate scalers for the 'scaler' step of the gradient-boosting pipeline,
# each with default settings.
scalers_to_test = [
    scaler_cls() for scaler_cls in (StandardScaler, RobustScaler, QuantileTransformer)
]

In [0]:
# Pipeline layout for this section: scaling -> dimensionality reduction ->
# gradient boosting. GradientBoostingClassifier is imported in the
# model-selection cell, so that cell must have been run first.
steps = [
    ('scaler', StandardScaler()),
    ('red_dim', PCA()),
    ('classifier', GradientBoostingClassifier()),
]

In [0]:
from sklearn.pipeline import Pipeline

# Rebuild `pipeline` from the gradient-boosting steps (replaces the SVM pipeline).
pipeline = Pipeline(steps=steps)

In [0]:
# Sub-grids for the gradient-boosting pipeline; only the scaler and the
# reduction step are tuned, the classifier keeps its defaults.
#   1. LDA with 2 components       -> 3 scalers      =  3 candidates
#   2. PCA with 1..10 components   -> 3 scalers * 10 = 30 candidates
#   3. SelectKBest with k = 1..10  -> 3 scalers * 10 = 30 candidates
parameteres = [
    {
        'scaler': scalers_to_test,
        'red_dim': [LinearDiscriminantAnalysis()],
        'red_dim__n_components': [2],
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [PCA()],
        'red_dim__n_components': n_features_to_test,
    },
    {
        'scaler': scalers_to_test,
        'red_dim': [SelectKBest()],
        'red_dim__k': n_features_to_test,
    },
]

In [0]:
# 5-fold grid search over the gradient-boosting sub-grids. This rebinds `grid`,
# so the earlier SVM search object is no longer reachable under that name.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameteres,
    cv=5,
    verbose=1,
)

In [55]:
# Run the search on the training split only (63 candidates x 5 folds per the
# log below).
grid.fit(X_train, y_train)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('red_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classifier',
                                        GradientBoostingClassifier(ccp_alpha=0.0,
                                                                   criterion='friedman_mse',
                                            

In [56]:
# Score of the best gradient-boosting pipeline on the 13 held-out rows, using
# the estimator's default score since no `scoring` was given to GridSearchCV.
print(f'score = {grid.score(X_test, y_test)}')

score = 0.6153846153846154


In [58]:
# Scaler / reduction-step combination selected by the grid search.
print(grid.best_params_)

{'red_dim': SelectKBest(k=5, score_func=<function f_classif at 0x7f2a10225268>), 'red_dim__k': 5, 'scaler': QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
                    output_distribution='uniform', random_state=None,
                    subsample=100000)}
