# TM10007 Assignment template

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [None]:
# import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Classifiers
from worclipo.load_data import load_data

### functions

In [None]:
# function to split the dataset into train and test
def split_set(X,y,test_size):

    if os.path.exists('./TEST_set.csv'):
        split_action = print('TEST_set.csv already exists')
    else:
        split_action = print('TEST_set.csv does not exist, generating new test and training sets')
        X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(X, y, test_size=test_size, random_state=10)

        TESTSET = X_test_csv.merge(y_test_csv, left_index=True, right_index=True)
        TESTSET.to_csv('TEST_set.csv')

        TRAINSET = X_train_csv.merge(y_train_csv, left_index=True, right_index=True)
        TRAINSET.to_csv('TRAIN_set.csv')
        return split_action
    

# setting up the data to be processed

In [None]:
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
print(type(data))

# change lipoma = 1 and liposarcoma = 0 and encode labels
group_names = list(set(data.label))
data.loc[data['label'] == 'lipoma', 'label'] = 1
data.loc[data['label'] == 'liposarcoma', 'label'] = 0
data['label'] = pd.cut(data['label'], bins = 2, labels=group_names)
print(data['label'].unique())
label_diag = LabelEncoder()
data['label'] = label_diag.fit_transform(data['label'])

# assign X to measurements and y to outcome (lipoma/sarcoma)
X = data.drop('label', axis=1)
y = data['label']
test_size = 0.3

In [None]:
# code that splits the data into test and validation sets if this is not done already
split_set(X,y,test_size)

## import the training set

In [None]:
TRAIN = pd.read_csv('TRAIN_set.csv', index_col=0)
X_train = TRAIN.drop('label', axis=1)
y_train = TRAIN['label']

TRAIN['label'] = pd.cut(TRAIN['label'], bins = 2, labels=group_names)
print(TRAIN['label'].unique())
label_diag = LabelEncoder()
TRAIN['label'] = label_diag.fit_transform(TRAIN['label'])

# split into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=10)

from sklearn.pipeline import Pipeline

print(f'Size before preprocess: ', X_train.shape)

pipe_preprocess = Pipeline([
    ("check nan", SimpleImputer(missing_values=np.nan, strategy='mean')),
    ("scale", StandardScaler()),
    ("variance", VarianceThreshold(threshold=0)),
])

X_train = pipe_preprocess.fit_transform(X_train)
X_valid = pipe_preprocess.transform(X_valid)
print(f'Size after preprocess: ', X_train.shape)

# Classify

In [32]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *


# setup classifiers
clf1 = RandomForestClassifier(random_state=1)
clf2 = SVC(probability=True, random_state=1)
clf3 = LogisticRegression(random_state=1)
clf4 = DecisionTreeClassifier(random_state=1)
clf5 = KNeighborsClassifier()
clf6 = MultinomialNB()
clf7 = GradientBoostingClassifier(random_state=1)

# Initiaze the hyperparameters for each dictionary
param1 = {}
param1['classifier__n_estimators'] = [1,10, 50, 100, 250]
param1['classifier__max_depth'] = [5, 10, 20]
param1['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param1['classifier'] = [clf1]

param2 = {}
param2['classifier__C'] = [1,10]
param2['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:10}, {0:1,1:25}]
param2['classifier__kernel'] = ['linear','rbf','polynomial','sigmoid']
param2['classifier'] = [clf2]


param3 = {}
param3['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2]
param3['classifier__penalty'] = [None, 'l2']
param3['classifier__class_weight'] = [None, {0:1,1:5}, {0:1,1:6}, {0:1,1:7}]
param3['classifier'] = [clf3]

param4 = {}
param4['classifier__max_depth'] = [5,10,25,None]
param4['classifier__min_samples_split'] = [2,5,10]
param4['classifier__class_weight'] = [{0:1,1:2}, {0:1,1:3}, {0:1,1:4}, {0:1,1:5}]
param4['classifier'] = [clf4]

param5 = {}
param5['classifier__n_neighbors'] = [2,5,10,25,50]
param5['classifier'] = [clf5]

param6 = {}
param6['classifier__alpha'] = [10**0, 10**1, 10**2]
param6['classifier'] = [clf6]

param7 = {}
param7['classifier__n_estimators'] = [10, 50, 100, 250]
param7['classifier__max_depth'] = [5, 10, 20]
param7['classifier'] = [clf7]

pipeline = Pipeline([('classifier', clf1)])
params = [param1, param2, param3, param4, param5, param6, param7]

gs = GridSearchCV(pipeline, params, cv=20, n_jobs=-1, scoring='roc_auc').fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)


# Test data performance
print("Test Precision:",precision_score(gs.predict(X_valid), y_valid))
print("Test Recall:",recall_score(gs.predict(X_valid), y_valid))
print("Test ROC AUC Score:",roc_auc_score(gs.predict(X_valid), y_valid))


0.875
Pipeline(steps=[('classifier',
                 LogisticRegression(C=0.01, penalty=None, random_state=1))])
{'classifier': LogisticRegression(C=0.01, penalty=None, random_state=1), 'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__penalty': None}
Test Precision: 0.46153846153846156
Test Recall: 0.6
Test ROC AUC Score: 0.55


220 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\svm\_base.py", line 180, in fit
    self._validate_params()
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\base.py", line 570, in _validate_params
    validate_parameter_constraints(
  Fi

In [24]:
print([v for v in np.linspace(1,100,100)])

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0]


In [33]:


from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *

gsb = GridSearchCV(
    estimator=SVC(probability=True,random_state=1,max_iter=1000),
    param_grid={'kernel':['linear','rbf','polynomial','sigmoid'],'C':[v for v in np.linspace(0.01,100,1000)]},
    cv = 10,
    n_jobs=-1,
).fit(X_train,y_train)

print(gsb.best_score_)
print(gsb.best_estimator_)

best_clf = LogisticRegression(C=0.01, penalty=None, random_state=1)
best_clf.fit(X_train,y_train)
pred_rfc = best_clf.predict(X_valid)

print('rfc', classification_report(y_valid, pred_rfc))
print(confusion_matrix(y_valid, pred_rfc))


results = pd.DataFrame(gsb.cv_results_)
results = results.sort_values(by=['rank_test_score'])
results.to_csv('results')


0.7866666666666666
SVC(C=0.11009009009009008, kernel='linear', max_iter=1000, probability=True,
    random_state=1)
rfc               precision    recall  f1-score   support

           0       0.50      0.64      0.56        11
           1       0.60      0.46      0.52        13

    accuracy                           0.54        24
   macro avg       0.55      0.55      0.54        24
weighted avg       0.55      0.54      0.54        24

[[7 4]
 [7 6]]


10000 fits failed out of a total of 40000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
771 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\svm\_base.py", line 180, in fit
    self._validate_params()
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\base.py", line 570, in _validate_params
    validate_parameter_constraints(
  File "c:\ProgramData\miniconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidPar