### CapstoneTwo : Part 3 - Training Data

### Cancer Patient Data
https://www.kaggle.com/rishidamarla/cancer-patients-data?select=cancer+patient+data+sets.xlsx

In [1]:
# import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import data from the csv file

data = pd.read_csv('cancer_patient_data sets.csv')
data.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [3]:
top_lifestyle = ['Age', 'Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease', \
         'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Snoring', 'Level']

In [4]:

df = data[top_lifestyle]
df.rename(columns={"chronic Lung Disease": "Chronic Lung Disease", "OccuPational Hazards" : "Occupational Hazards"}, inplace = True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Age,Air Pollution,Alcohol use,Dust Allergy,Occupational Hazards,Genetic Risk,Chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Snoring,Level
0,33,2,4,5,4,3,2,2,4,3,2,4,Low
1,17,3,1,5,3,4,2,2,2,2,4,2,Medium
2,35,4,5,6,5,5,4,6,7,2,3,2,High
3,37,7,7,7,7,6,7,7,7,7,7,5,High
4,46,6,8,7,7,7,6,7,7,8,7,3,High


In [5]:
#imported modules for StandardScaler and train_test_split from sklearn.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
X = df.drop(['Level'], axis = 1)
y = df['Level']

In [7]:
# split data into 80% training and 20% testing 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [8]:
# apply StandardScaler

sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [9]:
# factorizing "Level" 

y_train_fact = pd.factorize(y_train)[0]
y_test_fact = pd.factorize(y_test)[0]


In [10]:
# check how "Level" is factorized/mapped

print("BEFORE - factorizing:")
print(y_train.value_counts())

print("\nAFTER - factorizing:")
np.unique(y_train_fact, return_counts=True)



BEFORE - factorizing:
High      290
Medium    274
Low       236
Name: Level, dtype: int64

AFTER - factorizing:


(array([0, 1, 2], dtype=int64), array([290, 236, 274], dtype=int64))

#### "Level" factorized/mapped as
    0 : High
    1 : Low
    2 : Medium

In [11]:
print(X_train_sc)

[[ 1.28546344  0.05346758  0.13585028 ... -0.77601748 -0.51031997
  -0.63782251]
 [ 1.2033842  -0.43818832 -1.00815209 ...  1.21888607 -0.94417329
   0.04342984]
 [-1.58730983  1.03677939  1.27985265 ...  1.61786678  1.22509332
   0.04342984]
 ...
 [-1.25899289  1.03677939  1.27985265 ... -1.17499819 -0.94417329
  -0.63782251]
 [-0.02780434  1.52843529  0.89851853 ...  1.21888607  1.22509332
   1.40593453]
 [-0.10988358  1.03677939  0.89851853 ...  1.21888607  1.22509332
  -0.63782251]]


### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn import metrics

from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score, classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC

In [13]:
rf_model = RandomForestClassifier(n_jobs = 2, random_state = 0)

# fit your model
#rf_model.fit(X_train, y_train_fact)

In [14]:
n_estimators = [10, 100, 500, 1000]
max_features = ['sqrt', 'log2']
# max_depth = [5, 8, 15, 25, 30]
# min_samples_split = [2, 5, 10, 15, 100]
# min_samples_leaf = [1, 2, 5, 10] 

# define grid search
param_rf = dict(n_estimators = n_estimators, max_features = max_features)

grid_rf = GridSearchCV(rf_model, param_grid = param_rf, cv = 3, n_jobs = -1)
best_rf = grid_rf.fit(X_train, y_train_fact)

In [15]:
grid_rf.best_params_

{'max_features': 'sqrt', 'n_estimators': 10}

In [16]:
# fitting and predicting with hyperparameters that were optimized

model_rf2 = RandomForestClassifier(n_jobs = 2, random_state = 0, n_estimators = 10, max_features = 'sqrt')
model_rf2.fit(X_train, y_train_fact)

RandomForestClassifier(max_features='sqrt', n_estimators=10, n_jobs=2,
                       random_state=0)

In [17]:
y_rf = model_rf2.predict(X_test)

In [18]:
a_score = accuracy_score(y_rf, y_test_fact)
f_score = f1_score(y_rf, y_test_fact, average='macro')
rf_mae = mean_absolute_error(y_rf, y_test_fact)

print("RandomForest Classifier")
print("  Accuracy Score:", a_score)
print("  F1 Score:", f_score)
print("  mae:", rf_mae)

RandomForest Classifier
  Accuracy Score: 0.375
  F1 Score: 0.3333333333333333
  mae: 0.625


### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=0)
#lr_model.fit(X_train, y_train)

In [20]:

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
param_lr = dict(solver = solvers, penalty = penalty, C = c_values)

In [21]:

grid_lr = GridSearchCV(lr_model, param_grid = param_lr, n_jobs = -1, cv = 3)

best_lr = grid_lr.fit(X_train_sc, y_train_fact)

In [22]:
grid_lr.best_params_

{'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}

In [23]:
# fitting and predicting with hyperparameters that were optimized

lr_model2 = LogisticRegression(random_state=0, C = 100, penalty = 'l2', solver = 'newton-cg')
lr_model2.fit(X_train_sc, y_train_fact)
y_lr = lr_model2.predict(X_test_sc)

In [24]:
a_score = accuracy_score(y_lr, y_test_fact)
f_score = f1_score(y_lr, y_test_fact, average='macro')
lr_mae = mean_absolute_error(y_lr, y_test_fact)

print("Logistic Regression")
print("  Accuracy Score:", a_score)
print("  F1 Score:", f_score)
print("  mae:", lr_mae)

Logistic Regression
  Accuracy Score: 0.375
  F1 Score: 0.3333333333333333
  mae: 0.625


### GradientBoostingClassifier

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

In [26]:
gb_model = GradientBoostingClassifier(random_state=0)


In [27]:
estimators = [5,50,250,500]
depth = [1,3,5,7,9]
learning = [0.01,0.1,1,10,100]

param_gb = dict(n_estimators = estimators, max_depth = depth, learning_rate = learning)

In [28]:
grid_gb = GridSearchCV(gb_model, param_grid = param_gb, cv = 3, verbose = 1, 
                      n_jobs = -1)
best_gb = grid_gb.fit(X_train, y_train_fact)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   59.9s finished


In [29]:
grid_gb.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}

In [30]:
# fitting and predicting with hyperparameters that were optimized

gb_model2 = GradientBoostingClassifier(random_state=0, learning_rate = 0.01, max_depth = 3, n_estimators = 250)
gb_model2.fit(X_train, y_train_fact)
y_gb = gb_model2.predict(X_test_sc)

In [31]:
a_score = accuracy_score(y_gb, y_test_fact)
f_score = f1_score(y_gb, y_test_fact, average='macro')
gb_mae = mean_absolute_error(y_gb, y_test_fact)

print("GradientBoosting Classifier")
print("  Accuracy Score:", a_score)
print("  F1 Score:", f_score)
print("  mae:", gb_mae)

GradientBoosting Classifier
  Accuracy Score: 0.29
  F1 Score: 0.1498708010335917
  mae: 0.71
