### CapstoneTwo : Part 3 - Training Data

### Cancer Patient Data
https://www.kaggle.com/rishidamarla/cancer-patients-data?select=cancer+patient+data+sets.xlsx

In [1]:
# import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import data from the csv file

data = pd.read_csv('cancer_patient_data sets.csv')
data.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


In [3]:
# break them into Age Groups

data['AgeGroup'] = pd.cut(data.Age,[0, 19, 29, 39, 49, 59, 80], labels=['10s', '20s', '30s', '40s', '50s', '60s+'])
data.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level,AgeGroup
0,33,1,2,4,5,4,3,2,2,4,...,4,2,2,3,1,2,3,4,Low,30s
1,17,1,3,1,5,3,4,2,2,2,...,3,7,8,6,2,1,7,2,Medium,10s
2,35,1,4,5,6,5,5,4,6,7,...,7,9,2,1,4,6,7,2,High,30s
3,37,1,7,7,7,7,6,7,7,7,...,2,3,1,4,5,6,7,5,High,30s
4,46,1,6,8,7,7,7,6,7,7,...,2,4,1,4,2,4,2,3,High,40s


In [4]:
# checking the age group breakdown

data['AgeGroup'].value_counts()

30s     358
20s     234
40s     207
60s+     71
10s      67
50s      63
Name: AgeGroup, dtype: int64

In [5]:
top_lifestyle = ['Age', 'Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease', \
         'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Snoring', 'Level']

In [6]:

df = data[top_lifestyle]
df.rename(columns={"chronic Lung Disease": "Chronic Lung Disease", "OccuPational Hazards" : "Occupational Hazards"}, inplace = True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Age,Air Pollution,Alcohol use,Dust Allergy,Occupational Hazards,Genetic Risk,Chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Snoring,Level
0,33,2,4,5,4,3,2,2,4,3,2,4,Low
1,17,3,1,5,3,4,2,2,2,2,4,2,Medium
2,35,4,5,6,5,5,4,6,7,2,3,2,High
3,37,7,7,7,7,6,7,7,7,7,7,5,High
4,46,6,8,7,7,7,6,7,7,8,7,3,High


In [7]:
#imported modules for StandardScaler and train_test_split from sklearn.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [8]:
X = df.drop(['Level'], axis = 1)
y = df['Level']

In [9]:
# split data into 80% training and 20% testing 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [10]:
# apply StandardScaler

sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [11]:
# factorizing "Level" 

y_train_fact = pd.factorize(y_train)[0]
y_train_fact


array([0, 1, 0, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       2, 1, 2, 2, 1, 2, 0, 0, 2, 2, 2, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 1, 1, 1, 0, 2, 1, 1,
       0, 2, 2, 2, 1, 2, 1, 1, 0, 0, 0, 2, 0, 1, 0, 0, 2, 2, 0, 1, 2, 0,
       1, 0, 2, 0, 2, 2, 1, 1, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 0, 2, 1,
       1, 1, 2, 0, 0, 1, 0, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 0, 2, 2,
       2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 1, 2, 0, 0, 2, 0, 1, 0, 2, 2, 2, 2,
       2, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 1, 2, 0, 0, 1,
       2, 0, 0, 2, 2, 1, 0, 0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 2, 2, 2, 1, 0,
       1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 1, 2, 2,
       2, 2, 0, 2, 2, 0, 1, 1, 1, 1, 0, 1, 2, 0, 2, 1, 1, 1, 1, 2, 0, 2,
       1, 2, 0, 2, 0, 1, 2, 2, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0,
       0, 1, 1, 0, 2, 1, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 2, 0, 1, 0,
       0, 0, 2, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 2,

In [12]:
# check how "Level" is factorized/mapped

print("BEFORE - factorizing:")
print(y_train.value_counts())

print("\nAFTER - factorizing:")
np.unique(y_train_fact, return_counts=True)



BEFORE - factorizing:
High      290
Medium    274
Low       236
Name: Level, dtype: int64

AFTER - factorizing:


(array([0, 1, 2], dtype=int64), array([290, 236, 274], dtype=int64))

#### "Level" factorized/mapped as
    0 : High
    1 : Low
    2 : Medium

In [13]:
print(X_train_sc)

[[ 1.28546344  0.05346758  0.13585028 ... -0.77601748 -0.51031997
  -0.63782251]
 [ 1.2033842  -0.43818832 -1.00815209 ...  1.21888607 -0.94417329
   0.04342984]
 [-1.58730983  1.03677939  1.27985265 ...  1.61786678  1.22509332
   0.04342984]
 ...
 [-1.25899289  1.03677939  1.27985265 ... -1.17499819 -0.94417329
  -0.63782251]
 [-0.02780434  1.52843529  0.89851853 ...  1.21888607  1.22509332
   1.40593453]
 [-0.10988358  1.03677939  0.89851853 ...  1.21888607  1.22509332
  -0.63782251]]


In [14]:
print(y_train_fact)

[0 1 0 1 1 2 1 2 2 1 2 2 1 1 1 0 1 1 1 0 0 1 2 1 2 2 1 2 0 0 2 2 2 0 1 0 1
 2 2 2 0 1 0 0 0 0 0 0 1 2 0 2 2 2 2 2 0 0 2 1 1 1 0 2 1 1 0 2 2 2 1 2 1 1
 0 0 0 2 0 1 0 0 2 2 0 1 2 0 1 0 2 0 2 2 1 1 0 2 1 0 0 0 2 2 1 1 1 0 2 1 1
 1 2 0 0 1 0 2 2 1 1 1 1 0 2 2 1 1 1 0 2 2 2 0 1 0 0 0 2 2 2 1 1 2 0 0 2 0
 1 0 2 2 2 2 2 2 0 0 2 0 0 0 2 2 0 0 2 2 0 0 2 1 2 0 0 1 2 0 0 2 2 1 0 0 0
 2 1 0 0 1 1 0 0 2 2 2 1 0 1 0 0 0 0 1 2 1 0 1 0 1 2 1 0 0 0 0 1 1 2 2 2 2
 0 2 2 0 1 1 1 1 0 1 2 0 2 1 1 1 1 2 0 2 1 2 0 2 0 1 2 2 1 0 1 0 0 1 0 0 0
 0 0 2 1 0 0 1 1 0 2 1 2 2 0 2 0 2 0 2 1 2 0 2 2 0 1 0 0 0 2 1 1 0 1 0 0 0
 1 0 0 1 2 2 2 2 2 2 1 2 2 0 0 0 1 0 2 0 0 2 0 0 1 2 0 1 1 1 2 0 2 0 1 2 2
 0 0 1 0 0 2 1 2 0 2 1 2 0 2 0 2 2 1 0 0 0 0 0 1 1 0 0 1 1 2 1 2 1 2 0 2 0
 2 2 1 2 0 2 1 2 0 2 1 1 1 0 2 2 2 0 1 0 0 0 0 0 1 1 2 0 2 1 0 0 1 2 0 1 2
 2 0 2 1 0 1 1 1 2 0 0 1 0 2 1 0 0 2 0 1 2 0 0 0 2 0 2 0 2 0 2 0 2 2 1 1 2
 0 1 2 1 0 0 0 0 2 2 1 1 2 1 2 2 2 2 0 0 1 0 2 0 1 2 0 0 1 1 2 2 2 0 0 2 2
 2 0 2 0 0 0 1 0 0 0 0 1 