In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Scaling
- Used to bring features having different magnitudes to similar scale

In [14]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [15]:
data = [[12000,6],[30000,7],[15000,4],[22000,5]]

In [16]:
scale = MinMaxScaler()    # Scales values between 0 and 1 (minimum as 0 and maximum as 1)
model = scale.fit(data)
scaled_values = model.transform(data)
scaled_values

array([[0.        , 0.66666667],
       [1.        , 1.        ],
       [0.16666667, 0.        ],
       [0.55555556, 0.33333333]])

In [17]:
obj = StandardScaler()   #Scales values in terms of mean and standard deviation
model = obj.fit(data)
scales_data = model.transform(data)
scales_data

array([[-1.11643773,  0.4472136 ],
       [ 1.47657893,  1.34164079],
       [-0.68426829, -1.34164079],
       [ 0.32412708, -0.4472136 ]])

## Encoding
- Convert non numeric values to numeric representation
- One hot encoding
- Label encoding

In [18]:
hrt = pd.read_csv('heart.csv')
hrt.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49.0,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37.0,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48.0,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54.0,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [19]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [20]:
hrt.isna().sum()

Age               4
Sex               3
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [21]:
hrt['Age'].fillna(np.mean(hrt['Age']), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hrt['Age'].fillna(np.mean(hrt['Age']), inplace = True)


In [22]:
hrt.Sex.value_counts()

Sex
M    723
F    192
Name: count, dtype: int64

In [23]:
hrt['Sex'].fillna("M", inplace = True)
hrt.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hrt['Sex'].fillna("M", inplace = True)


Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [24]:
#Using get_dummies function to encode Sex variable
hrt = pd.get_dummies(hrt, columns=['Sex'], dtype=int)
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_F,Sex_M
0,40.0,ATA,140,289,0,Normal,172,N,0.0,Up,0,0,1
1,49.0,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1,0
2,37.0,ATA,130,283,0,ST,98,N,0.0,Up,0,0,1
3,48.0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,1,0
4,54.0,NAP,150,195,0,Normal,122,N,0.0,Up,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,TA,110,264,0,Normal,132,N,1.2,Flat,1,0,1
914,68.0,ASY,144,193,1,Normal,141,N,3.4,Flat,1,0,1
915,57.0,ASY,130,131,0,Normal,115,Y,1.2,Flat,1,0,1
916,57.0,ATA,130,236,0,LVH,174,N,0.0,Flat,1,1,0


In [25]:
hrt.drop('Sex_M', inplace=True, axis=1)
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_F
0,40.0,ATA,140,289,0,Normal,172,N,0.0,Up,0,0
1,49.0,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1
2,37.0,ATA,130,283,0,ST,98,N,0.0,Up,0,0
3,48.0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,1
4,54.0,NAP,150,195,0,Normal,122,N,0.0,Up,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,TA,110,264,0,Normal,132,N,1.2,Flat,1,0
914,68.0,ASY,144,193,1,Normal,141,N,3.4,Flat,1,0
915,57.0,ASY,130,131,0,Normal,115,Y,1.2,Flat,1,0
916,57.0,ATA,130,236,0,LVH,174,N,0.0,Flat,1,1


In [26]:
hrt = hrt.rename(columns={'Sex_F' : 'Gender'})
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Gender
0,40.0,ATA,140,289,0,Normal,172,N,0.0,Up,0,0
1,49.0,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1
2,37.0,ATA,130,283,0,ST,98,N,0.0,Up,0,0
3,48.0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,1
4,54.0,NAP,150,195,0,Normal,122,N,0.0,Up,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,TA,110,264,0,Normal,132,N,1.2,Flat,1,0
914,68.0,ASY,144,193,1,Normal,141,N,3.4,Flat,1,0
915,57.0,ASY,130,131,0,Normal,115,Y,1.2,Flat,1,0
916,57.0,ATA,130,236,0,LVH,174,N,0.0,Flat,1,1


In [27]:
hrt = pd.get_dummies(hrt, columns=['ST_Slope'], dtype=int)
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,Gender,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40.0,ATA,140,289,0,Normal,172,N,0.0,0,0,0,0,1
1,49.0,NAP,160,180,0,Normal,156,N,1.0,1,1,0,1,0
2,37.0,ATA,130,283,0,ST,98,N,0.0,0,0,0,0,1
3,48.0,ASY,138,214,0,Normal,108,Y,1.5,1,1,0,1,0
4,54.0,NAP,150,195,0,Normal,122,N,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,TA,110,264,0,Normal,132,N,1.2,1,0,0,1,0
914,68.0,ASY,144,193,1,Normal,141,N,3.4,1,0,0,1,0
915,57.0,ASY,130,131,0,Normal,115,Y,1.2,1,0,0,1,0
916,57.0,ATA,130,236,0,LVH,174,N,0.0,1,1,0,1,0


In [28]:
chestpain = LabelEncoder()
hrt['ChestPainType'] = chestpain.fit_transform(hrt['ChestPainType'])

In [29]:
hrt.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [30]:
ecg = LabelEncoder()
hrt['RestingECG'] = ecg.fit_transform(hrt['RestingECG'])
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,Gender,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40.0,1,140,289,0,1,172,N,0.0,0,0,0,0,1
1,49.0,2,160,180,0,1,156,N,1.0,1,1,0,1,0
2,37.0,1,130,283,0,2,98,N,0.0,0,0,0,0,1
3,48.0,0,138,214,0,1,108,Y,1.5,1,1,0,1,0
4,54.0,2,150,195,0,1,122,N,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,3,110,264,0,1,132,N,1.2,1,0,0,1,0
914,68.0,0,144,193,1,1,141,N,3.4,1,0,0,1,0
915,57.0,0,130,131,0,1,115,Y,1.2,1,0,0,1,0
916,57.0,1,130,236,0,0,174,N,0.0,1,1,0,1,0


In [31]:
hrt.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [32]:
exercise = LabelEncoder()
hrt['ExerciseAngina'] = exercise.fit_transform(hrt['ExerciseAngina'])
hrt

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,Gender,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40.0,1,140,289,0,1,172,0,0.0,0,0,0,0,1
1,49.0,2,160,180,0,1,156,0,1.0,1,1,0,1,0
2,37.0,1,130,283,0,2,98,0,0.0,0,0,0,0,1
3,48.0,0,138,214,0,1,108,1,1.5,1,1,0,1,0
4,54.0,2,150,195,0,1,122,0,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,3,110,264,0,1,132,0,1.2,1,0,0,1,0
914,68.0,0,144,193,1,1,141,0,3.4,1,0,0,1,0
915,57.0,0,130,131,0,1,115,1,1.2,1,0,0,1,0
916,57.0,1,130,236,0,0,174,0,0.0,1,1,0,1,0


In [33]:
#Take Titanic data and perform encoding
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [35]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [36]:
df['Age'].fillna(np.mean(df['Age']), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(np.mean(df['Age']), inplace = True)


In [37]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [38]:
df['Embarked'].fillna('S',inplace = True)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S',inplace = True)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [39]:
df.drop("Cabin", axis=1, inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C


In [40]:
df.drop(['PassengerId','Name','Ticket'], inplace=True, axis=True)

In [41]:
df = pd.get_dummies(df, columns=['Sex'], dtype=int)
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0,3,22.000000,1,0,7.2500,S,0,1
1,1,1,38.000000,1,0,71.2833,C,1,0
2,1,3,26.000000,0,0,7.9250,S,1,0
3,1,1,35.000000,1,0,53.1000,S,1,0
4,0,3,35.000000,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,S,0,1
887,1,1,19.000000,0,0,30.0000,S,1,0
888,0,3,29.699118,1,2,23.4500,S,1,0
889,1,1,26.000000,0,0,30.0000,C,0,1


In [42]:
df.drop('Sex_male', inplace=True, axis=1)
df = df.rename(columns={'Sex_female':'Gender'})
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,0,3,22.000000,1,0,7.2500,S,0
1,1,1,38.000000,1,0,71.2833,C,1
2,1,3,26.000000,0,0,7.9250,S,1
3,1,1,35.000000,1,0,53.1000,S,1
4,0,3,35.000000,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,S,0
887,1,1,19.000000,0,0,30.0000,S,1
888,0,3,29.699118,1,2,23.4500,S,1
889,1,1,26.000000,0,0,30.0000,C,0


In [43]:
emb = LabelEncoder()
df['Embarked'] = emb.fit_transform(df['Embarked'])
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,0,3,22.000000,1,0,7.2500,2,0
1,1,1,38.000000,1,0,71.2833,0,1
2,1,3,26.000000,0,0,7.9250,2,1
3,1,1,35.000000,1,0,53.1000,2,1
4,0,3,35.000000,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,2,0
887,1,1,19.000000,0,0,30.0000,2,1
888,0,3,29.699118,1,2,23.4500,2,1
889,1,1,26.000000,0,0,30.0000,0,0


### Split data into input features (X) and output feature (y)

In [44]:
X = df[['Pclass','Age','SibSp','Parch','Fare','Embarked','Gender']]
y = df['Survived']

In [45]:
print(X.shape)
print(y.shape)

(891, 7)
(891,)


In [46]:
x = df.drop('Survived', axis=1)   #Another way to select input columns
x
y = df['Survived']

## Splitting the data into train and test
- Train data should be around 75 to 80% of total values
- test data should have remaining 25 to 20% of total values

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [49]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(668, 7)
(668,)
(223, 7)
(223,)


In [50]:
x_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
877,3,19.000000,0,0,7.8958,2,0
460,1,48.000000,0,0,26.5500,2,0
717,2,27.000000,0,0,10.5000,2,1
411,3,29.699118,0,0,6.8583,1,0
25,3,38.000000,1,5,31.3875,2,1
...,...,...,...,...,...,...,...
116,3,70.500000,0,0,7.7500,1,0
141,3,22.000000,0,0,7.7500,2,1
686,3,14.000000,4,1,39.6875,2,0
829,1,62.000000,0,0,80.0000,2,1


In [51]:
y_train

877    0
460    1
717    1
411    0
25     1
      ..
116    0
141    1
686    0
829    1
283    1
Name: Survived, Length: 668, dtype: int64