# Keeping the data set as imbalanced

In [None]:
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/Datasets/framingham.csv")
df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0
4237,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
4238,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [None]:
# checking the output to find whether it is imabalanced or not
df['TenYearCHD'].value_counts()

# here one type of output have significatly more than the other one. So, when we train the machine, the accuracy could be high, but the results will be biased

0    3596
1     644
Name: TenYearCHD, dtype: int64

In [None]:
df.isna().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [None]:
df.drop(['education'],axis=1,inplace=True)

In [None]:
columns=["cigsPerDay",'BPMeds','totChol','heartRate','glucose']
for i in columns:
  df[i]=df[i].fillna(df[i].mode()[0])
df['BMI']=df['BMI'].fillna(df['BMI'].mean())

In [None]:
df.isna().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [None]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
knn=KNeighborsClassifier()
sv=SVC()
nb=GaussianNB()
models=[knn,sv,nb]
for model in models:
  print(model)
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(classification_report(y_test,y_pred))

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1085
           1       0.30      0.09      0.13       187

    accuracy                           0.84      1272
   macro avg       0.58      0.53      0.52      1272
weighted avg       0.78      0.84      0.80      1272

SVC()
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1085
           1       0.00      0.00      0.00       187

    accuracy                           0.85      1272
   macro avg       0.43      0.50      0.46      1272
weighted avg       0.73      0.85      0.78      1272

GaussianNB()
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      1085
           1       0.36      0.22      0.28       187

    accuracy                           0.83      1272
   macro avg       0.62      0.58      0.59      1272
weighted avg       0.80      0.8

# Doing the same problem after making the dataset balanced
-----------------------------------------------------------

2 Methods of balancing the dataset: Over sampling and under sampling. We cant say upfront which method will imporve the efficiancy since it depends on the data set. In the above problem, the output:TenYrCHD have the following value; 0:3596 and 1:644.

#1. Over Sampling
-----------------
In this we will increase the number of minority sample to match the majority sample. ie, in the above problem, the number of 1 will be increase to 3596 from 644

# 2. Under Sampling
------------------
In this, will reduce the majority samples count to minority sample.

# Doing the problem with Over Sampling
----------------------------------------
Here we are using SMOTE algorithm (Synthetic minority oversampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE
os=SMOTE(random_state=1)
x_os,y_os=os.fit_resample(x,y)

In [None]:
y_os.value_counts() # both are matching

0    3596
1    3596
Name: TenYearCHD, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_os,y_os,test_size=0.3,random_state=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
knn=KNeighborsClassifier()
sv=SVC()
nb=GaussianNB()
models=[knn,sv,nb]
for model in models:
  print(model)
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(classification_report(y_test,y_pred))

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.83      0.69      0.75      1074
           1       0.73      0.86      0.79      1084

    accuracy                           0.77      2158
   macro avg       0.78      0.77      0.77      2158
weighted avg       0.78      0.77      0.77      2158

SVC()
              precision    recall  f1-score   support

           0       0.73      0.68      0.70      1074
           1       0.70      0.75      0.72      1084

    accuracy                           0.71      2158
   macro avg       0.71      0.71      0.71      2158
weighted avg       0.71      0.71      0.71      2158

GaussianNB()
              precision    recall  f1-score   support

           0       0.58      0.85      0.69      1074
           1       0.72      0.38      0.50      1084

    accuracy                           0.62      2158
   macro avg       0.65      0.62      0.59      2158
weighted avg       0.65      0.6

# Doing the problem with UnderSampling
----------------------------------------
Here we are using Random Undersampler algorithm

In [None]:
from imblearn.under_sampling import RandomUnderSampler
us=RandomUnderSampler(random_state=1)
x_us,y_us=us.fit_resample(x,y)

In [None]:
y_us.value_counts() # both are matching

0    644
1    644
Name: TenYearCHD, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_us,y_us,test_size=0.3,random_state=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
knn=KNeighborsClassifier()
sv=SVC()
nb=GaussianNB()
models=[knn,sv,nb]
for model in models:
  print(model)
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(classification_report(y_test,y_pred))

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.61      0.63      0.62       196
           1       0.61      0.59      0.60       191

    accuracy                           0.61       387
   macro avg       0.61      0.61      0.61       387
weighted avg       0.61      0.61      0.61       387

SVC()
              precision    recall  f1-score   support

           0       0.68      0.60      0.64       196
           1       0.63      0.72      0.67       191

    accuracy                           0.66       387
   macro avg       0.66      0.66      0.66       387
weighted avg       0.66      0.66      0.66       387

GaussianNB()
              precision    recall  f1-score   support

           0       0.54      0.98      0.70       196
           1       0.88      0.15      0.26       191

    accuracy                           0.57       387
   macro avg       0.71      0.57      0.48       387
weighted avg       0.71      0.5

In this case, the accuracy reduced. Sometimes, we will get better accuracy.