# Classification - Example

Classification predictive modeling is the task of approximating a mapping function (f) from input variables (X) to discrete output variables (y).

The output variables are often called labels or categories. The mapping function predicts the class or category for a given observation.

# 1. Initializing libraries

In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline


# 2. Loading Dataset

In [None]:
data = pd.read_csv("/content/diabetes.csv")

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


# 3. Understanding the dataset

In [None]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [None]:
data.describe

<bound method NDFrame.describe of      Pregnancies  Glucose  ...  Age  Outcome
0              6      148  ...   50        1
1              1       85  ...   31        0
2              8      183  ...   32        1
3              1       89  ...   21        0
4              0      137  ...   33        1
..           ...      ...  ...  ...      ...
763           10      101  ...   63        0
764            2      122  ...   27        0
765            5      121  ...   30        0
766            1      126  ...   47        1
767            1       93  ...   23        0

[768 rows x 9 columns]>

In [None]:
data.shape

(768, 9)

In [None]:
corr = data.corr()
print(corr)

                          Pregnancies   Glucose  ...       Age   Outcome
Pregnancies                  1.000000  0.129459  ...  0.544341  0.221898
Glucose                      0.129459  1.000000  ...  0.263514  0.466581
BloodPressure                0.141282  0.152590  ...  0.239528  0.065068
SkinThickness               -0.081672  0.057328  ... -0.113970  0.074752
Insulin                     -0.073535  0.331357  ... -0.042163  0.130548
BMI                          0.017683  0.221071  ...  0.036242  0.292695
DiabetesPedigreeFunction    -0.033523  0.137337  ...  0.033561  0.173844
Age                          0.544341  0.263514  ...  1.000000  0.238356
Outcome                      0.221898  0.466581  ...  0.238356  1.000000

[9 rows x 9 columns]


In [None]:
print(data.mean())

Pregnancies                   3.845052
Glucose                     120.894531
BloodPressure                69.105469
SkinThickness                20.536458
Insulin                      79.799479
BMI                          31.992578
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64


In [None]:
print(data.std())

Pregnancies                   3.369578
Glucose                      31.972618
BloodPressure                19.355807
SkinThickness                15.952218
Insulin                     115.244002
BMI                           7.884160
DiabetesPedigreeFunction      0.331329
Age                          11.760232
Outcome                       0.476951
dtype: float64


In [None]:
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# 4. Labelling the data

In [None]:
X = data.drop('Outcome', axis=1)  
y = data['Outcome']

# 5. Splitting the train data and test data

In [None]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [None]:
print(X_train)

     Pregnancies  Glucose  BloodPressure  ...   BMI  DiabetesPedigreeFunction  Age
409            1      172             68  ...  42.4                     0.702   28
434            1       90             68  ...  24.5                     1.138   36
234            3       74             68  ...  29.7                     0.293   23
112            1       89             76  ...  31.2                     0.192   23
82             7       83             78  ...  29.3                     0.767   36
..           ...      ...            ...  ...   ...                       ...  ...
116            5      124             74  ...  34.0                     0.220   38
407            0      101             62  ...  21.9                     0.336   25
290            0       78             88  ...  36.9                     0.434   21
302            5       77             82  ...  35.8                     0.156   35
365            5       99             54  ...  34.0                     0.499   30

[61

In [None]:
print(y_train)

409    1
434    0
234    0
112    0
82     0
      ..
116    1
407    0
290    0
302    0
365    0
Name: Outcome, Length: 614, dtype: int64


In [None]:
print(X_test)

     Pregnancies  Glucose  BloodPressure  ...   BMI  DiabetesPedigreeFunction  Age
70             2      100             66  ...  32.9                     0.867   28
265            5       96             74  ...  33.6                     0.997   43
284            2      108             80  ...  27.0                     0.259   52
693            7      129             68  ...  38.5                     0.439   43
229            0      117             80  ...  45.2                     0.089   24
..           ...      ...            ...  ...   ...                       ...  ...
277            0      104             64  ...  27.8                     0.454   23
497            2       81             72  ...  30.1                     0.547   25
375           12      140             82  ...  39.2                     0.528   58
367            0      101             64  ...  21.0                     0.252   21
626            0      125             68  ...  24.7                     0.206   21

[15

In [None]:
print(y_test)

70     1
265    0
284    1
693    1
229    0
      ..
277    0
497    0
375    1
367    0
626    0
Name: Outcome, Length: 154, dtype: int64


# 6. Running algorithms

1. Let us run Logistic Regression on the dataset. 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
data_Check = LogisticRegression()
data_Check.fit(X_train, y_train)
y_pred = data_Check.predict(X_test)
correct = (y_pred == y_test).sum()
accuracy = correct/len(y_test)*100
print("accuracy = ", accuracy, "%")

accuracy =  69.48051948051948 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2. Let us check with Clasification Decison Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier  
data_classifier = DecisionTreeClassifier()  
data_classifier.fit(X_train, y_train) 
y_pred = data_classifier.predict(X_test)


correct = (y_pred==y_test).sum()
accuracy = correct/len(y_test)*100
print("accuracy = ", accuracy, "%")


accuracy =  72.72727272727273 %


3. Let us check with Support Vector Machine

In [None]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='linear')  
svclassifier.fit(X, y)

y_pred = svclassifier.predict(X)
correct = (y_pred==y).sum()
accuracy = correct/len(y)*100
print("accuracy = ", accuracy, "%")

accuracy =  77.734375 %


4. Let us check with Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB() 
gnb.fit(X, y) 
y_pred = gnb.predict(X)
correct = (y_pred==y).sum()
accuracy = correct/len(y)*100
print("accuracy = ", accuracy, "%")

accuracy =  76.30208333333334 %


In [None]:
class Students:
     def __init__(self,a="Cmrit"):
         self.a=a
 
     def display(self):
         print(self.a)
obj=test()
obj.display()

Hello World


In [None]:
class test:
     def __init__(self,c):
         self.c=c
 
     def display(self):
         print(self.c)
obj=test()
obj.display()

TypeError: ignored