## Predicting diabetes

In [None]:
# Importing Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv('diabetesdataset.csv')

In [3]:
dataset.head(20)

Unnamed: 0,ID,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,1,6,148,72,35,0,33.6,0.627,50,pos
1,2,1,85,66,29,0,26.6,0.351,31,neg
2,3,8,183,64,0,0,23.3,0.672,32,pos
3,4,1,89,66,23,94,28.1,0.167,21,neg
4,5,0,137,40,35,168,43.1,2.288,33,pos
5,6,5,116,74,0,0,25.6,0.201,30,neg
6,7,3,78,50,32,88,31.0,0.248,26,pos
7,8,10,115,0,0,0,35.3,0.134,29,neg
8,9,2,197,70,45,543,30.5,0.158,53,pos
9,10,8,125,96,0,0,0.0,0.232,54,pos


In [4]:
#preprocessing
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder = LabelEncoder()

In [5]:
dataset['diabetes']= labelencoder.fit_transform(dataset['diabetes'])

In [6]:
dataset.head(20)

Unnamed: 0,ID,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1
5,6,5,116,74,0,0,25.6,0.201,30,0
6,7,3,78,50,32,88,31.0,0.248,26,1
7,8,10,115,0,0,0,35.3,0.134,29,0
8,9,2,197,70,45,543,30.5,0.158,53,1
9,10,8,125,96,0,0,0.0,0.232,54,1


In [7]:
median=dataset['diabetes'].mean()

In [8]:
median

0.3489583333333333

In [9]:
dataset[['ID', 'pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass',
       'pedigree', 'age']]=dataset[['ID', 'pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass',
       'pedigree', 'age']].replace(0,np.NaN)

In [10]:
dataset.isnull().sum()

ID            0
pregnant    111
glucose       5
pressure     35
triceps     227
insulin     374
mass         11
pedigree      0
age           0
diabetes      0
dtype: int64

In [11]:
dataset.fillna(dataset.mean(),inplace=True)

In [12]:
dataset.isnull().sum()

ID          0
pregnant    0
glucose     0
pressure    0
triceps     0
insulin     0
mass        0
pedigree    0
age         0
diabetes    0
dtype: int64

In [13]:
dataset.head(20)

Unnamed: 0,ID,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,1,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,2,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,3,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,4,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,5,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,6,5.0,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,7,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,8,10.0,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,9,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,10,8.0,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


In [14]:
dataset['diabetes']=dataset['diabetes'].astype(str)

In [15]:
dataset.dtypes

ID            int64
pregnant    float64
glucose     float64
pressure    float64
triceps     float64
insulin     float64
mass        float64
pedigree    float64
age           int64
diabetes     object
dtype: object

In [16]:
categorical_feature_mask=dataset.dtypes==object

In [17]:
categorical_feature_mask

ID          False
pregnant    False
glucose     False
pressure    False
triceps     False
insulin     False
mass        False
pedigree    False
age         False
diabetes     True
dtype: bool

In [18]:
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer([('onehotencoder', OneHotEncoder(), ['diabetes'])],remainder='passthrough')
dataset['diabetes']=np.array(onehotencoder.fit_transform(dataset), dtype=np.float)
print(dataset['diabetes'])

0      0.0
1      1.0
2      0.0
3      1.0
4      0.0
      ... 
763    1.0
764    1.0
765    1.0
766    0.0
767    1.0
Name: diabetes, Length: 768, dtype: float64


In [19]:
feature_columns=['ID', 'pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age']
predicted_values=['diabetes']

In [20]:
x=dataset[feature_columns].values
y=dataset[predicted_values].values

In [21]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 500)


In [22]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()


In [23]:
regressor.fit(x_train, y_train)

LinearRegression()

In [24]:
# Predicting the Test set results
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[ 0.34  0.  ]
 [ 0.82  1.  ]
 [ 0.41  0.  ]
 [-0.14  0.  ]
 [ 0.63  1.  ]
 [ 0.42  0.  ]
 [ 0.63  1.  ]
 [ 0.19  0.  ]
 [ 0.07  1.  ]
 [ 1.01  1.  ]
 [ 0.66  1.  ]
 [ 0.96  1.  ]
 [ 0.47  1.  ]
 [ 0.61  1.  ]
 [ 0.63  0.  ]
 [ 0.3   0.  ]
 [ 0.19  0.  ]
 [ 0.61  0.  ]
 [ 0.73  1.  ]
 [ 0.85  1.  ]
 [ 0.65  0.  ]
 [ 0.69  1.  ]
 [ 0.92  1.  ]
 [ 0.91  1.  ]
 [ 1.01  1.  ]
 [ 0.17  1.  ]
 [ 0.33  1.  ]
 [ 0.85  1.  ]
 [ 0.39  0.  ]
 [ 1.26  1.  ]
 [ 1.02  1.  ]
 [ 0.28  0.  ]
 [ 0.32  1.  ]
 [ 0.9   1.  ]
 [ 0.72  1.  ]
 [ 0.55  1.  ]
 [ 0.73  1.  ]
 [ 0.58  1.  ]
 [ 0.66  0.  ]
 [ 0.58  0.  ]
 [ 0.21  0.  ]
 [ 0.74  0.  ]
 [ 0.41  0.  ]
 [ 0.95  1.  ]
 [ 1.03  1.  ]
 [ 0.4   0.  ]
 [ 0.54  0.  ]
 [ 0.8   1.  ]
 [ 0.38  1.  ]
 [ 0.31  0.  ]
 [ 0.33  0.  ]
 [ 0.49  1.  ]
 [ 0.1   0.  ]
 [ 0.74  1.  ]
 [ 0.59  1.  ]
 [ 0.73  0.  ]
 [ 0.44  1.  ]
 [ 0.59  1.  ]
 [ 0.28  0.  ]
 [ 0.24  0.  ]
 [ 1.17  1.  ]
 [ 0.77  1.  ]
 [ 0.71  1.  ]
 [ 0.69  1.  ]
 [ 0.8   1.  ]
 [ 1.09  1.  ]
 [ 0.63  0

In [25]:
import sklearn.metrics as met
mse=met.mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mse)
r2=met.r2_score(y_test,y_pred)
print('Mean Squared error:',mse)
print('Root mean squared error:',rmse)
print('R2_score:',r2)

Mean Squared error: 0.15508815325702136
Root mean squared error: 0.39381233253546205
R2_score: 0.3128954525231612
