# Diabetes prediction

In [None]:
# Importing Dependencies 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Preview the first five rows of the DataFrame
df.head(n=5)

In [None]:
# Shape of the dataset: DataFrame.shape is a (rows, columns) tuple
df.shape

In [None]:
# Count missing (NaN/None) entries in every column
df.isna().sum()

#### There are no null values

In [None]:
# Some more information about the dataset: DataFrame.info() prints a
# per-column summary (column name, non-null count, dtype)
df.info()

#### All features are integer or float type

In [None]:
# Statistical analysis: descriptive statistics (count, mean, std, min,
# quartiles, max) for the numeric columns
df.describe()

In [None]:
# Number of rows for each distinct value of the Outcome column
df.Outcome.value_counts()

# Data Visualization

In [None]:
## Distribution plot
# Histogram of Age on a density scale with a KDE curve overlaid.
# sns.distplot is deprecated; histplot with stat='density' and
# kde_kws=dict(cut=3) is the replacement that approximates its output.
sns.histplot(df['Age'], kde=True, stat='density', kde_kws=dict(cut=3), color='purple')

#### Most observations fall in the age group around 25

In [None]:
# Histogram of Pregnancies on a density scale with a KDE curve overlaid.
# sns.distplot is deprecated; histplot with stat='density' and
# kde_kws=dict(cut=3) is the replacement that approximates its output.
sns.histplot(df['Pregnancies'], kde=True, stat='density', kde_kws=dict(cut=3), color='purple')

#### This feature is right-skewed

In [None]:
# Histogram of Glucose on a density scale with a KDE curve overlaid.
# sns.distplot is deprecated; histplot with stat='density' and
# kde_kws=dict(cut=3) is the replacement that approximates its output.
sns.histplot(df['Glucose'], kde=True, stat='density', kde_kws=dict(cut=3), color='purple')

In [None]:
# Histogram of BloodPressure on a density scale with a KDE curve overlaid.
# sns.distplot is deprecated; histplot with stat='density' and
# kde_kws=dict(cut=3) is the replacement that approximates its output.
sns.histplot(df['BloodPressure'], kde=True, stat='density', kde_kws=dict(cut=3), color='purple')

In [None]:
# Count of rows per Age value, split by Outcome, drawn on a 15x8 figure
plt.figure(figsize=(15, 8))
sns.countplot(data=df, x='Age', hue='Outcome')

In [None]:
# Pairwise column correlations, drawn as a heatmap with each cell annotated
correlations = df.corr()
sns.heatmap(correlations, annot=True)

## Data preprocessing

In [None]:
# Mean of every column within each Outcome group
df.groupby(by='Outcome').mean()

In [None]:
# Separate features and target
y = df['Outcome']                # target column
X = df.drop(columns='Outcome')   # every remaining column is a feature

In [None]:
# Show the feature table followed by the target series, one per line
print(X, y, sep='\n')

## Data Standardization

In [None]:
# Fit a StandardScaler on the feature matrix X, then standardize X with it.
# NOTE(review): the scaler is fitted on every row of X at this point.
scaler = StandardScaler()
new_df = scaler.fit(X).transform(X)

In [None]:
# Display the standardized feature values returned by the scaler
new_df

In [None]:
# From here on X refers to the standardized features, not the raw columns
X=new_df

## Separating training and test data

In [None]:
# train_test_split
# Split the raw (unscaled) feature columns first, so the held-out rows cannot
# influence the scaling statistics. Fitting the scaler on all rows before
# splitting leaks test-set information into training.
raw_features = df.drop('Outcome', axis=1)
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, stratify=y, random_state=2
)
# Re-fit the shared `scaler` on the training rows only; the prediction cell
# further down reuses this same object to standardize new inputs.
scaler = StandardScaler().fit(raw_train)
X_train = scaler.transform(raw_train)
X_test = scaler.transform(raw_test)

In [None]:
# Shapes of the full, training and test feature arrays, in that order
print(*(features.shape for features in (X, X_train, X_test)))

## Training the model

In [None]:
# Support-vector classifier with a linear kernel (constructed, not yet fitted)
classifier=svm.SVC(kernel='linear')

In [None]:
# Train the classifier on the training features and labels
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the training rows and the test rows, in that order
predict_x_train, predict_y = (
    classifier.predict(features) for features in (X_train, X_test)
)

## Accuracy score

In [None]:
# Training accuracy score: fraction of training labels predicted correctly
training_accuaracy = accuracy_score(y_true=y_train, y_pred=predict_x_train)
training_accuaracy

In [None]:
# Test data accuracy score: fraction of held-out labels predicted correctly.
# accuracy_score expects (y_true, y_pred); pass the true labels first,
# matching the training-accuracy call above.
test_accuracy = accuracy_score(y_test, predict_y)
test_accuracy

# Making the predictive Model

In [None]:
# One sample to classify, given as raw (unscaled) feature values
input_data = (1, 85, 66, 29, 0, 26.6, 0.351, 31)
input_data_asarray = np.asarray(input_data)
# Add a leading axis so the sample becomes a single-row 2-D array
data_reshape = input_data_asarray[np.newaxis, :]
print(data_reshape)

In [None]:
# Standardize the sample with the fitted scaler, classify it, then show both
std_data = scaler.transform(data_reshape)
output = classifier.predict(std_data)
print(std_data, output, sep='\n')

In [None]:
# `output` is the array returned by classifier.predict for the single sample;
# compare its one element explicitly instead of relying on the truthiness of
# a whole array, which raises if the array ever holds more than one element.
if output[0] == 1:
    print("person is diabetic")
else:
    print('Person is not diabetic')