In [105]:
#LIBRARIES:

#Import data wrangling libraries
import pandas as pd
import numpy as np
import os

#Import graphing libraries
import seaborn as sns
import matplotlib.pyplot as plt

#Import ML libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [112]:
# FUNCTIONS:

#Importing dataset
def get_data(file):
    """Load a semicolon-delimited CSV from the parent directory.

    Parameters
    ----------
    file : str
        Base name of the CSV file, without the ``.csv`` extension. The file
        is looked up one directory above the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``../<file>.csv``.
    """
    csv_path = '../{}.csv'.format(file)
    frame = pd.read_csv(csv_path, sep=';')
    return frame

#Defining features and groundtruth
def set_xy(data, test_size=0.20, random_state=None):
    """Split a data frame into train/test features and labels.

    The ``cardio`` column is used as the ground truth; every other column
    of ``data`` is kept as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``cardio`` column.
    test_size : float or int, default 0.20
        Forwarded to ``train_test_split``. The default keeps the original
        80/20 split.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same split
        on every run; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # NOTE(review): every non-target column is kept as a feature, including
    # any identifier-like column the frame may carry -- confirm this is
    # intended before trusting the scores.
    X = data.drop('cardio', axis=1)
    y = data['cardio']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

#Plotting a correlation heatmap
def plot_corr(df, size=10, save_path='images/corr_heatmap.png'):
    """Draw an annotated correlation heatmap of ``df`` and save it to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are plotted.
    size : int or float, default 10
        Width and height of the (square) figure, in inches.
    save_path : str, default 'images/corr_heatmap.png'
        Where the figure is written. The parent directory is created when it
        does not exist, so a fresh checkout does not fail on ``savefig``.
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    # Draw on the axes created above instead of relying on the implicit
    # "current axes", and keep `ax` bound to the Axes (not the title Text).
    sns.heatmap(corr, annot=True, cmap=cmap, ax=ax)
    ax.set_title("Data Correlation Heatmap")
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path)

#Checking data types and null values from data set
def check_data(df):
    """Print a quick sanity report for ``df`` and plot its correlations.

    Plots the correlation heatmap via ``plot_corr``, then prints the number
    of N/A values per column and the dtype of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    """
    print('------- Checking Data -------\n\n')
    # Use the argument, not the notebook-global `data`: the original only
    # worked when a global with that exact name happened to exist.
    plot_corr(df)
    print('The number of N/A values in the data frame are: \n{}\n\n'.format(df.isna().sum()))
    print('The types of data in the data frame are: \n{}\n'.format(df.dtypes))
    print('\n\n------- Checking Data -------\n\n')

#Graphing confusion matrix for results
def conf_mat(y_test, y_pred, title, size=3):
    """Plot the confusion matrix of a set of predictions as a heatmap.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    title : str
        Model name; shown as "Confusion Matrix for <title>".
    size : int or float, default 3
        Width and height of the (square) figure, in inches.
    """
    conf = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(size, size))
    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    # fmt='d' prints the integer counts as-is; seaborn's default '.2g'
    # would render larger counts in scientific notation (e.g. 5.1e+03).
    sns.heatmap(conf, annot=True, fmt='d', cmap=cmap, ax=ax)
    ax.set_title("Confusion Matrix for {}".format(title))
    # scikit-learn's confusion_matrix puts true labels on rows and
    # predicted labels on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    
#Logistic Regression prediction
def LR_model(X_train, X_test, y_train, y_test):
    """Fit a logistic regression classifier and report its test accuracy.

    Trains ``LogisticRegression(solver='lbfgs', max_iter=500)`` on the
    training split, predicts on the test split, plots the confusion matrix
    and prints the accuracy rounded to 4 decimals.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    """
    clf = LogisticRegression(solver='lbfgs', max_iter=500).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = round(accuracy_score(y_test, y_pred), 4)
    conf_mat(y_test, y_pred, 'Logistic Regression')
    # The model is a *logistic* regression; the original message mislabelled
    # it as "Linear Regresion".
    print('Logistic Regression accuracy score: {}'.format(acc))


def SVC_model(X_train, X_test, y_train, y_test):
    """Fit a linear-kernel support vector classifier and report test accuracy.

    Trains ``SVC(gamma='scale', kernel='linear')`` on the training split,
    predicts on the test split, plots the confusion matrix and prints the
    accuracy rounded to 4 decimals.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    """
    classifier = SVC(gamma='scale',  kernel='linear')
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    score = round(accuracy_score(y_test, predictions), 4)
    conf_mat(y_test, predictions, 'S.V.C.')
    report = 'S.V.C. accuracy score: {}'
    print(report.format(score))

In [None]:
#MAIN CODE:

# Pipeline: load the data, sanity-check it, split it, then train and score
# both classifiers on the same split.
if __name__ == "__main__":
    # Seed NumPy's global RNG so the train/test split (drawn from that RNG
    # when no random_state is given) and the reported scores are the same on
    # every Restart & Run All.
    np.random.seed(42)

    data = get_data('cardio_train')
    check_data(data)
    X_train, X_test, y_train, y_test = set_xy(data)
    LR_model(X_train, X_test, y_train, y_test)
    # NOTE(review): the saved output below shows only the logistic regression
    # score, so the SVC step may not have finished in that run -- confirm its
    # runtime is acceptable on the full data set.
    SVC_model(X_train, X_test, y_train, y_test)

------- Checking Data -------


The number of N/A values in the data frame are: 
id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


The types of data in the data frame are: 
id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object



------- Checking Data -------


Linear Regresion accuracy score: 0.7013
