<a href="https://colab.research.google.com/github/johnnystrada/dataprojects/blob/master/smoker_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import statsmodels.api as sm, pprint, math, seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score, roc_auc_score, roc_curve, auc, confusion_matrix


In [None]:
# Load the training data, engineer features, standardise them and create the
# train / validation split consumed by the model-fitting code below.
RANDOM_STATE = 123  # one seed for both the row sub-sampling and the split

# `eyesight(right)` is discarded immediately and never used afterwards.
df = pd.read_csv('/train.csv').drop(columns='eyesight(right)')

# Median weight / height per age, kept as small lookup tables indexed by age.
df_weight_age_med = df.groupby('age')['weight(kg)'].median().to_frame('med_weight')
df_height_age_med = df.groupby('age')['height(cm)'].median().to_frame('med_height')

# Broadcast the medians back to every row by looking them up per age.
# Unlike an inner merge, `map` never rebuilds the index or reorders rows,
# so `df` stays row-aligned with everything derived from it below.
df['med_weight'] = df['age'].map(df_weight_age_med['med_weight'])
df['med_height'] = df['age'].map(df_height_age_med['med_height'])
df['height(m)'] = df['height(cm)'] / 100
df['bmi'] = df['weight(kg)'] / (df['height(m)'] * df['height(m)'])
df['med_height_diff'] = df['height(cm)'] - df['med_height']
df['med_weight_diff'] = df['weight(kg)'] - df['med_weight']

# Candidate features: everything except the identifier, the target and the
# raw / helper columns that the engineered features were derived from.
features_raw = df.drop(
    columns=['weight(kg)', 'height(cm)', 'med_weight', 'med_height', 'id', 'smoking']
)
df_target = df[['smoking']].copy()

# Standardise every feature to zero mean / unit variance. The statistics are
# computed over the full training frame, before any rows are removed.
df_scaled = pd.DataFrame(
    scale(features_raw), columns=features_raw.columns, index=features_raw.index
)

# Re-attach the target and put the *unscaled* age back: the row filters below
# compare against real ages, and the model is fitted on this raw age column.
df_model = df_scaled.assign(age=df['age'], smoking=df_target['smoking'])

# Remove a random half of the age-40 rows and every row older than 65.
# NOTE(review): the motivation for these filters is not recorded in the
# notebook - presumably re-balancing / outlier trimming; confirm.
# The sample is seeded so the cell gives the same rows on every run.
age_40_half = df_model[df_model['age'] == 40].sample(frac=.5, random_state=RANDOM_STATE).index
over_65 = df_model[df_model['age'] > 65].index
df_model = df_model.drop(index=age_40_half.union(over_65))

# statsmodels does not add an intercept by itself, so supply a constant column.
df_model['const'] = 1
# These two features are excluded from the fitted model.
df_model = df_model.drop(columns=['hearing(right)', 'serum creatinine'])

tX1, vX1, tY1, vY1 = tts(
    df_model.drop(columns='smoking'),
    df_model['smoking'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)
# Fit a statsmodels logit on the training split and print its summary table.
md1 = sm.Logit(tY1, tX1).fit()
summary_table = md1.summary(title='Smoking Prediction Model', alpha=.05)
print(summary_table)

# Score the validation split: predicted probabilities, then hard 0/1 labels
# that are 1 only when the probability exceeds 0.85.
pY_prob1 = md1.predict(vX1)
pY1 = (pY_prob1 > 0.85).astype(int)

# AUC is computed from the probabilities, so it does not depend on the cut-off.
AUC = roc_auc_score(vY1, pY_prob1)

# confusion_matrix puts the actual classes on the rows and the predicted
# classes on the columns, both ordered by label. With labels pinned to
# [0, 1] the first row/column is therefore the negative class and the second
# the positive class, and the result is always 2x2 even if one class is
# never predicted at the chosen cut-off.
cm_counts = confusion_matrix(vY1, pY1, labels=[0, 1])
dfCM = pd.DataFrame(cm_counts, index=['True-', 'True+'], columns=['Pred-', 'Pred+'])
print(f'Confusion matrix:\n{dfCM}')
print(f'Out of sample accuracy: {np.mean(pY1 == vY1):.2f} and AUC:{AUC:.2f}')

# ROC curve of the validation predictions, with the diagonal of a random
# classifier drawn for reference.
fpr, tpr, thresholds = roc_curve(vY1, pY_prob1)

plt.rcParams['figure.figsize'] = [5, 5]
roc_points = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
ax = roc_points.plot(
    x='fpr',
    y='tpr',
    kind='line',
    grid=True,
    title='Receiver Operating Characteristic',
    label=f'ROC curve. AUC = {AUC:.2f}',
);

ax.plot([0, 1], [0, 1], 'r--');  # random predictions curve
ax.set_ylabel('True Positive Rate or (Sensitivity)');
ax.set_xlabel('False Positive Rate or (1 - Specificity)');


In [None]:
# Build the test-set design matrix exactly the way the training features were
# built, then predict smoking probabilities with the fitted model `md1`.

# Read the file once; keep an untouched copy for re-attaching `id` later.
df_test_raw = pd.read_csv('/test.csv')
df_test = df_test_raw.copy()

# Median weight / height per age, computed on the test set itself.
df_weight_age_med = df_test.groupby('age')['weight(kg)'].median().to_frame('med_weight')
df_height_age_med = df_test.groupby('age')['height(cm)'].median().to_frame('med_height')

# Look the medians up per row. An inner merge would rebuild the index and,
# depending on the pandas version, may reorder rows; since `id` is re-attached
# by index further down, any reordering would pair predictions with the wrong
# ids. `map` keeps every row where it was.
df_test['med_weight'] = df_test['age'].map(df_weight_age_med['med_weight'])
df_test['med_height'] = df_test['age'].map(df_height_age_med['med_height'])
df_test['height(m)'] = df_test['height(cm)'] / 100
df_test['bmi'] = df_test['weight(kg)'] / (df_test['height(m)'] * df_test['height(m)'])
df_test['med_height_diff'] = df_test['height(cm)'] - df_test['med_height']
df_test['med_weight_diff'] = df_test['weight(kg)'] - df_test['med_weight']
df_test['const'] = 1

# Keep exactly the columns the model was fitted on, in the fitted order.
# `predict` consumes the frame positionally, so relying on the incidental
# column order of the CSV would silently pair coefficients with the wrong
# features if the two ever diverged. A missing column raises a KeyError here.
model_cols = list(md1.params.index)
df_test = df_test[model_cols].copy()

# The model was fitted on standardised features (every column except the raw
# `age` and the constant), so the test features must be standardised with the
# *training* mean and population std (ddof=0, as sklearn's `scale` uses).
# `df` is the engineered training frame produced by the training cell.
scaled_cols = [col for col in model_cols if col not in ('age', 'const')]
train_mean = df[scaled_cols].mean()
train_std = df[scaled_cols].std(ddof=0).replace(0, 1)  # constant columns: divide by 1
df_test[scaled_cols] = (df_test[scaled_cols] - train_mean) / train_std

pY_prob0 = md1.predict(df_test)
# Assemble the prediction table and write the submission file.

# Re-attach the ids by row index (the feature frame carries no `id` column).
df_test = df_test.join(df_test_raw['id'], how='inner')

# Probabilities rounded to 4 decimals are what gets submitted.
pY_prob0 = pd.Series(pY_prob0).round(4)

# Hard 0/1 labels, kept for inspection only (they are not written to disk).
# NOTE(review): the cut-off here is 0.8 while the validation labels use 0.85;
# confirm which one is intended.
pY0 = (pY_prob0 > 0.8) * 1

df_prob = pY_prob0.to_frame('smoking')
df_pred = pY0.to_frame('pred')

# Index-aligned table of probability, id and hard label.
prediction_results = pd.concat([df_prob, df_test['id'], df_pred], axis=1, join='inner')

# Take id and probability from the same aligned table, and write the file
# once: the earlier extra write (with the index column) was overwritten
# immediately and served no purpose.
submission = prediction_results[['id', 'smoking']].copy()
submission.to_csv('submission.csv', index=False)