In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, zero_one_loss
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate


# Load the semicolon-delimited survey export, report how many rows were read,
# and preview the first five records.
df = pd.read_csv('smoking_driking_dataset.csv', sep=';')
print(len(df))
df.head()

In [None]:
# Drop rows whose waistline is 200 or more. `.copy()` makes the filtered frame
# independent of the original, so the column assignments below act on a real
# frame rather than on a slice (avoids SettingWithCopyWarning).
df = df[df['waistline'] < 200].copy()
print(df.shape[0])

# Encode the two categorical columns as 0/1 integers.
df['DRK_YN'] = df['DRK_YN'].replace({'Y': 1, 'N': 0})
df['sex'] = df['sex'].replace({'Female': 1, 'Male': 0})

# Z-score every column from position 5 up to (not including) the last one.
# Vectorised column arithmetic gives the same values as the per-element
# `apply(lambda ...)` it replaces, without a Python-level call per row.
for i in df.columns[5:-1]:
    mu = df[i].mean()
    sd = df[i].std()
    df[i] = (df[i] - mu) / sd

# Column order is reused by the plotting cell that follows.
cols = df.columns

# Kept as the last expression so the notebook actually renders the preview.
df.head(5)

In [None]:
# Overlay, for every column except the last, the normalised histogram of the
# rows with DRK_YN == 1 on top of the rows with DRK_YN == 0.
drinkers = df[df["DRK_YN"] == 1]
non_drinkers = df[df["DRK_YN"] == 0]
shared_style = dict(alpha=0.7, density=True)

for label in cols[:-1]:
    plt.hist(drinkers[label], color='blue', label='Drinks', **shared_style)
    plt.hist(non_drinkers[label], color='red', label='not', **shared_style)
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

In [None]:
# From here on the target column is called 'class'.
df = df.rename(columns={'DRK_YN': 'class'})

# Pairwise correlation of every column, drawn as an annotated heatmap.
cor = df.corr()
heatmap_options = {
    'linewidth': 0.5,
    'annot': True,
    'cmap': 'PiYG',
    'fmt': '.2f',
}
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(cor, ax=ax, **heatmap_options)
plt.show()

In [None]:
# One feature list per threshold: a column is kept when the absolute value of
# its correlation with 'class' is strictly greater than the threshold.
thresholds = [0.05, 0.1, 0.15, 0.2, 0.25]
class_corr = cor['class']
feature_matrix = [
    [name for name in df.columns[:-1] if abs(class_corr[name]) > limit]
    for limit in thresholds
]

for selected in feature_matrix:
    print(selected)

In [None]:
def generate_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_true` as a heatmap.

    The matrix is computed with `confusion_matrix` and rendered with
    `sns.heatmap` on the axes returned by `plt.subplot()`. Nothing is
    returned and the figure is not shown; the caller decides when to display it.
    """
    ax = plt.subplot()
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(counts, annot=True, fmt='g', ax=ax)

    text_size = 15
    ax.set_xlabel('Predicted labels', fontsize=text_size)
    ax.set_ylabel('True labels', fontsize=text_size)
    ax.set_title('Confusion Matrix', fontsize=text_size)
    
df_new = df.sample(n = 100000) # for checking different training methods and parameters

# Row offsets of the 60 / 20 / 20 train / validation / test split.
train_end = int(0.6*len(df_new))
valid_end = int(0.8*len(df_new))

# Repeat the experiment on 10 independent shuffles of the subsample and, for
# each shuffle, fit one logistic regression per candidate feature set.
for j in range(10):
    print('-'*40)
    # Positional slicing of the shuffled frame replaces np.split(DataFrame),
    # which goes through the deprecated DataFrame.swapaxes.
    shuffled = df_new.sample(frac=1)
    train = shuffled.iloc[:train_end]
    valid = shuffled.iloc[train_end:valid_end]
    test = shuffled.iloc[valid_end:]  # held out; not evaluated in this sweep
    y_train = train['class']
    y_valid = valid['class']

    for i in range(len(feature_matrix)):
        X_train = train[feature_matrix[i]]
        X_valid = valid[feature_matrix[i]]

        clf_1 = LogisticRegression(max_iter = 4000, solver = 'newton-cholesky', class_weight = 'balanced', C = 12)
        clf_1.fit(X_train, y_train)

        # Training metrics. log_loss needs predicted probabilities; feeding it
        # hard 0/1 labels only yields a clipped, rescaled error rate.
        y_pred = clf_1.predict(X_train)
        multi_accuracy = accuracy_score(y_train, y_pred)
        print(f"Prediction accuracy on training set: {100*multi_accuracy:.2f}%")
        loss = log_loss(y_train, clf_1.predict_proba(X_train), labels=clf_1.classes_)
        # This line used to be labelled 'validation loss' by mistake.
        print(('training loss', loss))

        # Validation metrics.
        y_valid_pred = clf_1.predict(X_valid)
        multi_accuracy = accuracy_score(y_valid, y_valid_pred)
        print(f"Prediction accuracy on validation set: {100*multi_accuracy:.2f}%")
        loss = log_loss(y_valid, clf_1.predict_proba(X_valid), labels=clf_1.classes_)
        print(('validation loss', loss))

In [None]:
# Sweep n_neighbors over 3..199 for every candidate feature set and plot
# training vs. validation accuracy, one figure per feature set.
x_line = []
y_line = []
y_w = []

df_new = df.sample(n = 100000)

# 60 / 20 / 20 split by positional slicing of the shuffled frame; this replaces
# np.split(DataFrame), which goes through the deprecated DataFrame.swapaxes.
shuffled = df_new.sample(frac=1)
train_end = int(0.6*len(df_new))
valid_end = int(0.8*len(df_new))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]  # held out; not evaluated in this sweep
y_train = train['class']
y_valid = valid['class']

# NOTE: the loop variable keeps the name `j` because a later cell reads `j`
# after this loop has finished.
for j in range(len(feature_matrix)):
    print('-'*40)

    # Start fresh curves for each feature set. Without the reset, every plot
    # would also contain the points of all previously evaluated feature sets.
    x_line = []
    y_line = []
    y_w = []

    # The feature columns do not depend on k, so slice them once per feature set.
    X_train = train[feature_matrix[j]]
    X_valid = valid[feature_matrix[j]]

    for i in range(3, 200):
        knn_model = KNeighborsClassifier(n_neighbors=i)
        knn_model.fit(X_train, y_train)

        y_pred = knn_model.predict(X_train)
        multi_accuracy = accuracy_score(y_train, y_pred)
        print(i)
        print(f"Prediction accuracy on training set: {100*multi_accuracy:.2f}%")
        print('Training loss: %.3f' % zero_one_loss(y_train, y_pred))
        y_line.append(100*multi_accuracy)

        y_valid_pred = knn_model.predict(X_valid)
        multi_accuracy = accuracy_score(y_valid, y_valid_pred)
        print(f"Prediction accuracy on validation set: {100*multi_accuracy:.2f}%")
        print('Validation loss: %.3f' % zero_one_loss(y_valid, y_valid_pred))
        print("")
        x_line.append(i)
        y_w.append(100*multi_accuracy)

    plt.plot(x_line, y_line, label='training')
    plt.plot(x_line, y_w, label='validation')
    plt.xlabel('n_neighbors')
    plt.ylabel('Accuracy (%)')
    plt.title(f'KNN accuracy, feature set {j}')
    plt.legend()
    # The same file name is written on every iteration, so only the figure of
    # the last feature set remains on disk.
    plt.savefig('KNN.png')
    plt.show()

In [None]:
# Sweep n_neighbors over 3..199 for a single feature set on a fresh subsample
# and plot training vs. validation accuracy.
x_line = []
y_line = []
y_w = []

# This cell used to index feature_matrix with `j`, a loop variable left over
# from the previous cell. After that loop `j` is the last index, so the last
# feature set is selected explicitly here; the cell no longer depends on
# leaked kernel state.
selected_features = feature_matrix[-1]

df_new = df.sample(n = 100000)

# 60 / 20 / 20 split by positional slicing of the shuffled frame; this replaces
# np.split(DataFrame), which goes through the deprecated DataFrame.swapaxes.
shuffled = df_new.sample(frac=1)
train_end = int(0.6*len(df_new))
valid_end = int(0.8*len(df_new))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]  # held out; not evaluated in this sweep

# Inputs and targets do not depend on k, so build them once.
X_train = train[selected_features]
y_train = train['class']
X_valid = valid[selected_features]
y_valid = valid['class']

for i in range(3, 200):
    knn_model = KNeighborsClassifier(n_neighbors=i)
    knn_model.fit(X_train, y_train)

    y_pred = knn_model.predict(X_train)
    multi_accuracy = accuracy_score(y_train, y_pred)
    print(i)
    print(f"Prediction accuracy on training set: {100*multi_accuracy:.2f}%")
    print('Training loss: %.3f' % zero_one_loss(y_train, y_pred))
    y_line.append(100*multi_accuracy)

    y_valid_pred = knn_model.predict(X_valid)
    multi_accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Prediction accuracy on validation set: {100*multi_accuracy:.2f}%")
    print('Validation loss: %.3f' % zero_one_loss(y_valid, y_valid_pred))
    print("")
    x_line.append(i)
    y_w.append(100*multi_accuracy)

plt.plot(x_line, y_line, label='training')
plt.plot(x_line, y_w, label='validation')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy (%)')
plt.title('KNN accuracy vs. n_neighbors')
plt.legend()
plt.savefig('KNN.png')
plt.show()

In [None]:
# Final logistic regression on the full data set, using the first feature set.

# 60 / 20 / 20 split by positional slicing of the shuffled frame; this replaces
# np.split(DataFrame), which goes through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1)
train_end = int(0.6*len(df))
valid_end = int(0.8*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

X_train = train[feature_matrix[0]]
y_train = train['class']
X_valid = valid[feature_matrix[0]]
y_valid = valid['class']
X_test = test[feature_matrix[0]]
y_test = test['class']

clf_1 = LogisticRegression(max_iter = 4000, solver = 'newton-cholesky', class_weight = 'balanced', C = 12)
clf_1.fit(X_train, y_train)

# log_loss is computed from predicted probabilities. Passing hard 0/1
# predictions, as before, only yields a clipped, rescaled error rate.
y_pred = clf_1.predict(X_train)
multi_accuracy = accuracy_score(y_train, y_pred)
print(f"Prediction accuracy on training set: {100*multi_accuracy:.2f}%")
print('Training loss: %.3f' % log_loss(y_train, clf_1.predict_proba(X_train), labels=clf_1.classes_))
print("")

y_valid_pred = clf_1.predict(X_valid)
multi_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Prediction accuracy on validation set: {100*multi_accuracy:.2f}%")
print('Validation loss: %.3f' % log_loss(y_valid, clf_1.predict_proba(X_valid), labels=clf_1.classes_))
print("")

y_test_pred = clf_1.predict(X_test)
generate_confusion_matrix(y_test, y_test_pred)
multi_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Prediction accuracy on test set: {100*multi_accuracy:.2f}%")
print('Test loss: %.3f' % log_loss(y_test, clf_1.predict_proba(X_test), labels=clf_1.classes_))

In [None]:
# Final KNN model (k = 50) on the full data set, using the third feature set.

# 60 / 20 / 20 split by positional slicing of the shuffled frame; this replaces
# np.split(DataFrame), which goes through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1)
train_end = int(0.6*len(df))
valid_end = int(0.8*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

X_train = train[feature_matrix[2]]
y_train = train['class']
X_valid = valid[feature_matrix[2]]
y_valid = valid['class']
X_test = test[feature_matrix[2]]
y_test = test['class']

knn_model = KNeighborsClassifier(n_neighbors=50)
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_train)
multi_accuracy = accuracy_score(y_train, y_pred)
print(f"Prediction accuracy on training set: {100*multi_accuracy:.2f}%")
print('Training loss: %.3f' % zero_one_loss(y_train, y_pred))

y_valid_pred = knn_model.predict(X_valid)
multi_accuracy = accuracy_score(y_valid, y_valid_pred)
# This figure was previously printed under the label "training set".
print(f"Prediction accuracy on validation set: {100*multi_accuracy:.2f}%")
print('Validation loss: %.3f' % zero_one_loss(y_valid, y_valid_pred))

y_test_pred = knn_model.predict(X_test)
generate_confusion_matrix(y_test, y_test_pred)
multi_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Prediction accuracy on test set: {100*multi_accuracy:.2f}%")
print('Test loss: %.3f' % zero_one_loss(y_test, y_test_pred))