In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.decomposition import PCA

# Preprocessing and metrics
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature labels
import drugfeatures

# Define functions for cleaning data

In [3]:
# Drop null or invalid responses - 99 = Not Stated, 96 = Valid Skip, 9 = 'I dont know'
def DropNull(features, invalid_codes=(99, 96, 9)):
    """Return ``features`` without the rows that contain invalid responses.

    Every cell equal to one of ``invalid_codes`` is replaced with ``pd.NA``;
    any row that then holds a missing value (including values that were
    already missing in the input) is dropped.

    Parameters
    ----------
    features : pandas.DataFrame
        Survey responses. The input object is not modified.
    invalid_codes : iterable of scalars, optional
        Response codes treated as missing. Defaults to 99 (Not Stated),
        96 (Valid Skip) and 9 ("I don't know").

    Returns
    -------
    pandas.DataFrame
        The rows of ``features`` that have no invalid or missing entry.
    """
    # NOTE(review): the codes are matched in *every* column. Confirm that none
    # of them is ever a legitimate answer for a feature passed in here.
    cleaned = features
    for code in invalid_codes:
        cleaned = cleaned.replace(code, pd.NA)
    return cleaned.dropna(axis=0)

# Binary class creation for 'ALC_050' - 'How often did you have 5 or more drinks on one occasion?'
# 1 = Less than once a month or never, 2 = more than once a month
def y_replace(y):
    """Collapse the response codes in ``y`` into two classes.

    Codes 2 and 3 are merged into class 1 (code 1 is already class 1) and
    codes 4 through 8 are merged into class 2. Any other value is returned
    unchanged. A new object is returned; ``y`` itself is not modified.
    """
    # (old code, new class) pairs, applied one after another in this order so
    # that codes turned into class 2 are never re-mapped by the "2 -> 1" step.
    recode_steps = ((2, 1), (3, 1), (4, 2), (5, 2), (6, 2), (7, 2), (8, 2))
    recoded = y
    for old_code, new_class in recode_steps:
        recoded = recoded.replace(old_code, new_class)
    return recoded

# Define functions for training models

In [5]:
# Logistic Regression model
def logreg(X,y,label):
    """Train a logistic regression on ``X``/``y`` and print a hold-out evaluation.

    The data is split 80/20 with a fixed seed, the features are standardised
    with statistics learned from the training part only, and the accuracy,
    classification report and confusion matrix for the test part are printed.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target labels aligned with ``X``.
    label : text printed in front of the accuracy and confusion matrix.

    Returns
    -------
    None. Results are only printed.
    """
    # Hold out 20% of the rows for evaluation (seeded for reproducibility)
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardise using the training statistics only, so nothing leaks from the test rows
    standardizer = StandardScaler().fit(X_fit)
    X_fit_std = standardizer.transform(X_fit)
    X_eval_std = standardizer.transform(X_eval)

    # Fit the classifier and predict the held-out rows
    classifier = LogisticRegression(random_state=42).fit(X_fit_std, y_fit)
    predictions = classifier.predict(X_eval_std)

    # Print accuracy, classification report and confusion matrix
    print(label, "- Accuracy:", accuracy_score(y_eval, predictions))
    # zero_division=1: a metric that would divide by zero (e.g. the precision of
    # a class that is never predicted) is reported as 1 instead of raising a warning.
    print("\nClassification Report:\n", classification_report(y_eval, predictions, zero_division=1))

    class_labels = classifier.classes_
    matrix = confusion_matrix(y_eval, predictions, labels = class_labels)
    print(label, '- Confusion Matrix:\n')
    print(class_labels)
    print(matrix, '\n\n_______________________________________________________________________\n')

# Decision Tree
def decisiontree(X,y,label):
    """Train a decision tree on ``X``/``y`` and print a hold-out evaluation.

    The data is split 80/20 with a fixed seed, the features are standardised
    with statistics learned from the training part only, and the accuracy,
    classification report and confusion matrix for the test part are printed.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target labels aligned with ``X``.
    label : text printed in front of the accuracy and confusion matrix.

    Returns
    -------
    None. Results are only printed.
    """
    # Split training/testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale data (statistics come from the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit to decision tree model.
    # random_state is fixed so that repeated runs build the same tree: the
    # estimator permutes features at every split, so an unseeded tree can
    # give different results on identical data.
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Print accuracy, classification report and confusion matrix
    print(label, "- Accuracy:", accuracy_score(y_test, y_pred))
    # zero_division=1: a metric that would divide by zero is reported as 1
    # instead of raising a warning.
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

    cm = confusion_matrix(y_test, y_pred, labels = model.classes_)
    print(label, '- Confusion Matrix:\n')
    print(model.classes_)
    print(cm, '\n\n_______________________________________________________________________\n')

# Define features from drugfeatures.py

In [7]:
# For every feature set: drop rows with null/invalid responses, separate the
# predictors from the 'ALC_050' target, and recode the target into binary
# classes (1 for low risk, 2 for high risk).

# Demographic variables
dv_features = DropNull(drugfeatures.dv_features)
X_dv = dv_features.drop(columns='ALC_050')
y_dv = y_replace(dv_features['ALC_050'])

# Experienced bullying
bullied_features = DropNull(drugfeatures.bullied_features)
X_bullied = bullied_features.drop(columns='ALC_050')
y_bullied = y_replace(bullied_features['ALC_050'])

# Bullied another student
bully_features = DropNull(drugfeatures.bully_features)
X_bully = bully_features.drop(columns='ALC_050')
y_bully = y_replace(bully_features['ALC_050'])

# Tobacco use
tb_features = DropNull(drugfeatures.tb_features)
X_tb = tb_features.drop(columns='ALC_050')
y_tb = y_replace(tb_features['ALC_050'])

# Cannabis use
can_features = DropNull(drugfeatures.can_features)
X_can = can_features.drop(columns='ALC_050')
y_can = y_replace(can_features['ALC_050'])

# Logistic Regression

In [12]:
# Fit and report one logistic regression per feature set, in this order
for _run in (
    (X_dv, y_dv, 'Logistic Regression\nDemographic Variables'),
    (X_bullied, y_bullied, 'Logistic Regression\nPast 30 Days: Experienced Bullying'),
    (X_bully, y_bully, 'Logistic Regression\nPast 30 Days: Bullied Another Student'),
    (X_tb, y_tb, 'Logistic Regression\nPast 30 Days: Used Tobacco'),
    (X_can, y_can, 'Logistic Regression\nPast 30 Days: Used Cannabis'),
):
    logreg(*_run)

Logistic Regression
Demographic Variables - Accuracy: 0.738347718865598

Classification Report:
               precision    recall  f1-score   support

           1       0.74      1.00      0.85      2994
           2       1.00      0.00      0.00      1061

    accuracy                           0.74      4055
   macro avg       0.87      0.50      0.42      4055
weighted avg       0.81      0.74      0.63      4055

Logistic Regression
Demographic Variables - Confusion Matrix:

[1 2]
[[2994    0]
 [1061    0]] 

_______________________________________________________________________

Logistic Regression
Past 30 Days: Experienced Bullying - Accuracy: 0.7569955817378498

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       1.00      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.88      0.50      0.43      4753
weighted avg       0.8

# Decision Tree Classifier

In [16]:
# Fit and report one decision tree per feature set, in this order
for _run in (
    (X_dv, y_dv, 'Decision Tree\nDemographic Variables'),
    (X_bullied, y_bullied, 'Decision Tree\nPast 30 Days: Experienced Bullying'),
    (X_bully, y_bully, 'Decision Tree\nPast 30 Days: Bullied Another Student'),
    (X_tb, y_tb, 'Decision Tree\nPast 30 Days: Used Tobacco'),
    (X_can, y_can, 'Decision Tree\nPast 30 Days: Used Cannabis'),
):
    decisiontree(*_run)

Decision Tree
Demographic Variables - Accuracy: 0.7230579531442664

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.96      0.84      2994
           2       0.33      0.06      0.10      1061

    accuracy                           0.72      4055
   macro avg       0.54      0.51      0.47      4055
weighted avg       0.63      0.72      0.64      4055

Decision Tree
Demographic Variables - Confusion Matrix:

[1 2]
[[2872  122]
 [1001   60]] 

_______________________________________________________________________

Decision Tree
Past 30 Days: Experienced Bullying - Accuracy: 0.7563644014306754

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       0.20      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.48      0.50      0.43      4753
weighted avg       0.62      0.76      

# Principal Components

In [19]:
def _pca_loadings(X):
    """Fit a PCA on ``X`` and return its loadings as a DataFrame.

    Rows are indexed by the columns of ``X``; columns are named 'V1', 'V2', ...
    with one column per fitted component.
    """
    components = PCA().fit(X).components_
    # Derive the labels from the number of fitted components instead of
    # hard-coding them, so a feature set can gain or lose columns without
    # this cell raising a length-mismatch error.
    names = [f'V{i}' for i in range(1, components.shape[0] + 1)]
    return pd.DataFrame(components.T, index=X.columns, columns=names)

# One loadings table per feature set
dv_pca_loadings = _pca_loadings(X_dv)
bullied_pca_loadings = _pca_loadings(X_bullied)
bully_pca_loadings = _pca_loadings(X_bully)
tb_pca_loadings = _pca_loadings(X_tb)
can_pca_loadings = _pca_loadings(X_can)

In [21]:
pca = PCA()

def _pca_scores(estimator, X):
    """Refit ``estimator`` on ``X`` and return the projected rows as a DataFrame.

    The result keeps the index of ``X``; columns are named 'PC1', 'PC2', ...
    with one column per component returned by ``fit_transform``.
    """
    scores = estimator.fit_transform(X)
    # Derive the labels from the width of the projection instead of
    # hard-coding them, so a feature set can gain or lose columns without
    # this cell raising a length-mismatch error.
    names = [f'PC{i}' for i in range(1, scores.shape[1] + 1)]
    return pd.DataFrame(scores, columns=names, index=X.index)

# One table of principal-component scores per feature set.
# The same `pca` object is refit on each call, so after this cell it holds
# the fit for the last feature set (X_can).
dv_df_plot = _pca_scores(pca, X_dv)
bullied_df_plot = _pca_scores(pca, X_bullied)
bully_df_plot = _pca_scores(pca, X_bully)
tb_df_plot = _pca_scores(pca, X_tb)
can_df_plot = _pca_scores(pca, X_can)

In [23]:
# Single-column predictor tables holding only the first principal component
# of each feature set (kept as DataFrames, not Series)
dv_Xlog = dv_df_plot.loc[:, ['PC1']]
bullied_Xlog = bullied_df_plot.loc[:, ['PC1']]
bully_Xlog = bully_df_plot.loc[:, ['PC1']]
tb_Xlog = tb_df_plot.loc[:, ['PC1']]
can_Xlog = can_df_plot.loc[:, ['PC1']]

# Logistic Regression trained on Principal Component 1

In [26]:
# Logistic regression on the first principal component of each feature set
for _run in (
    (dv_Xlog, y_dv, 'Demographic Variables'),
    (bullied_Xlog, y_bullied, 'Past 30 Days: Experienced Bullying'),
    (bully_Xlog, y_bully, 'Past 30 Days: Bullied Another Student'),
    (tb_Xlog, y_tb, 'Past 30 Days: Used Tobacco'),
    (can_Xlog, y_can, 'Past 30 Days: Used Cannabis'),
):
    logreg(*_run)

Demographic Variables - Accuracy: 0.738347718865598

Classification Report:
               precision    recall  f1-score   support

           1       0.74      1.00      0.85      2994
           2       1.00      0.00      0.00      1061

    accuracy                           0.74      4055
   macro avg       0.87      0.50      0.42      4055
weighted avg       0.81      0.74      0.63      4055

Demographic Variables - Confusion Matrix:

[1 2]
[[2994    0]
 [1061    0]] 

_______________________________________________________________________

Past 30 Days: Experienced Bullying - Accuracy: 0.7569955817378498

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       1.00      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.88      0.50      0.43      4753
weighted avg       0.82      0.76      0.65      4753

Past 30 Days: Experienced B

# Decision Tree trained on Principal Component 1

In [29]:
# Decision tree on the first principal component of each feature set
for _run in (
    (dv_Xlog, y_dv, 'Demographic Variables'),
    (bullied_Xlog, y_bullied, 'Past 30 Days: Experienced Bullying'),
    (bully_Xlog, y_bully, 'Past 30 Days: Bullied Another Student'),
    (tb_Xlog, y_tb, 'Past 30 Days: Used Tobacco'),
    (can_Xlog, y_can, 'Past 30 Days: Used Cannabis'),
):
    decisiontree(*_run)

Demographic Variables - Accuracy: 0.7250308261405672

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.96      0.84      2994
           2       0.35      0.06      0.10      1061

    accuracy                           0.73      4055
   macro avg       0.54      0.51      0.47      4055
weighted avg       0.64      0.73      0.64      4055

Demographic Variables - Confusion Matrix:

[1 2]
[[2879  115]
 [1000   61]] 

_______________________________________________________________________

Past 30 Days: Experienced Bullying - Accuracy: 0.7563644014306754

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       0.20      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.48      0.50      0.43      4753
weighted avg       0.62      0.76      0.65      4753

Past 30 Days: Experienced 

# All previous models, adjusted binary classes

In [32]:
# Start again from the cleaned, not-yet-recoded 'ALC_050' column of each feature set
y_dv_adj = dv_features.loc[:, 'ALC_050']
y_bullied_adj = bullied_features.loc[:, 'ALC_050']
y_bully_adj = bully_features.loc[:, 'ALC_050']
y_tb_adj = tb_features.loc[:, 'ALC_050']
y_can_adj = can_features.loc[:, 'ALC_050']

In [34]:
# Adjust binary classes
# 1 = Low risk (never, not in the last 12 months, less than once a month, once a month)
# 2 = High risk (more than once a month)
def y_adjust(y):
    """Collapse the response codes in ``y`` into two classes (adjusted cut-off).

    Codes 2, 3 and 4 are merged into class 1 (code 1 is already class 1) and
    codes 5 through 8 are merged into class 2. Any other value is returned
    unchanged. A new object is returned; ``y`` itself is not modified.
    """
    # (old code, new class) pairs, applied one after another in this order so
    # that codes turned into class 2 are never re-mapped by the "2 -> 1" step.
    recode_steps = ((2, 1), (3, 1), (4, 1), (5, 2), (6, 2), (7, 2), (8, 2))
    recoded = y
    for old_code, new_class in recode_steps:
        recoded = recoded.replace(old_code, new_class)
    return recoded

In [36]:
# Recode each target straight from the cleaned source column rather than from
# the previous value of the same variable. y_adjust maps code 2 to class 1, so
# feeding an already-recoded series back in (e.g. by re-running this cell)
# would silently turn every "high risk" label (2) into "low risk" (1).
y_dv_adj = y_adjust(dv_features['ALC_050'])
y_bullied_adj = y_adjust(bullied_features['ALC_050'])
y_bully_adj = y_adjust(bully_features['ALC_050'])
y_tb_adj = y_adjust(tb_features['ALC_050'])
y_can_adj = y_adjust(can_features['ALC_050'])

In [38]:
# Re-run all four experiments against the adjusted binary target.
# Each feature set: (title, full predictors, first principal component, adjusted target)
_adjusted_sets = [
    ('Demographic Variables', X_dv, dv_Xlog, y_dv_adj),
    ('Past 30 Days: Experienced Bullying', X_bullied, bullied_Xlog, y_bullied_adj),
    ('Past 30 Days: Bullied Another Student', X_bully, bully_Xlog, y_bully_adj),
    ('Past 30 Days: Used Tobacco', X_tb, tb_Xlog, y_tb_adj),
    ('Past 30 Days: Used Cannabis', X_can, can_Xlog, y_can_adj),
]

# Each experiment: (training function, label prefix, train on PC1 only?)
# The PC1 runs get their own prefix so their printed reports can be told apart
# from each other and from the full-feature runs.
_experiments = [
    (logreg, 'Logistic Regression', False),
    (decisiontree, 'Decision Tree', False),
    (logreg, 'Logistic Regression (PC1)', True),
    (decisiontree, 'Decision Tree (PC1)', True),
]

# Same execution order as before: one experiment at a time, every feature set within it
for _train_and_report, _model_name, _use_pc1 in _experiments:
    for _set_name, _X_full, _X_pc1, _y_adj in _adjusted_sets:
        _train_and_report(_X_pc1 if _use_pc1 else _X_full, _y_adj, _model_name + '\n' + _set_name)

Logistic Regression
Demographic Variables - Accuracy: 0.8345252774352651

Classification Report:
               precision    recall  f1-score   support

           1       0.83      1.00      0.91      3384
           2       1.00      0.00      0.00       671

    accuracy                           0.83      4055
   macro avg       0.92      0.50      0.45      4055
weighted avg       0.86      0.83      0.76      4055

Logistic Regression
Demographic Variables - Confusion Matrix:

[1 2]
[[3384    0]
 [ 671    0]] 

_______________________________________________________________________

Logistic Regression
Past 30 Days: Experienced Bullying - Accuracy: 0.8523038081211867

Classification Report:
               precision    recall  f1-score   support

           1       0.85      1.00      0.92      4051
           2       1.00      0.00      0.00       702

    accuracy                           0.85      4753
   macro avg       0.93      0.50      0.46      4753
weighted avg       0.