In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import drugfeatures

from sklearn.preprocessing import StandardScaler

In [2]:
#df = pd.read_csv('drug.csv')
#df.describe()

In [3]:
def DropNull(features):
    features = features.replace(99, pd.NA)
    features = features.replace(96, pd.NA)
    features = features.replace(9, pd.NA)
    features = features.dropna(axis=0)
    return features

def y_replace(y):
    y = y.replace(2,1)
    y = y.replace(3,1)
    y = y.replace(4,1)
    y = y.replace(5,2)
    y = y.replace(6,2)
    y = y.replace(7,2)
    y = y.replace(8,2)
    return y

def LogReg(X,y,label):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)

    print(label, "- Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

    cm = confusion_matrix(y_test, y_pred, labels = model.classes_)
    print(label, '- Confusion Matrix:\n')
    print(model.classes_)
    print(cm, '\n\n_______________________________________________________________________\n')

The National Institute on Alcohol Abuse and Alcoholism (NIAAA) defines excessive or heavy drinking for women as consuming 7 or more drinks per week, and for men as consuming 15 or more drinks per week

Any alcohol intake is not healthy while a young person's brain is still developing. However, assume that a high school student drinking 5 or more drinks (to intoxication) at least once a month is high risk alcohol intake.

In [4]:
dv_features = DropNull(drugfeatures.dv_features)
bullied_features = DropNull(drugfeatures.bullied_features)
bully_features = DropNull(drugfeatures.bully_features)
tb_features = DropNull(drugfeatures.tb_features)
can_features = DropNull(drugfeatures.can_features)

In [5]:
X_dv = dv_features.drop('ALC_050', axis=1)
y_dv = dv_features['ALC_050']
X_bullied = bullied_features.drop('ALC_050', axis=1)
y_bullied = bullied_features['ALC_050']
X_bully = bully_features.drop('ALC_050', axis=1)
y_bully = bully_features['ALC_050']
X_tb = tb_features.drop('ALC_050', axis=1)
y_tb = tb_features['ALC_050']
X_can = can_features.drop('ALC_050', axis=1)
y_can = can_features['ALC_050']

# ALC_050: 'In the last 12 months, how often did you have 5 or more drinks of alcohol on one occasion?'
# Values 1-3 represent low risk drinking (less than once a month)
# Values 4-9 represent high risk drinking (once a month or more)
y_dv = y_replace(y_dv)
y_bullied = y_replace(y_bullied)
y_bully = y_replace(y_bully)
y_tb = y_replace(y_tb)
y_can = y_replace(y_can)

  y = y.replace(2,1)


In [7]:
LogReg(X_dv,y_dv,'Demographic Variables')
LogReg(X_bullied,y_bullied,'Past 30 Days: Experienced Bullying')
LogReg(X_bully,y_bully,'Past 30 Days: Bullied Another Student')
LogReg(X_tb,y_tb,'Past 30 Days: Used Tobacco')
LogReg(X_can,y_can,'Past 30 Days: Used Cannabis')

Demographic Variables - Accuracy: 0.8345252774352651

Classification Report:
               precision    recall  f1-score   support

           1       0.83      1.00      0.91      3384
           2       1.00      0.00      0.00       671

    accuracy                           0.83      4055
   macro avg       0.92      0.50      0.45      4055
weighted avg       0.86      0.83      0.76      4055

Demographic Variables - Confusion Matrix:

[1 2]
[[3384    0]
 [ 671    0]] 

_______________________________________________________________________

Past 30 Days: Experienced Bullying - Accuracy: 0.8523038081211867

Classification Report:
               precision    recall  f1-score   support

           1       0.85      1.00      0.92      4051
           2       1.00      0.00      0.00       702

    accuracy                           0.85      4753
   macro avg       0.93      0.50      0.46      4753
weighted avg       0.87      0.85      0.78      4753

Past 30 Days: Experienced 