In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Load the raw survey responses; every feature group below is selected from this frame
df = pd.read_csv('drug.csv')

def DropNull(features, missing_codes=(9, 96, 99)):
    """Return only the rows of ``features`` that hold no non-response code.

    A row is removed when any of its values equals one of ``missing_codes``
    or is already null.  The input object is not modified.

    Rows are filtered with a boolean mask instead of first replacing the
    codes with ``pd.NA``: that replacement converts integer columns to
    ``object`` dtype, and later ``replace`` calls on such columns trigger
    pandas' silent-downcasting ``FutureWarning``.  Masking keeps the original
    column dtypes intact.

    NOTE(review): the codes are matched in *every* column, so a column in
    which 9, 96 or 99 is a legitimate answer loses those rows too -- confirm
    against the survey codebook.

    Parameters
    ----------
    features : pandas.DataFrame (a Series is also accepted)
        Data to filter.
    missing_codes : iterable, default (9, 96, 99)
        Values that mark a response as unusable.

    Returns
    -------
    Same type as ``features``
        The surviving rows, with their original index labels.
    """
    is_missing_code = features.isin(list(missing_codes))
    if is_missing_code.ndim == 2:
        # DataFrame: one flagged cell is enough to discard the whole row
        is_missing_code = is_missing_code.any(axis=1)
    return features.loc[~is_missing_code].dropna(axis=0)

def y_replace(y):
    """Collapse the response codes in ``y`` into two classes.

    Codes 2 and 3 become 1; codes 4 through 8 become 2.  Any other value is
    left as it is.  A new object is returned; ``y`` itself is not modified.
    """
    # (old code, new code) pairs, applied one after another in this order
    recode_steps = (
        (2, 1), (3, 1),
        (4, 2), (5, 2), (6, 2), (7, 2), (8, 2),
    )
    recoded = y
    for old_code, new_code in recode_steps:
        recoded = recoded.replace(old_code, new_code)
    return recoded

def DecisionTree(X, y, label, random_state=42):
    """Fit a decision tree on an 80/20 split and print its test-set metrics.

    Prints the accuracy, the classification report, the class labels and the
    confusion matrix for the held-out 20% of rows.  Nothing is returned.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of ``X``.
    label : str
        Title printed in front of the accuracy and confusion-matrix output.
    random_state : int, default 42
        Seed used for both the train/test split and the tree itself.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Seed the tree as well as the split: scikit-learn permutes the candidate
    # features at every split, so an unseeded tree can differ between runs on
    # identical data and the printed metrics would not be reproducible.
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(label, "- Accuracy:", accuracy_score(y_test, y_pred))
    # zero_division=1 makes an undefined precision/recall print as 1.0
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

    # Rows/columns follow model.classes_, which is printed above the matrix
    cm = confusion_matrix(y_test, y_pred, labels = model.classes_)
    print(label, '- Confusion Matrix:\n')
    print(model.classes_)
    print(cm, '\n\n_______________________________________________________________________\n')

# Outcome column shared by every feature group.
# ALC_050: 'In the last 12 months, how often did you have 5 or more drinks of alcohol on one occasion?'
TARGET = 'ALC_050'

# Features based on demographic qualities of respondents
dv_columns = ['PROVID', 'DVGENDER', 'DVURBAN', 'DVRES',
              'DVORIENT', 'DVDESCRIBE', 'GH_020']

# Features regarding experiencing bullying in the past 30 days
bullied_columns = ['BUL_010', 'BUL_020', 'BUL_030',
                   'BUL_040', 'BUL_050', 'BUL_060']

# Features regarding perpetrating bullying in the past 30 days
bully_columns = ['BUL_070', 'BUL_080', 'BUL_090',
                 'BUL_100', 'BUL_110', 'BUL_120']

# Features regarding tobacco use in the past 30 days
tb_columns = ['TP_001', 'TP_016', 'TP_046', 'TP_056', 'TP_066', 'TP_086',
              'ELC_026a', 'ELC_026b', 'ELC_026c']

# Features regarding cannabis use in the past 30 days
can_columns = ['CAN_010', 'CAN_020', 'CAN_030', 'CAN_040', 'CAN_050',
               'CAN_060', 'CAN_070', 'CAN_080', 'CAN_091', 'CAN_092',
               'CAN_100', 'CAN_110', 'CAN_121', 'CAN_130', 'CAN_140']

# Select each group together with the target, then drop responses coded as
# 'Not Stated', 'Valid Skip' or 'I dont know'
dv_features = DropNull(df[dv_columns + [TARGET]])
bullied_features = DropNull(df[bullied_columns + [TARGET]])
bully_features = DropNull(df[bully_columns + [TARGET]])
tb_features = DropNull(df[tb_columns + [TARGET]])
can_features = DropNull(df[can_columns + [TARGET]])

# Split every cleaned group into predictors (X_*) and the binarised target (y_*).
# y_replace turns codes 1-3 into 1 (low risk drinking: less than once a month)
# and codes 4-8 into 2 (high risk drinking: once a month or more); code 9 never
# gets this far because DropNull removes it as a non-response code.
X_dv = dv_features.drop(columns=TARGET)
y_dv = y_replace(dv_features[TARGET])

X_bullied = bullied_features.drop(columns=TARGET)
y_bullied = y_replace(bullied_features[TARGET])

X_bully = bully_features.drop(columns=TARGET)
y_bully = y_replace(bully_features[TARGET])

X_tb = tb_features.drop(columns=TARGET)
y_tb = y_replace(tb_features[TARGET])

X_can = can_features.drop(columns=TARGET)
y_can = y_replace(can_features[TARGET])

  y = y.replace(2,1)


In [2]:
# One decision tree per feature group, fitted and reported in this order
tree_runs = [
    (X_dv, y_dv, 'Demographic Variables'),
    (X_bullied, y_bullied, 'Past 30 Days: Experienced Bullying'),
    (X_bully, y_bully, 'Past 30 Days: Bullied Another Student'),
    (X_tb, y_tb, 'Past 30 Days: Used Tobacco'),
    (X_can, y_can, 'Past 30 Days: Used Cannabis'),
]
for group_X, group_y, group_label in tree_runs:
    DecisionTree(group_X, group_y, group_label)

Demographic Variables - Accuracy: 0.7240443896424168

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.96      0.84      2994
           2       0.34      0.06      0.10      1061

    accuracy                           0.72      4055
   macro avg       0.54      0.51      0.47      4055
weighted avg       0.64      0.72      0.64      4055

Demographic Variables - Confusion Matrix:

[1 2]
[[2877  117]
 [1002   59]] 

_______________________________________________________________________

Past 30 Days: Experienced Bullying - Accuracy: 0.7563644014306754

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       0.20      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.48      0.50      0.43      4753
weighted avg       0.62      0.76      0.65      4753

Past 30 Days: Experienced 