In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
import matplotlib as mpl
import matplotlib.pyplot as plt
import drugfeatures

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def DropNull(features, missing_codes=(99, 96, 9)):
    """Drop every row that holds a missing-value code or an existing null.

    Parameters
    ----------
    features : pandas.DataFrame
        Table to clean. It is not modified; a filtered copy is returned.
    missing_codes : iterable, default (99, 96, 9)
        Sentinel values treated as "no answer". They are matched in every
        column, so a column in which one of these numbers is a legitimate
        value needs a narrower list.

    Returns
    -------
    pandas.DataFrame
        The rows of ``features`` that contain neither a missing code nor a
        null, keeping the original index labels and column dtypes.
    """
    # Filter with a boolean mask instead of writing pd.NA into the frame:
    # substituting pd.NA turns integer columns into object dtype, which every
    # later numeric step (replace, scale, model fitting) has to convert back.
    has_missing_code = features.isin(list(missing_codes)).any(axis=1)
    # dropna still removes rows that already contained a null on input.
    return features.loc[~has_missing_code].dropna(axis=0)

def y_replace(y):
    """Collapse the response codes in ``y`` into two classes.

    Codes 2 and 3 are rewritten to 1, then codes 4 through 8 are rewritten to
    2. Any other value passes through unchanged. The input is not modified.
    """
    # Order matters: the low codes are folded into 1 before any 2 is created,
    # so the 2s produced by the later steps are never re-mapped to 1.
    recoding_steps = ((2, 1), (3, 1), (4, 2), (5, 2), (6, 2), (7, 2), (8, 2))
    recoded = y
    for old_code, new_code in recoding_steps:
        recoded = recoded.replace(old_code, new_code)
    return recoded

def LogReg(X,y,label):
    """Fit a logistic regression on an 80/20 split and print its test metrics.

    Parameters
    ----------
    X : array-like or DataFrame
        Predictor matrix, one row per observation.
    y : array-like or Series
        Class labels aligned with the rows of ``X``.
    label : str
        Name of the feature set; used as a prefix in the printed output.

    Returns
    -------
    None
        Accuracy, the classification report and the confusion matrix for the
        held-out 20% are printed; nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardise with statistics learned from the training rows only, then
    # apply the same transform to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(random_state=42, max_iter=200)
    # Fit and predict on the *scaled* matrices. Previously the scaled copies
    # were computed but the model was trained and scored on the raw split.
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(label, "- Accuracy:", accuracy_score(y_test, y_pred))
    # zero_division=1 makes the report show precision 1.00 for a class that is
    # never predicted; read that row together with its recall.
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

    # Rows are true classes and columns are predicted classes, both ordered
    # as in model.classes_ (printed just above the matrix).
    cm = confusion_matrix(y_test, y_pred, labels = model.classes_)
    print(label, '- Confusion Matrix:\n')
    print(model.classes_)
    print(cm, '\n\n_______________________________________________________________________\n')

In [2]:
# Bind the five feature tables exposed by the drugfeatures module to local names.
(
    dv_features,
    bullied_features,
    bully_features,
    tb_features,
    can_features,
) = (
    drugfeatures.dv_features,
    drugfeatures.bullied_features,
    drugfeatures.bully_features,
    drugfeatures.tb_features,
    drugfeatures.can_features,
)

In [3]:
# Run every feature table through DropNull, giving one cleaned frame per feature set.
dv_df, bullied_df, bully_df, tb_df, can_df = (
    DropNull(raw_table)
    for raw_table in (
        dv_features,
        bullied_features,
        bully_features,
        tb_features,
        can_features,
    )
)

In [4]:
# For each feature set build a standardised predictor frame (X_*) and a
# recoded target series (y_*), e.g. X_dv / y_dv for the demographic variables
# and X_bullied / y_bullied for the experienced-bullying features.
def _split_scale_recode(frame):
    """Return (scaled predictors, recoded 'ALC_050' target) for one cleaned frame."""
    # The target column 'ALC_050' must not be part of the matrix fed to PCA.
    predictors = frame.drop('ALC_050', axis=1)
    # scale() returns a bare array; wrap it so row labels and column names survive.
    scaled = pd.DataFrame(scale(predictors), index=predictors.index, columns=predictors.columns)
    # Taking y from the same cleaned frame keeps X and y the same length;
    # pairing X with a y from another feature set raises a mismatched-size error.
    return scaled, y_replace(frame['ALC_050'])


X_dv, y_dv = _split_scale_recode(dv_df)
X_bullied, y_bullied = _split_scale_recode(bullied_df)
X_bully, y_bully = _split_scale_recode(bully_df)
X_tb, y_tb = _split_scale_recode(tb_df)
X_can, y_can = _split_scale_recode(can_df)


  y = y.replace(2,1)


In [5]:
# PCA loadings for each feature set: one row per original feature, one column
# per principal component (V1, V2, ...).
def _pca_loadings(X):
    """Fit a fresh PCA on ``X`` and return its loadings as a DataFrame.

    Column labels are generated from the number of fitted components, so the
    table stays valid when a feature set gains or loses columns; a hand-typed
    label list of the wrong length raises a shape error instead.
    """
    components = PCA().fit(X).components_
    # components_ has one row per component; transposing puts features on the rows.
    labels = [f'V{k}' for k in range(1, components.shape[0] + 1)]
    return pd.DataFrame(components.T, index=X.columns, columns=labels)


dv_pca_loadings = _pca_loadings(X_dv)
bullied_pca_loadings = _pca_loadings(X_bullied)
bully_pca_loadings = _pca_loadings(X_bully)
tb_pca_loadings = _pca_loadings(X_tb)
can_pca_loadings = _pca_loadings(X_can)

In [6]:
pca = PCA()

# Principal-component scores for each feature set (one *_df_plot frame per set).
def _pca_scores(X):
    """Refit the shared ``pca`` on ``X`` and return the scores as a DataFrame.

    Columns are named PC1..PCn from the width of the score matrix rather than
    from a hand-typed list, so the call keeps working when the number of
    features in a set changes. Rows keep the index of ``X``.
    """
    scores = pca.fit_transform(X)
    labels = [f'PC{k}' for k in range(1, scores.shape[1] + 1)]
    return pd.DataFrame(scores, columns=labels, index=X.index)


# `pca` is refit on every call, so after this cell it holds the fit for X_can.
dv_df_plot = _pca_scores(X_dv)
bullied_df_plot = _pca_scores(X_bullied)
bully_df_plot = _pca_scores(X_bully)
tb_df_plot = _pca_scores(X_tb)
can_df_plot = _pca_scores(X_can)

In [7]:
# Keep only the first principal component of each feature set. The double
# brackets return a one-column DataFrame, which is what gets passed to LogReg
# as the predictor matrix.
dv_Xlog, bullied_Xlog, bully_Xlog, tb_Xlog, can_Xlog = (
    component_scores[['PC1']]
    for component_scores in (
        dv_df_plot,
        bullied_df_plot,
        bully_df_plot,
        tb_df_plot,
        can_df_plot,
    )
)

In [8]:
# Evaluate one single-component logistic regression per feature set. Each
# entry pairs a PC1 predictor frame with the target built from the same
# cleaned table, plus the label printed in front of its metrics.
logreg_runs = (
    (dv_Xlog, y_dv, 'Demographic Variables'),
    (bullied_Xlog, y_bullied, 'Experienced Bullying'),
    (bully_Xlog, y_bully, 'bullied another students'),
    (tb_Xlog, y_tb, 'used tabaco'),
    (can_Xlog, y_can, 'used cannabis'),
)
for pc1_frame, target, run_label in logreg_runs:
    LogReg(pc1_frame, target, run_label)

Demographic Variables - Accuracy: 0.738347718865598

Classification Report:
               precision    recall  f1-score   support

           1       0.74      1.00      0.85      2994
           2       1.00      0.00      0.00      1061

    accuracy                           0.74      4055
   macro avg       0.87      0.50      0.42      4055
weighted avg       0.81      0.74      0.63      4055

Demographic Variables - Confusion Matrix:

[1 2]
[[2994    0]
 [1061    0]] 

_______________________________________________________________________

Experienced Bullying - Accuracy: 0.7569955817378498

Classification Report:
               precision    recall  f1-score   support

           1       0.76      1.00      0.86      3598
           2       1.00      0.00      0.00      1155

    accuracy                           0.76      4753
   macro avg       0.88      0.50      0.43      4753
weighted avg       0.82      0.76      0.65      4753

Experienced Bullying - Confusion Matrix:


# Interpretation
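Using only the first principal component as the predictor, the logistic regression reaches the same accuracy as the model fit on the 7 original features, which suggests that, in this case, PCA was able to compress the variables without costing any predictive accuracy. Equal accuracy does not by itself show that the same amount of variance was retained, however: the outputs above show that the model predicts class 1 for every test row (recall for class 2 is 0.00), so the reported accuracy is simply the share of class 1 in the test set (e.g. 2994 / 4055 ≈ 0.74 for the demographic variables). The variance retained by PC1 should be read from the PCA's explained variance ratio instead.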