In [None]:
#import necessary libraries
import io

import numpy as np
import pandas as pd
from scipy.io import loadmat
import requests

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

# Plotting style: seaborn's dark-grid theme at the default font scale.
# A single set_theme call configures both the style and the font scaling.
sns.set_theme(style='darkgrid', font_scale=1.0)

In [None]:
# Retrieve the dataset and its attribute descriptions from the web, then inspect.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data'
attr_url = 'https://raw.githubusercontent.com/joshinsky/machine_learning_group35/main/Attribute%20descriptions.txt'

# Download the attribute description file exactly once and reuse its text for
# both the column names and the role parsing below, so the two can never come
# from different responses. raise_for_status() prevents an HTTP error page from
# being parsed as if it were the attribute table, and the timeout keeps the
# cell from hanging indefinitely on a stalled connection.
attr_response = requests.get(attr_url, timeout=30)
attr_response.raise_for_status()
attr_text = attr_response.text

# Read attribute names: the first tab-separated field of every line.
attribute_names = pd.read_csv(
    io.StringIO(attr_text), sep='\t', header=None, usecols=[0]
)[0].tolist()

# Load dataset using attribute names (the data file itself has no header row).
df = pd.read_csv(url, header=None, names=attribute_names)

# Parse attribute descriptions → roles. The second tab-separated field decides
# the role: anything starting with "feature" is a predictor, anything starting
# with "target" is a drug target; every other role is ignored.
feature_names, drug_targets = [], []
lines = attr_text.splitlines()
for line in lines:
    parts = line.strip().split('\t')
    if len(parts) < 2:
        # No role field on this line (blank or name-only) → nothing to classify.
        continue
    name = parts[0].strip()
    role = parts[1].strip().lower()
    if role.startswith('feature'):
        feature_names.append(name)
    elif role.startswith('target'):
        drug_targets.append(name)

print(df.head())
print('Shape:', df.shape)
print('Detected targets:', drug_targets)


In [None]:
# Preview the raw attribute description file: show its first 1000 characters.
# `requests` is already imported in the notebook's import cell.
attr_preview = requests.get(attr_url, timeout=30)
attr_preview.raise_for_status()  # fail loudly instead of printing an error page
print(attr_preview.text[:1000])


In [None]:
# Define X and y using the column lists parsed from the attribute file.
X = df[feature_names].copy()  # every column whose role starts with "feature"
y = df[drug_targets].copy()   # every column whose role starts with "target" (the drugs)

# Binarize y: CL0–CL2 => 0 (non-user), every other value => 1 (user).
# Vectorised isin() replaces the deprecated per-cell DataFrame.applymap while
# producing the same int64 0/1 frame (including for missing/unexpected values,
# which are not in the non-user list and therefore become 1).
non_user_classes = ['CL0', 'CL1', 'CL2']
y_binarized = (~y.isin(non_user_classes)).astype('int64')

print('X columns:', list(X.columns))
print(X.head())
print('y (binarized) columns:', list(y_binarized.columns))
print(y_binarized.head())

In [None]:
# Translate the numerically coded categorical features into readable labels,
# then one-hot encode them.
# Mapping dictionaries: numeric code (as stored in the data file) → label.
age_map = {
    -0.95197: "18-24",
    -0.07854: "25-34",
     0.49788: "35-44",
     1.09449: "45-54",
     1.82213: "55-64",
     2.59171: "65+"
}

gender_map = {
     0.48246: "Female",
    -0.48246: "Male"
}

education_map = {
    -2.43591: "Left<16",
    -1.73790: "Left16",
    -1.43719: "Left17",
    -1.22751: "Left18",
    -0.61113: "SomeCollege",
    -0.05921: "ProfCert",
     0.45468: "UniDegree",
     1.16365: "Masters",
     1.98437: "Doctorate"
}

country_map = {
    -0.09765: "Australia",
     0.24923: "Canada",
    -0.46841: "NewZealand",
    -0.28519: "Other",
     0.21128: "Ireland",
     0.96082: "UK",
    -0.57009: "USA"
}

ethnicity_map = {
    -0.50212: "Asian",
    -1.10702: "Black",
     1.90725: "MixedBlackAsian",
     0.12600: "MixedWhiteAsian",
    -0.22166: "MixedWhiteBlack",
     0.11440: "Other",
    -0.31685: "White"
}

categorical_maps = {
    "age": age_map,
    "gender": gender_map,
    "education": education_map,
    "country": country_map,
    "ethnicity": ethnicity_map,
}

# Map codes → labels
for column, code_to_label in categorical_maps.items():
    if not pd.api.types.is_numeric_dtype(X[column]):
        # The column no longer holds numeric codes, i.e. this cell has already
        # been run. Mapping again would turn every label into NaN, so skip it
        # to keep the cell safe to re-run.
        continue
    # The dictionary keys have 5 decimals; rounding the parsed values to the
    # same precision makes the lookup robust to last-bit differences between
    # the CSV float parser and the Python literals above.
    labels = X[column].round(5).map(code_to_label)
    unmapped = labels.isna() & X[column].notna()
    if unmapped.any():
        # An unknown code would otherwise silently become NaN and then an
        # all-zero row in the one-hot encoding.
        raise ValueError(
            f"Unmapped codes in column '{column}': "
            f"{sorted(X.loc[unmapped, column].unique())}"
        )
    X[column] = labels

# One-hot encode → now column names use meanings
X_encoded = pd.get_dummies(X, columns=list(categorical_maps))

print(X_encoded.filter(like="age").head())  # example: see only age columns


In [None]:
# Part 1.a: linear regression

# Columns kept for the regression part: five personality scores plus the
# 'impuslive' and 'ss' columns.
# NOTE(review): 'impuslive' looks like a misspelling of "impulsive", but it has
# to match the column name coming from the attribute file — confirm there
# before changing it.
trait_columns = ['nscore', 'escore', 'oscore', 'ascore', 'cscore', 'impuslive', 'ss']
traits = X[trait_columns]
# Handy sanity checks while exploring: traits.head(), traits.std(), traits.mean()



In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

# Predictors: the five personality scores; target: the 'impuslive' column.
# Note that X and y are rebound here, replacing the feature/target frames
# defined in earlier cells.
X = traits[['nscore', 'escore', 'oscore', 'ascore', 'cscore']]
y = traits['impuslive']

# 10-fold cross-validation with a fixed shuffle seed so the folds are reproducible
seed = 42
CV_kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Candidate λ values (regularization strength): 100 log-spaced points
# from 10**-1.1 (≈ 0.079) to 10**1.8 (≈ 63.1)
lambdas = np.logspace(-1.1, 1.8, 100)

# Cross-validated MSE for every candidate λ. sklearn reports the *negated* MSE
# for this scorer, so the sign of the fold average is flipped back.
mse_scores = []
for lam in lambdas:
    ridge = Ridge(alpha=lam)
    scores = cross_val_score(ridge, X, y, cv=CV_kfold, scoring='neg_mean_squared_error')
    mse_scores.append(-scores.mean())

# Plot generalization error vs λ (log-scaled x axis)
fig, ax = plt.subplots(figsize=(7, 5))
ax.semilogx(lambdas, mse_scores, marker='o')
ax.set_xlabel('λ (regularization strength)')
ax.set_ylabel('Cross-validated MSE')
ax.set_title('Generalization error vs λ (Ridge Regression)')
ax.grid(True)
plt.show()

# Report the λ with the lowest cross-validated error
best_index = np.argmin(mse_scores)
best_lambda = lambdas[best_index]
print(f"Optimal λ: {best_lambda:.4f}")
print(f"Minimum CV MSE: {min(mse_scores):.4f}")

In [None]:
# Refit ridge regression on the full data with the λ selected by the
# cross-validation sweep in the previous cell. Using `best_lambda` directly
# (instead of a hand-copied, rounded constant) keeps this cell in sync when
# the data, the seed or the λ grid change. The original note recorded
# λ ≈ 18.7382 for the 100-point grid.
optimal_lambda = float(best_lambda)

ridge_model = Ridge(alpha=optimal_lambda).fit(X, y)

print("Intercept:", ridge_model.intercept_)
print("Coefficients:", ridge_model.coef_)

# Get coefficients and trait names
coef_table = pd.DataFrame({
    'Trait': X.columns,
    'Coefficient': ridge_model.coef_
})

# Add intercept as a separate row (optional)
coef_table.loc[len(coef_table)] = ['Intercept', ridge_model.intercept_]

# Display neatly
print(coef_table)