In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

# new:
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch import optim

In [9]:
# TAKEN DIRECTLY FROM COMBINED SCRIPT:
# USE 1) df_most_recent_wave_per_mergeid OR 2) df_relevant

easyshare = pd.read_stata('data/sharewX_rel8-0-0_easySHARE_stata/easySHARE_rel8-0-0.dta')

illness_before = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/illness_before_module_v01.dta")
illness_during = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/illness_during_module_v01.dta")
job = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/job_module_v01.dta")
life = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/life_module_v01.dta")
young_age = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/young_age_module_v01.dta")
#yearly = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/yearly_module_v01.dta")
# individual_year = pd.read_stata(\"data/SHARE-ENV - Exposure to Environmental Hazards/individual_year_panel_v01.dta\")  NB times out, I ran it for 2+hrs,
#merged.to_pickle(\"data/df_merged.pkl\"

df = pd.merge(easyshare, life, on=['mergeid', 'wave'], how='left')
df = pd.merge(df, job, on=['mergeid'], how='left')

df_sorted = df.sort_values(by=['mergeid', 'wave'], ascending=[True, False])
df_most_recent_wave_per_mergeid = df_sorted.drop_duplicates(subset='mergeid', keep='first')

df_most_recent_wave_per_mergeid # this is the full dataset, we should be trying to run models on this default

df_relevant = df_most_recent_wave_per_mergeid[df_most_recent_wave_per_mergeid.columns.drop(list(df.filter(regex='^euro')))]
df_relevant = df_relevant[df_relevant.columns.drop(list(df_relevant.filter(regex='^dn')))]
non_predictive_vars = [
    'mergeid',    # Used for merging records, no predictive power
    'hhid',       # Household identifier for tracking or grouping data
    'coupleid',   # Links records of individuals within a household
    'int_version',# Version of the questionnaire or interview format
    'int_year',   # Year the interview was conducted, structural rather than predictive
    'int_month',  # Month the interview was conducted, similar to int_year
    'country',    # Country code, used for stratification or adjustments
    'country_mod', # Modified country code, typically for data manipulation
    'wavepart'   # Wave part, used for stratification or adjustments
]
df_relevant = df_relevant[df_relevant.columns.drop(non_predictive_vars)]

def replace_dash_with_na(df_relevant):
    for column in df_relevant.columns:
        if df_relevant[column].dtype == 'category':
            # Replace entries containing '-' with NA
            df_relevant[column] = df[column].apply(lambda x: pd.NA if '-' in str(x) else x)
    return df_relevant

df_relevant = replace_dash_with_na(df_relevant)

### From here is new

In [10]:
X = df_relevant.drop('sphus', axis=1) 
y = df_relevant['sphus']  #

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
train_ratio = 0.70
validation_ratio = 0.15
test_ratio = 0.15

# train is now 70% of the entire data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio, random_state = 10117)

# test is now 15% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = test_ratio / (test_ratio + validation_ratio), random_state = 10117)

n_train = X_train.shape[0]
n_val = X_val.shape[0]
n_test = X_test.shape[0]
n = n_train + n_val + n_test

print(n_train / n, n_val / n, n_test / n) # why only 3 number????????????

0.6999964317573595 0.15000178412132026 0.15000178412132026


In [12]:
# training it: first need to remove all NaNs / impute

if np.isnan(X_train).any() or np.isnan(y_train).any():
    print("NaNs in train data")
if np.isinf(X_train).any() or np.isinf(y_train).any():
    print("Infs in train data")

TypeError: Object with dtype category cannot perform the numpy op isnan

In [13]:
# training it: first need to remove all NaNs / impute

# Check for NaNs in the training data
if X_train.isna().any().any() or y_train.isna().any().any():
    print("NaNs in train data")

# Check for infinities in the training data
if (X_train == np.inf).any().any() or (y_train == np.inf).any().any():
    print("Infinities in train data")

NaNs in train data


In [23]:
# WHEN WE HAVE LINO'S 1 HOT ENCODING DATASET USE THT HERE INSTEAD OF THIS

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Identify numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Create transformers for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Use OneHotEncoder for categorical data
])

# Combine transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
# Transform the validation data
X_val_preprocessed = preprocessor.transform(X_val)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

In [22]:
# basic imputation / scaling
#
#imputer = SimpleImputer(strategy='mean')
#X_train_imputed = imputer.fit_transform(X_train)
#X_val_imputed = imputer.transform(X_val)

#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train_imputed)
#X_val_scaled = scaler.transform(X_val_imputed)

# convert numpy arrays -> PyTorch tensors



X_train_torch = torch.tensor(X_train_preprocessed.astype(np.float32))
y_train_torch = torch.tensor(y_train.astype(np.float32)).unsqueeze(1)
X_val_torch = torch.tensor(X_val_preprocessed.astype(np.float32))
y_val_torch = torch.tensor(y_val.astype(np.float32)).unsqueeze(1)


# DataLoader instances
train_dataset = TensorDataset(X_train_torch, y_train_torch) # makes them into joined Tensor
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # converts into a data loader, with 64 samples in a batch

val_dataset = TensorDataset(X_val_torch, y_val_torch)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


ValueError: could not convert string to float: '12. German (de)'

In [None]:
# a training function:

def train_one_epoch(dataloader, model, loss_fn, optimizer):
    model.train()  
    total_loss = 0.0
    num_samples = 0

    for inputs, targets in dataloader:

        # forward pass: compute model output
        predictions = model(inputs)
        
        # compute loss
        loss = loss_fn(predictions, targets)

        # backward pass: compute  gradient of  loss wrt to model params
        optimizer.zero_grad()  # zero prev gradients
        loss.backward()  # backpropagation
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # for clipping gradients - needed later

        # update the model parameters - NB: calling the step function on an Optimizer makes an update to its params
        optimizer.step()

        # update total loss and the batch count
        total_loss += loss.item() * inputs.size(0) # this will depend on defined batchsize in dataloader?
        num_samples +=inputs.size(0) 

    average_loss = total_loss / num_samples
    print(f"Average loss: {average_loss:.4f}") # prints running avg
    return average_loss

In [None]:
# a function to evaluate error on entire val / test set

def evaluate_model(dataloader, model, loss_fn):
    model.eval()  
    total_loss = 0.0
    num_samples = 0

    with torch.no_grad():  # disable gradient computation during evaluation
        for inputs, targets in dataloader:

            # forward pass: 
            predictions = model(inputs)
            
            # loss
            loss = loss_fn(predictions, targets)

            # aggregate the loss
            total_loss += loss.item() * inputs.size(0)  
            num_samples += inputs.size(0)
            
            # NB THE DIFFERENCE HERE IS NOT LEARNING NEW PARAMS IN EVAL MODE

    average_loss = total_loss / num_samples  
    print(f"Average loss over evaluation data: {average_loss:.4f}")

    return average_loss

In [None]:
# defining basic NN model

class NeuralNetwork(nn.Module):
    def __init__(self, input_features, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.hidden = nn.Linear(input_features, hidden_size)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.hidden(x)
        x = self.relu(x)
        x = self.output(x)
        return x

In [None]:

#  training model w/ diff sized hidden layers

def run_training2(train_loader, val_loader, epochs, hidden_size):
    
    model = NeuralNetwork(input_features=X_train_torch.shape[1], hidden_size=hidden_size) # added this inside of the function no we want to try diff values - need to pass hidden_size now it has multiple layers
    loss_fn = nn.MSELoss() # set these as fixed params
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # set these as fixed params
    
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        train_loss = train_one_epoch(train_loader, model, loss_fn, optimizer)
        val_loss = evaluate_model(val_loader, model, loss_fn)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    return train_losses, val_losses
        

# 10 hidden nodes
print("Training with 10 hidden nodes:")
train_losses_10, val_losses_10 = run_training2(train_loader=train_loader, 
                                             val_loader=val_loader,
                                             epochs = 20,
                                             hidden_size=10)

# 1000 hidden nodes
print("Training with 1000 hidden nodes:")
train_losses_1000, val_losses_1000 = run_training2(train_loader=train_loader, 
                                             val_loader=val_loader,
                                             epochs = 20,
                                             hidden_size=1000)

# plot
plt.figure(figsize=(12, 6))
plt.plot(train_losses_10, label='Training Loss (10 Nodes)')
plt.plot(val_losses_10, label='Validation Loss (10 Nodes)')
plt.plot(train_losses_1000, label='Training Loss (1000 Nodes)')
plt.plot(val_losses_1000, label='Validation Loss (1000 Nodes)')
plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# training IN THIS CASE A LINEAR MODEL using Adam


# step 1: get data ready SEE ABOVE


# Step 2: Initialize the Model, Loss Function, and Optimizer

model = SimpleLinearRegression(input_features=X_train_torch.shape[1])
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 


# Step 3: Define Training Functions

def run_training(model, train_loader, val_loader, loss_fn, optimizer, epochs):
    train_losses = []
    val_losses = []
      
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) # decreasing step sizes as had issue with NaNs
    
    for epoch in range(epochs):
        train_loss = train_one_epoch(train_loader, model, loss_fn, optimizer)
        val_loss = evaluate_model(val_loader, model, loss_fn)
        #scheduler.step()  # decreasing step sizes as had issue with NaNs
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    return train_losses, val_losses


# Step 4: Training the Model and Collecting Errors

num_epochs = 500  # NB aritrary
train_losses, val_losses = run_training(model, train_loader, val_loader, loss_fn, optimizer, num_epochs)


# Step 5: Plotting  Learning Curves
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.axhline(y=mse_val, color='r', linestyle='--', label='Baseline Error (Validation MSE)')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Learning Curves')
plt.legend()
plt.grid(True)
plt.show()
