In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Using cached nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (318.1 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.5 xgboost-3.0.2


In [5]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting tabulate>=0.4.2 (from sklearn-crfsuite)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate, python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0 tabulate-0.9.0


In [6]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import networkx as nx
import matplotlib.pyplot as plt
from sklearn_crfsuite import CRF

In [12]:
dataset = pd.read_csv("final_merged_phq1_9.csv")
print(dataset.columns)

Index(['Unnamed: 0', 'SEQN', 'com01', 'com02', 'com03', 'com07', 'com08',
       'com09', 'com10', 'com13', 'com15', 'com16', 'demo01', 'demo02',
       'demo03', 'demo05', 'demo06', 'demo07', 'demo08', 'demo10', 'demo11',
       'demo12', 'func01', 'func02', 'func03', 'func04', 'func05', 'func06',
       'func07', 'habit01', 'habit04', 'habit05', 'habit06', 'habit08',
       'habit09', 'healthcare01', 'healthcare02', 'healthcare04', 'job01',
       'job02', 'job04', 'lab04', 'lab06', 'lab08', 'lab10', 'lab11', 'lab12',
       'lab14', 'physical01', 'physical02', 'physical03', 'physical04',
       'physical05', 'physical06', 'target01', 'target02', 'target03',
       'target04', 'target05', 'target06', 'target07', 'target08', 'target09'],
      dtype='object')


In [17]:
X = dataset.drop(columns=[f"target0{i}" for i in range(1, 10)]) 
X = X.iloc[:, 2:]
Y = dataset[[f"target0{i}" for i in range(1, 10)]]

In [21]:
X.shape

(5519, 52)

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

**predict individual question using xgboost**

In [22]:
individual_pred = {}
for target in range(1, 10):
    target = "target0" + str(target)
    
    # Define and train the model
    model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
    model.fit(X_train, Y_train[target])
        
    individual_pred[target] = model.predict(X_test)   

In [23]:
# Combine predictions into a DataFrame
individual_preds_df = pd.DataFrame(individual_pred)
print(individual_preds_df.head())

   target01  target02  target03  target04  target05  target06  target07  \
0  0.317716  0.480980  0.559162  0.135718  0.186693  0.639824  0.149314   
1  0.108467  0.351591  0.848316  0.407308  0.401271  0.357579  0.079028   
2  0.695894 -0.076630  1.910573  1.171484  2.057724  1.325833  0.004003   
3  0.361322  0.082060  0.209148  0.698716  0.497622  0.424329  0.104090   
4  0.679604  0.735004  1.433774  1.242154  0.667305  0.932744  0.831549   

   target08  target09  
0  0.213535  0.030997  
1 -0.053105  0.058031  
2  0.010430  0.048395  
3  0.193871  0.006146  
4  0.357596  0.031799  


**CRF for structured output**

In [25]:
# Convert each row into a sequence of features
X_sequences = []

for _, row in X.iterrows():
    
    # Create a sequence of 9 feature dictionaries for each instance
    sequence = []
    
    for i in range(1, 10):  # 9 labels (target01 to target09)
        feature_dict = {col: row[col] for col in X.columns}
        sequence.append(feature_dict)
    X_sequences.append(sequence)
    
    # Create Y_sequences (list of sequences of labels)
    Y_sequences = Y.apply(lambda row: [str(int(value)) for value in row], axis=1).tolist()

In [28]:
print(f"Number of feature sequences: {len(X_sequences)}")
print(f"Number of label sequences: {len(Y_sequences)}")

Number of feature sequences: 5519
Number of label sequences: 5519


In [29]:
crf = CRF(
        algorithm='lbfgs',  # Optimization algorithm
        max_iterations=100,  # Maximum number of iterations
        all_possible_transitions=True  # Allow transitions between all labels
    )

In [30]:
# Fit the CRF model
crf.fit(X_sequences, Y_sequences)

# Predict on new data
structured_pred = crf.predict(X_sequences)

**Accuracy Comparison**

In [31]:
# Evaluate Individual Predictions
individual_mae = {}

for target in range(1, 10):
    
    target = f"target0{target}"
    mae = mean_absolute_error(Y_test[target], individual_preds_df[target])
    individual_mae[target] = mae

average_mae = sum(individual_mae.values()) / len(individual_mae)
print(f"Average MAE across individual predictions: {average_mae:.4f}")

Average MAE across individual predictions: 0.5218


In [32]:
# Convert CRF predictions to DataFrame
structured_preds_df = pd.DataFrame(structured_pred, columns=[f"target0{i}" for i in range(1, 10)])

# Convert predictions and ground truth back to numerical format
structured_preds_df = structured_preds_df.apply(pd.to_numeric)
Y_numeric = Y.apply(pd.to_numeric)

# Evaluate Structured Predictions
structured_mae = mean_absolute_error(Y_numeric, structured_preds_df)

print("Structured Prediction MAE:", structured_mae)

Structured Prediction MAE: 0.45954379819210395
