# 5 Modeling - "Of Genomes And Genetics"

#### Objective: Apply statistical and machine learning models to identify and validate the correlation between genetic markers and health outcomes.

## Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import time

## Load The Data

In [2]:
# Load the pre-split feature matrices and label vectors from data/.
# NOTE(review): the label files are read with allow_pickle=True (presumably
# because they are stored as object arrays of strings - confirm). Unpickling
# can execute arbitrary code from a crafted file, so only load label files
# produced by this project's own pipeline.
X_train = np.load('data/X_train.npy')
X_test = np.load('data/X_test.npy')
y_train = np.load('data/y_train.npy', allow_pickle=True)
y_test = np.load('data/y_test.npy', allow_pickle=True)

# Fail fast if a split was saved out of sync with its labels or if the two
# feature matrices disagree on the number of columns.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"
assert X_train.shape[1] == X_test.shape[1], "train / test feature counts differ"

In [3]:
# Convert the class labels to integer codes. The encoder learns its classes
# from the training labels only and is then reused unchanged for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [4]:
# Quick sanity report: the shape of every loaded array, followed by the Python
# type of a single training label.
for caption, array in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, array.shape)
print("Data type of y_train:", type(y_train[0]))

Shape of X_train: (17666, 33)
Shape of X_test: (4417, 33)
Shape of y_train: (17666,)
Shape of y_test: (4417,)
Data type of y_train: <class 'str'>


## Train and Evaluate Baseline Models

In [5]:
# Baseline: a random forest with default hyperparameters (fixed seed), trained
# on the original, unencoded labels and scored on the held-out test split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

print("Random Forest Model Performance Metrics:")
rf_report = classification_report(y_test, rf_y_pred)
print(rf_report)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
# Rows are true classes, columns are predicted classes.
rf_confusion = confusion_matrix(y_test, rf_y_pred)
print("Confusion Matrix:\n", rf_confusion)

Random Forest Model Performance Metrics:
                                              precision    recall  f1-score   support

 Mitochondrial genetic inheritance disorders       0.56      0.88      0.69      2485
Multifactorial genetic inheritance disorders       0.38      0.01      0.01       433
            Single-gene inheritance diseases       0.36      0.13      0.19      1499

                                    accuracy                           0.54      4417
                                   macro avg       0.43      0.34      0.30      4417
                                weighted avg       0.48      0.54      0.45      4417

Accuracy: 0.5386008603124293
Confusion Matrix:
 [[2188    3  294]
 [ 386    3   44]
 [1309    2  188]]


In [6]:
# Gradient-boosted baseline trained on the integer-encoded labels.
# The target has three classes, so the estimator is declared with a multiclass
# objective; 'binary:logistic' describes a two-class problem and misstates
# what is actually being fitted here.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
xgb_model.fit(X_train, y_train_encoded)
xgb_y_pred = xgb_model.predict(X_test)

# Metrics are reported against the encoded test labels, so the class rows are
# the integer codes produced by `label_encoder`, not the original names.
print("XGBoost Model Performance Metrics:")
print(classification_report(y_test_encoded, xgb_y_pred))
print("Accuracy:", accuracy_score(y_test_encoded, xgb_y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_encoded, xgb_y_pred))

XGBoost Model Performance Metrics:
              precision    recall  f1-score   support

           0       0.56      0.96      0.71      2485
           1       0.00      0.00      0.00       433
           2       0.36      0.04      0.07      1499

    accuracy                           0.55      4417
   macro avg       0.31      0.33      0.26      4417
weighted avg       0.44      0.55      0.42      4417

Accuracy: 0.554675118858954
Confusion Matrix:
 [[2388    1   96]
 [ 421    0   12]
 [1437    0   62]]


In [7]:
# 5-fold cross-validation of the random forest on the training split only;
# report the mean and spread of the per-fold scores.
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
rf_cv_mean, rf_cv_std = rf_scores.mean(), rf_scores.std()
print(f"RandomForest Cross-validated Accuracy: {rf_cv_mean:.2f} ± {rf_cv_std:.2f}")

RandomForest Cross-validated Accuracy: 0.53 ± 0.00


In [9]:
# 5-fold cross-validation of the XGBoost model on the training split, using the
# integer-encoded labels; report the mean and spread of the per-fold scores.
xgb_scores = cross_val_score(xgb_model, X_train, y_train_encoded, cv=5)
xgb_cv_mean, xgb_cv_std = xgb_scores.mean(), xgb_scores.std()
print(f"XGBoost Cross-validated Accuracy: {xgb_cv_mean:.2f} ± {xgb_cv_std:.2f}")

XGBoost Cross-validated Accuracy: 0.55 ± 0.00


In [11]:
# Exhaustive search over 3 x 3 x 3 = 27 hyperparameter combinations, each
# scored by 5-fold cross-validated accuracy on the training split.
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1]
}
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
)

# Time the search with perf_counter(): it is a monotonic clock meant for
# measuring intervals, whereas time.time() can jump if the system clock is
# adjusted while the search is running.
start_time = time.perf_counter()
xgb_grid_search.fit(X_train, y_train_encoded)  # Use encoded labels
end_time = time.perf_counter()

print(f"XGBoost Grid search took {end_time - start_time:.2f} seconds.")
print("Best XGBoost parameters:", xgb_grid_search.best_params_)
print("Best XGBoost cross-validated accuracy:", xgb_grid_search.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
XGBoost Grid search took 16.13 seconds.
Best XGBoost parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best XGBoost cross-validated accuracy: 0.5584172578666886
