In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import pickle
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
def preprocess(df):
    """Split a frame into a feature matrix and an integer-encoded label.

    The 'Label' column is encoded with a freshly fitted ``LabelEncoder``.
    The features are every column of ``df`` except 'Label' and 'Weight'.

    The input frame is NOT modified: the encoded labels are returned as a new
    Series instead of being written back into ``df['Label']``. This keeps the
    function idempotent when a notebook cell is re-run on the same frame.

    Args:
        df (pandas.core.frame.DataFrame): Frame containing at least the
            'Label' and 'Weight' columns.
    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without 'Label' and 'Weight',
        and ``y`` is a Series named 'Label' holding the encoded labels,
        indexed like ``df``.
    """
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(df['Label'])
    X = df.drop(['Label', 'Weight'], axis=1)
    # Build a new Series rather than assigning into df, so the caller's frame
    # keeps its original label values.
    y = pd.Series(encoded_labels, index=df.index, name='Label')
    return X, y

def fit_score(X, y, *, test_size=0.2, n_splits=5, random_state=42, **kwargs):
    """Fit an XGBoost classifier, cross-validate it and print/collect metrics.

    The data is split into a training and a test part. A classifier is fitted
    on the training part, precision is cross-validated on that same training
    part with a shuffled K-fold, and the fitted classifier is finally
    evaluated on the held-out test part. All results are printed and also
    returned.

    Args:
        X (pandas.core.frame.DataFrame): Independent variables
        y (pandas.core.series.Series): Label to predict (``preprocess`` in
            this notebook returns a Series)
        test_size (float): Fraction of the data held out for the final
            evaluation. Defaults to 0.2.
        n_splits (int): Number of cross-validation folds. Defaults to 5.
        random_state (int): Seed shared by the train/test split, the
            classifier and the K-fold shuffling. Defaults to 42.
        **kwargs: Extra keyword arguments forwarded to ``XGBClassifier``.
    Returns:
        tuple: ``(xgb_model, metrics)``. ``xgb_model`` is the classifier
        fitted on the training part. ``metrics`` is a dict with the keys
        'Average Precision' (mean of the per-fold precision scores, not the
        average-precision metric), 'Test data Precision', 'Confusion Matrix'
        and 'Classification Report'.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    xgb_model = XGBClassifier(objective='binary:logistic', random_state=random_state, **kwargs)
    xgb_model.fit(X_train, y_train)

    # Cross-validate on the training part only. Per the scikit-learn docs,
    # cross_val_score fits clones of the estimator, so xgb_model keeps the fit
    # from the line above.
    precision_scorer = make_scorer(precision_score)
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    precision_scores = cross_val_score(
        xgb_model, X_train, y_train, cv=k_fold, scoring=precision_scorer
    )
    for i, fold_precision in enumerate(precision_scores):
        print(f'Fold {i+1}: Precision = {fold_precision}')
    # Print the average precision across all folds
    mean_cv_precision = precision_scores.mean()
    print(f'\nAverage Precision: {mean_cv_precision}')

    # Make predictions on the test data
    y_pred = xgb_model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    print(f'\nTest data Precision: {precision}')

    # Print confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print classification report
    class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(class_report)

    metrics = {
        'Average Precision': mean_cv_precision,
        'Test data Precision': precision,
        'Confusion Matrix': conf_matrix,
        'Classification Report': class_report,
    }
    return xgb_model, metrics

def save(model, path):
    """Pickle ``model`` to ``path``, creating missing parent directories.

    Args:
        model: Any picklable object (e.g. a fitted classifier).
        path (str or os.PathLike): Destination file. It is opened in 'wb'
            mode, so an existing file is overwritten.
    """
    import os  # local import: `os` is not part of this notebook's first import cell

    # Only path-like values have a parent directory to create; anything else
    # (e.g. an already-open file descriptor) is passed to open() unchanged.
    if isinstance(path, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as model_file:
        pickle.dump(model, model_file)

In [3]:
# XGBoost settings forwarded to the classifier through fit_score.
params = dict(eta=0.1, max_depth=6, nthread=4)

# Load the training data, derive features/labels, then fit and score a model.
df = pd.read_csv('data/training.zip', low_memory=False)
X, y = preprocess(df)
model, metrics = fit_score(X, y, **params)

In [4]:
# Show the metrics dict returned by fit_score in the previous cell.
print(metrics)

In [5]:
import wandb
from wandb.xgboost import WandbCallback
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from modelling import preprocess, fit_score, save
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the environment.
# NOTE(review): the original comment said this call is for local runs only and
# is left out for GitHub workflows, yet the call is active — confirm intent.
load_dotenv()
WANDB_API_KEY = os.getenv("WANDB_API_KEY")

# Log in to wandb
wandb.login(key=WANDB_API_KEY)

# Read the Parquet file into a Pandas DataFrame using fastparquet
parquet_file_path = 'data/preprocessed_data.parquet'
df = pd.read_parquet(parquet_file_path, engine='fastparquet')

# Set up parameters for xgboost
params = {
    "eta" : 0.1,
    "max_depth": 6,
    "nthread" : 4,
}

# Train and save the model locally
X, y = preprocess(df)
model, metrics = fit_score(X, y, **params)
# Make sure the output directory exists before pickling (e.g. on a fresh
# checkout), so the training work is not lost to a missing folder.
os.makedirs('models', exist_ok=True)
save(model, 'models/Tuned_model.pkl')

In [6]:
# Open a W&B run for this training job, recording the XGBoost params as its config.
run = wandb.init(project="Higgs-Boson", config=params, job_type='train_model')

# Declare the preprocessed dataset artifact as an input of the run.
# Kept as the cell's last expression so the returned artifact is displayed.
run.use_artifact('harsh-ajay-jadhav/Higgs-Boson/df_preprocessed:latest', type='dataset')

<Artifact QXJ0aWZhY3Q6NzA5MTgyNzk1>

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888904896, max=1.0…

In [7]:
# Create an empty W&B artifact of type 'model' named 'Tuned_model'.
# NOTE(review): no later cell visible here reads this variable (log_model
# builds its own artifact) — confirm whether this cell is still needed.
model_artifact = wandb.Artifact(name='Tuned_model', type='model')

In [8]:
def log_model(model_name, local_model_path, wandb_data_path, metrics=None, config=None):
    """Upload a saved model file to Weights & Biases as a model artifact.

    Starts a run in the "Higgs-Boson" project with job type 'train_model',
    declares ``wandb_data_path`` as an input dataset artifact, attaches the
    file at ``local_model_path`` to a new artifact of type 'model', logs that
    artifact and, when given, logs ``metrics`` on the same run.

    NOTE(review): a later cell in this notebook defines ``log_model`` again
    (with a default for ``wandb_data_path``) and shadows this definition —
    consider keeping only one of them.

    Args:
        model_name (str): Name of the model artifact.
        local_model_path (str): Path of the local model file to attach.
        wandb_data_path (str): Dataset artifact reference passed to
            ``run.use_artifact``.
        metrics (dict, optional): Passed to ``run.log`` when not None.
        config (dict, optional): Run configuration. When None, the
            notebook-level ``params`` dict is used, as before.
    """
    run_config = params if config is None else config
    # Using the run as a context manager finishes it even when one of the
    # artifact calls raises, so a failed upload does not leave a run open in
    # the kernel.
    with wandb.init(
        project="Higgs-Boson",
        config=run_config,
        job_type='train_model',
    ) as run:
        run.use_artifact(wandb_data_path, type='dataset')
        model_artifact = wandb.Artifact(name=model_name, type='model')
        model_artifact.add_file(local_path=local_model_path)
        run.log_artifact(model_artifact)
        if metrics is not None:
            run.log(metrics)

In [9]:
# Same location the model was pickled to by the earlier save(...) call.
local_model_path = 'models/Tuned_model.pkl'

In [10]:
def log_model(model_name, local_model_path, wandb_data_path = 'harsh-ajay-jadhav/Higgs-Boson/df_preprocessed:v0', metrics=None, config=None):
    """Upload a saved model file to Weights & Biases as a model artifact.

    Starts a run in the "Higgs-Boson" project with job type 'train_model',
    declares ``wandb_data_path`` as an input dataset artifact, attaches the
    file at ``local_model_path`` to a new artifact of type 'model', logs that
    artifact and, when given, logs ``metrics`` on the same run.

    NOTE(review): this redefines the ``log_model`` from an earlier cell of
    this notebook; the only signature difference in the original was the
    default for ``wandb_data_path`` — consider keeping only one definition.

    Args:
        model_name (str): Name of the model artifact.
        local_model_path (str): Path of the local model file to attach.
        wandb_data_path (str): Dataset artifact reference passed to
            ``run.use_artifact``. Defaults to the ``df_preprocessed:v0``
            artifact.
        metrics (dict, optional): Passed to ``run.log`` when not None.
        config (dict, optional): Run configuration. When None, the
            notebook-level ``params`` dict is used, as before.
    """
    run_config = params if config is None else config
    # Using the run as a context manager finishes it even when one of the
    # artifact calls raises, so a failed upload does not leave a run open in
    # the kernel.
    with wandb.init(
        project="Higgs-Boson",
        config=run_config,
        job_type='train_model',
    ) as run:
        run.use_artifact(wandb_data_path, type='dataset')
        model_artifact = wandb.Artifact(name=model_name, type='model')
        model_artifact.add_file(local_path=local_model_path)
        run.log_artifact(model_artifact)
        if metrics is not None:
            run.log(metrics)

# Upload the saved model using the default dataset artifact reference; no
# metrics are passed, so only the artifact is logged.
log_model('tuned_model', local_model_path)