<a href="https://colab.research.google.com/github/hellojohnkim/mmai894/blob/main/Pump_it_up_XGB_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, atpe
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

##Pump It Up Faulty Water Pump Prediction Model

#Data Loading

In [3]:
#run this cell if you get denied running the data loading cell
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Specifying the datasets file paths
training_set_values_file_path = '/content/drive/MyDrive/MMAI_Group/894_team/DrivenData_Competition/data/training_set_values.csv'
training_set_labels_file_path = '/content/drive/MyDrive/MMAI_Group/894_team/DrivenData_Competition/data/training_set_label.csv'
test_set_file_path = '/content/drive/MyDrive/MMAI_Group/894_team/DrivenData_Competition/data/test_set.csv'

# Loading the datasets
features_df = pd.read_csv(training_set_values_file_path)
labels_df = pd.read_csv(training_set_labels_file_path)
test = pd.read_csv(test_set_file_path)

# Displaying the first few rows of the datasets
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

#Data Pre-Processing

In [3]:
from datetime import datetime
# Merge the data on id
data_df = features_df.merge(labels_df, on='id')

# Date feature transformation
data_df['date_recorded'] = pd.to_datetime(data_df['date_recorded'])
data_df['year_recorded'] = data_df['date_recorded'].dt.year
data_df['month_recorded'] = data_df['date_recorded'].dt.month
data_df['day_recorded'] = data_df['date_recorded'].dt.day
data_df['days_since_recorded'] = (data_df['date_recorded'] - data_df['date_recorded'].min()).dt.days
data_df.drop('date_recorded', axis=1, inplace=True)

# Encode 'construction_year' as a cyclical feature
max_year = data_df['construction_year'].max()
data_df['construction_year_sin'] = np.sin(2 * np.pi * data_df['construction_year'] / max_year)
data_df['construction_year_cos'] = np.cos(2 * np.pi * data_df['construction_year'] / max_year)

# Calculate 'age' as 'year_recorded' - 'construction_year'
data_df['age'] = data_df['year_recorded'] - data_df['construction_year']
data_df['age'].replace({0: np.nan}, inplace=True)  # Replace zero ages (indicating missing data) with NaN
data_df.drop(['construction_year'], axis=1, inplace=True)  # Drop original 'construction_year'

# Handling categorical variables
categorical_cols = data_df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('status_group')
categorical_cols.extend(['region_code', 'district_code'])  # Include region and district codes as categorical

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

X = data_df.drop('status_group', axis=1)
y = data_df['status_group']

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(eval_metric='mlogloss', n_jobs=-1))
])

X.drop('id', axis=1, inplace=True)  # Drop 'id' column

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

### Data Preprocessing Summary

#### 1. Data Merging
- **Variables Involved**: `features_df` and `labels_df`
- **Purpose**: To consolidate feature and label data into a single DataFrame.
- **Method Applied**: Merged the two DataFrames on the 'id' column.

#### 2. Date Feature Transformation
- **Variable Affected**: `date_recorded`
- **Purpose**: To extract more granular time-related features.
- **Method Applied**: Converted `date_recorded` to a datetime object, then created separate columns for year, month, and day. Also calculated the number of days since the earliest recorded date, then dropped the original `date_recorded` column.

#### 3. Cyclical Encoding of 'construction_year'
- **Variable Affected**: `construction_year`
- **Purpose**: To transform a linear numeric feature into a cyclical one, acknowledging its periodic nature.
- **Method Applied**: Calculated sine and cosine transformations of `construction_year` and dropped the original column.

#### 4. Age Calculation
- **Variable Affected**: `age`
- **Purpose**: To derive a meaningful feature representing the age of the infrastructure.
- **Method Applied**: Calculated 'age' as the difference between the recorded year and the construction year. Replaced zero values (indicating missing data) with NaN.

#### 5. Handling Categorical Variables
- **Variables Affected**: Various categorical columns
- **Purpose**: To transform categorical data into a format suitable for machine learning algorithms.
- **Method Applied**: Identified all categorical columns and extended the list with region and district codes. Applied OneHotEncoder to these categorical variables in a preprocessing pipeline.

#### 6. Feature and Label Separation
- **Variables Affected**: `X` (features) and `y` (labels)
- **Purpose**: To prepare the dataset for model training.
- **Method Applied**: Separated the dataset into features (`X`) and labels (`y`), dropping the 'id' column from features and the 'status_group' column from labels.

#### 7. Label Encoding
- **Variable Affected**: `status_group`
- **Purpose**: To convert label data into a numerical format.
- **Method Applied**: Applied Label Encoding to the 'status_group' column.

This summary encapsulates the key steps undertaken in the data preprocessing phase of your project. It provides a clear and concise overview of each step, along with its purpose and the specific methods used.

#XGB Baseline Model

#XGB Baseline + Hyperparameter Tuning using Hyperopt

In [4]:
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'max_depth': hp.choice('max_depth', range(3, 15)),
    'n_estimators': hp.choice('n_estimators', range(50, 300)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 1.0, 4.0),
}

def objective(params):
    pipeline.set_params(xgb__learning_rate=params['learning_rate'],
                        xgb__max_depth=params['max_depth'],
                        xgb__n_estimators=params['n_estimators'],
                        xgb__subsample=params['subsample'],
                        xgb__colsample_bytree=params['colsample_bytree'],
                        xgb__gamma=params['gamma'],
                        xgb__min_child_weight=params['min_child_weight'],
                        xgb__reg_alpha=params['reg_alpha'],
                        xgb__reg_lambda=params['reg_lambda'])
    score = cross_val_score(pipeline, X, y, cv=6, scoring='f1_weighted', n_jobs=-1).mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best_params = fmin(fn=objective, space=space, algo=atpe.suggest, max_evals=60, trials=trials)

print("Best parameters:", best_params)

100%|██████████| 60/60 [19:57:45<00:00, 1197.75s/trial, best loss: -0.7982059849267075]
Best parameters: {'colsample_bytree': 0.9993491087620797, 'gamma': 0.37885665466692675, 'learning_rate': 0.17813266334475886, 'max_depth': 11, 'min_child_weight': 0.9243224187937522, 'n_estimators': 219, 'reg_alpha': 0.06930257498733917, 'reg_lambda': 1.502077184383809, 'subsample': 0.6934044303500844}


100%|██████████| 60/60 [19:57:45<00:00, 1197.75s/trial, best loss: -0.7982059849267075]
Best parameters: {'colsample_bytree': 0.9993491087620797, 'gamma': 0.37885665466692675, 'learning_rate': 0.17813266334475886, 'max_depth': 11, 'min_child_weight': 0.9243224187937522, 'n_estimators': 219, 'reg_alpha': 0.06930257498733917, 'reg_lambda': 1.502077184383809, 'subsample': 0.6934044303500844}

In [5]:
#best params from Peter's training:
best_params = {'colsample_bytree': 0.9993491087620797, 'gamma': 0.37885665466692675, 'learning_rate': 0.17813266334475886, 'max_depth': 11, 'min_child_weight': 0.9243224187937522, 'n_estimators': 219, 'reg_alpha': 0.06930257498733917, 'reg_lambda': 1.502077184383809, 'subsample': 0.6934044303500844}

final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(eval_metric='mlogloss',
                          learning_rate=best_params['learning_rate'],
                          max_depth=best_params['max_depth'],
                          n_estimators=best_params['n_estimators'],
                          subsample=best_params['subsample'],
                          colsample_bytree=best_params['colsample_bytree'],
                          gamma=best_params['gamma'],
                          min_child_weight=best_params['min_child_weight'],
                          reg_alpha=best_params['reg_alpha'],
                          reg_lambda=best_params['reg_lambda']))
])

model = final_pipeline.fit(X, y)

In [6]:
test['date_recorded'] = pd.to_datetime(test['date_recorded'])
test['year_recorded'] = test['date_recorded'].dt.year
test['month_recorded'] = test['date_recorded'].dt.month
test['day_recorded'] = test['date_recorded'].dt.day
test['days_since_recorded'] = (test['date_recorded'] - test['date_recorded'].min()).dt.days
test.drop('date_recorded', axis=1, inplace=True)

def encode_cyclical_features(df, cols):
    for col in cols:
        max_val = df[col].max()
        df[col + '_sin'] = np.sin(2 * np.pi * df[col] / max_val)
        df[col + '_cos'] = np.cos(2 * np.pi * df[col] / max_val)
    return df

cyclical_cols = ['year_recorded', 'month_recorded', 'day_recorded']

test = encode_cyclical_features(test, cyclical_cols)
test.drop(['month_recorded', 'day_recorded'], axis=1, inplace=True)
categorical_cols = test.select_dtypes(include=['object', 'category']).columns.tolist()

predictions = model.predict(test)

In [7]:
predictions = label_encoder.inverse_transform(predictions)

In [11]:
final = pd.DataFrame({'Id': test.id, 'status_group': predictions})

In [12]:
# Saving to a CSV file
final.to_csv('/content/drive/MyDrive/MMAI_Group/894_team/DrivenData_Competition/notebooks/John/SubmissionFormat_1.csv', index=False)

In [13]:
final.rename(columns={'Id': 'id'}, inplace=True)
final.rename(columns={'status_group': 'status_group'}, inplace=True)