<a href="https://colab.research.google.com/github/hellojohnkim/mmai894/blob/main/Pump_it_up_xgb_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd

# Directory holding the competition CSV files. It is defined once so that
# relocating the data only requires editing this single line.
# NOTE(review): the path sits under /content/drive, so this presumably expects
# Google Drive to be mounted in Colab before the cell runs — confirm.
DATA_DIR = '/content/drive/MyDrive/MMAI_Group/894_team/DrivenData_Competition/data'

# Dataset file paths (plain strings, built from DATA_DIR)
training_set_values_file_path = f'{DATA_DIR}/training_set_values.csv'
training_set_labels_file_path = f'{DATA_DIR}/training_set_label.csv'
test_set_file_path = f'{DATA_DIR}/test_set.csv'

# Loading the datasets: training features, training labels, and test features
training_values = pd.read_csv(training_set_values_file_path)
training_labels = pd.read_csv(training_set_labels_file_path)
test_values = pd.read_csv(test_set_file_path)

# Displaying the first few rows of each dataset (shown as the cell output)
training_values.head(), training_labels.head(), test_values.head()

(      id  amount_tsh date_recorded        funder  gps_height     installer  \
 0  69572      6000.0    2011-03-14         Roman        1390         Roman   
 1   8776         0.0    2013-03-06       Grumeti        1399       GRUMETI   
 2  34310        25.0    2013-02-25  Lottery Club         686  World vision   
 3  67743         0.0    2013-01-28        Unicef         263        UNICEF   
 4  19728         0.0    2011-07-13   Action In A           0       Artisan   
 
    longitude   latitude              wpt_name  num_private  ... payment_type  \
 0  34.938093  -9.856322                  none            0  ...     annually   
 1  34.698766  -2.147466              Zahanati            0  ...    never pay   
 2  37.460664  -3.821329           Kwa Mahundi            0  ...   per bucket   
 3  38.486161 -11.155298  Zahanati Ya Nanyumbu            0  ...    never pay   
 4  31.130847  -1.825359               Shuleni            0  ...    never pay   
 
   water_quality quality_group      

In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split  # needed by the split at the end of this cell
import numpy as np

# NOTE: this cell modifies `training_values` and `test_values` in place
# (columns are added, dropped and re-encoded). Run it once after the loading
# cell; to run it again, re-run the loading cell first.

# Labels and features are paired purely by row position further down, so the
# two frames must at least have the same number of rows.
assert len(training_values) == len(training_labels), \
    "training_values and training_labels have different numbers of rows"

# One-Hot Encoding of Labels
# The encoder's default output is sparse; `.toarray()` densifies it. This
# avoids the `sparse=` keyword, which scikit-learn renamed to `sparse_output`
# and later removed, so the line works on both old and new releases.
encoder = OneHotEncoder()
encoded_labels = encoder.fit_transform(training_labels[['status_group']]).toarray()

# Encoding DateTime Variables: replace `date_recorded` with year/month/day
# integer columns, identically for the training and the test frame.
for frame in (training_values, test_values):
    frame['date_recorded'] = pd.to_datetime(frame['date_recorded'])
    frame['year_recorded'] = frame['date_recorded'].dt.year
    frame['month_recorded'] = frame['date_recorded'].dt.month
    frame['day_recorded'] = frame['date_recorded'].dt.day
    frame.drop('date_recorded', axis=1, inplace=True)

# Columns to Drop
columns_to_drop = ['funder', 'installer', 'scheme_name']
training_values.drop(columns=columns_to_drop, inplace=True)
test_values.drop(columns=columns_to_drop, inplace=True)

# Simple Imputation for 'scheme_management'
# The imputer is fitted on the training data only and then applied to the test
# data. Its output has one column, so it is flattened to 1-D before being
# assigned back to the single DataFrame column.
imputer_scheme_management = SimpleImputer(strategy='most_frequent')
training_values['scheme_management'] = np.asarray(
    imputer_scheme_management.fit_transform(training_values[['scheme_management']])
).ravel()
test_values['scheme_management'] = np.asarray(
    imputer_scheme_management.transform(test_values[['scheme_management']])
).ravel()

# Natively numeric columns. These must be captured BEFORE label encoding,
# because afterwards every remaining column is numeric.
# NOTE(review): no definition of `numeric_columns` or `numeric_transformer` is
# visible in the notebook although both were used below; they are reconstructed
# here from the "MinMaxScaler to numeric features" comment — confirm against
# the intended definitions.
# NOTE(review): `id` is never dropped, so it still reaches the model as a
# feature — consider excluding it.
numeric_columns = training_values.select_dtypes(include='number').columns.tolist()

# Label Encoding for categorical variables
categorical_columns = training_values.select_dtypes(include=['object']).columns.tolist()

# One fitted encoder per categorical column, kept so the integer codes can be
# mapped back to the original categories later (via `inverse_transform`).
label_encoders = {}
for col in categorical_columns:
    # Fit on the categories of BOTH sets so values that only occur in the test
    # data do not raise at transform time. Missing values become the string
    # 'nan' through `astype(str)` and are encoded as their own category.
    combined_categories = pd.concat([training_values[col], test_values[col]], axis=0).astype(str).unique()
    le = LabelEncoder().fit(combined_categories)

    # Transforming both training and test sets
    training_values[col] = le.transform(training_values[col].astype(str))
    test_values[col] = le.transform(test_values[col].astype(str))
    label_encoders[col] = le

# MinMax scaling of the numeric features; all other columns pass through
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns)
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Fit the preprocessor on the training data only, then apply it to the test data
X_train_processed = preprocessor.fit_transform(training_values)
X_test_processed = preprocessor.transform(test_values)

# Splitting the processed data into training and validation sets.
# `np.argmax` collapses each one-hot label row to its integer class index.
X_train, X_val, y_train_labels, y_val_labels = train_test_split(X_train_processed, np.argmax(encoded_labels, axis=1), test_size=0.2, random_state=42)



(0.797053872053872,
 '              precision    recall  f1-score   support\n\n           0       0.78      0.91      0.84      6457\n           1       0.60      0.25      0.35       851\n           2       0.84      0.74      0.79      4572\n\n    accuracy                           0.80     11880\n   macro avg       0.74      0.63      0.66     11880\nweighted avg       0.79      0.80      0.79     11880\n')

In [17]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Baseline XGBoost model: library-default hyperparameters, with a fixed
# random_state so repeated runs give the same model.
xgb_baseline = XGBClassifier(random_state=42)

# Training the baseline on the training split
xgb_baseline.fit(X_train, y_train_labels)

# Making predictions on the validation set
y_val_pred = xgb_baseline.predict(X_val)

# Baseline performance on the validation split
accuracy_val = accuracy_score(y_val_labels, y_val_pred)
classification_rep_val = classification_report(y_val_labels, y_val_pred, output_dict=True)

# Convert the classification report (a nested dict because of
# output_dict=True) to a DataFrame with one row per class / average
classification_report_df = pd.DataFrame(classification_rep_val).transpose()

# Display the classification report and accuracy
print(classification_report_df)
print(accuracy_val)

              precision    recall  f1-score       support
0              0.781654  0.909246  0.840636   6457.000000
1              0.602817  0.251469  0.354892    851.000000
2              0.843049  0.740157  0.788260   4572.000000
accuracy       0.797054  0.797054  0.797054      0.797054
macro avg      0.742507  0.633624  0.661263  11880.000000
weighted avg   0.792471  0.797054  0.785684  11880.000000
0.797053872053872


In [None]:
# Tweaking hyperparameters of the XGBoost model
xgb_tweaked = XGBClassifier(
    n_estimators=1000,  # Increasing the number of trees
    learning_rate=0.1,  # Adjusting learning rate
    max_depth=8,  # Setting a maximum depth for trees
    subsample=0.8,  # Using 80% of data for fitting individual trees
    colsample_bytree=0.8,  # Using 80% of features for constructing each tree
    random_state=42
)

# Training the tweaked model on the same training split as the baseline
xgb_tweaked.fit(X_train, y_train_labels)

# Making predictions on the validation set with the tweaked model
y_val_pred_tweaked = xgb_tweaked.predict(X_val)

# Performance of the tweaked model on the validation split
accuracy_val_tweaked = accuracy_score(y_val_labels, y_val_pred_tweaked)
classification_rep_val_tweaked = classification_report(y_val_labels, y_val_pred_tweaked, output_dict=True)

# Convert the classification report to a DataFrame. It is stored under its own
# `_tweaked` name so the baseline's `classification_report_df` is not
# overwritten and the two reports can still be compared afterwards.
classification_report_df_tweaked = pd.DataFrame(classification_rep_val_tweaked).transpose()

# Display the classification report and accuracy
print(classification_report_df_tweaked)
print(accuracy_val_tweaked)
