# XGBoost

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import (
    mean_squared_error, 
    mean_absolute_error, 
    accuracy_score, 
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

## XGBoost Regressor


In [None]:
!gdown 1xwJmYJxEia06sxUdJyGO7JFx4DNK1fbp -O "data/Problem3.csv"

### Loading Dataset

In [44]:
# Location of the regression CSV written by the download cell.
dataset_path = "data/Problem3.csv"
df = pd.read_csv(dataset_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.0
1,7,4,oct,tue,4.517431,35.4,669.1,2.04122,18.0,33,0.9,False,0.0
2,7,4,oct,sat,4.517431,43.7,686.9,2.04122,14.6,33,1.3,False,0.0
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.0
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.0


In [45]:
# List every column's dtype and non-null count to spot non-numeric columns and missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       510 non-null    int64  
 1   Y       510 non-null    int64  
 2   month   510 non-null    object 
 3   day     510 non-null    object 
 4   FFMC    510 non-null    float64
 5   DMC     510 non-null    float64
 6   DC      510 non-null    float64
 7   ISI     510 non-null    float64
 8   temp    510 non-null    float64
 9   RH      510 non-null    int64  
 10  wind    510 non-null    float64
 11  rain    510 non-null    bool   
 12  area    510 non-null    float64
dtypes: bool(1), float64(7), int64(3), object(2)
memory usage: 48.4+ KB


### Encoding Categorical Features

In [46]:
# Columns stored as strings or booleans are the ones that need encoding.
categorical_cols = list(df.select_dtypes(include=["object", "bool"]).columns)

# Report the cardinality of each of those columns.
for col_name, n_categories in df[categorical_cols].nunique().items():
    print(f"Number of categories in {col_name}: {n_categories}")

Number of categories in month: 12
Number of categories in day: 7
Number of categories in rain: 2


In [47]:
# Replace every categorical value with a numeric code (returned as floats).
# NOTE: with default settings the codes follow the sorted order of the
# category values (e.g. "mar" -> 7, "oct" -> 10), not calendar order.
orinal_encoder = OrdinalEncoder()
encoded_categorical_cols = orinal_encoder.fit_transform(df[categorical_cols])

# Reuse df's index so the column-wise concat below lines rows up one-to-one
# even when df does not carry a default RangeIndex (e.g. after filtering).
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_cols,
    columns=categorical_cols,
    index=df.index,
)

# Numeric columns are kept untouched; the encoded columns are appended on the right.
numberical_df = df.drop(columns=categorical_cols)
encoded_df = pd.concat([numberical_df, encoded_categorical_df], axis=1)
encoded_df.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,area,month,day,rain
0,7,5,4.468204,26.2,94.3,1.808289,8.2,51,6.7,0.0,7.0,0.0,0.0
1,7,4,4.517431,35.4,669.1,2.04122,18.0,33,0.9,0.0,10.0,5.0,0.0
2,7,4,4.517431,43.7,686.9,2.04122,14.6,33,1.3,0.0,10.0,2.0,0.0
3,8,6,4.529368,33.3,77.5,2.302585,8.3,97,4.0,0.0,7.0,0.0,1.0
4,8,6,4.503137,51.3,102.2,2.360854,11.4,99,1.8,0.0,7.0,3.0,0.0


### Splitting Data into Training and Test Set

In [48]:
# "area" is the regression target; every other column is a feature.
target_col = "area"

y = encoded_df[target_col]
X = encoded_df.drop(columns=[target_col])

X.shape, y.shape

((510, 12), (510,))

### Training and Test Set Evaluation

In [49]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

X_train.shape, X_test.shape

((408, 12), (102, 12))

In [52]:
# Boosted-tree regressor: 102 depth-3 trees with a small learning rate.
# `random_state` is the scikit-learn wrapper's documented seeding argument;
# it sets the same seed the previous `seed=7` keyword did.
xg_reg = xgb.XGBRegressor(
    random_state=7,
    learning_rate=0.01,
    n_estimators=102,
    max_depth=3,
)
xg_reg.fit(X_train, y_train)

In [53]:
# Predict on the held-out rows and score the predictions with two error metrics.
preds = xg_reg.predict(X_test)

mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)

report_lines = [
    "Evaluation, results on test set:",
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
]
print("\n".join(report_lines))

Evaluation, results on test set:
Mean Absolute Error: 1.1450939484366194
Mean Squared Error: 1.9840716073149989


## XGBoost Classifier

In [None]:
!gdown 1pVdH-2b_odeuEPdXbLQYDcHXxgqqBK4i -O "data/Problem4.csv"

### Loading Dataset

In [33]:
# Location of the classification CSV written by the download cell.
dataset_path = "data/Problem4.csv"
df = pd.read_csv(dataset_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [34]:
# List every column's dtype and non-null count to confirm the features are numeric and complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  Targe

In [35]:
# Count the samples per class label to see how balanced the target is.
df["Target"].value_counts()

Target
1    71
0    59
2    48
Name: count, dtype: int64

### Splitting Data into Training and Test Set

In [37]:
# Pick the label by name instead of by position so the feature/target split
# does not depend on "Target" being the last column of the CSV.
X = df.drop(columns=["Target"])
y = df["Target"]

X.shape, y.shape

((178, 13), (178,))

In [38]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

X_train.shape, X_test.shape

((142, 13), (36, 13))

### Training and Test Set Evaluation

In [39]:
# Boosted-tree classifier with library-default hyperparameters.
# `random_state` is the scikit-learn wrapper's documented seeding argument;
# it sets the same seed the previous `seed=7` keyword did.
xg_class = xgb.XGBClassifier(random_state=7)

xg_class.fit(X_train, y_train)

In [40]:
# Predict on both splits so a train/test accuracy gap is visible.
train_preds = xg_class.predict(X_train)
preds = xg_class.predict(X_test)

train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, preds)

print(f"Train ACC: {train_acc}", f"Test ACC: {test_acc}", sep="\n")

Train ACC: 1.0
Test ACC: 0.9444444444444444


In [43]:
# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.86      0.92         7
           1       0.89      1.00      0.94        17
           2       1.00      0.92      0.96        12

    accuracy                           0.94        36
   macro avg       0.96      0.92      0.94        36
weighted avg       0.95      0.94      0.94        36

