In [None]:
import pandas as pd
import joblib
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score


In [None]:
!python -m pip install "pymongo[srv]"

Collecting pymongo[srv]
  Downloading pymongo-4.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.11.3


In [None]:
import os
from getpass import getpass

import pymongo as pym

# The connection string carries the database user and password, so it must not
# live in the notebook. Read it from the MONGODB_URI environment variable and
# fall back to a hidden interactive prompt when the variable is not set.
# NOTE(review): the password that was previously committed in this cell should
# be treated as leaked and rotated.
mongo_uri = os.environ.get("MONGODB_URI") or getpass("MongoDB connection URI: ")
mongoClient = pym.MongoClient(mongo_uri)
# Uncomment to verify connectivity before querying:
# mongoClient.admin.command('ping')
db = mongoClient["bankingdata"]
collection = db["bankdata"]

In [None]:
# Fetch every document of the collection (projection drops Mongo's `_id`
# field) and materialise the cursor into a DataFrame.
documents = list(collection.find({}, {'_id': 0}))
bank = pd.DataFrame(documents)
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,empvarrate,conspriceidx,consconfidx,euribor3m,nremployed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:

# --- Feature preparation for the `conspriceidx` regression -------------------
# Text columns are one-hot encoded with one fitted encoder per column (stored in
# a dict keyed by column name), numeric columns are standardised, and the
# target `conspriceidx` is appended unscaled. Encoders and scaler are persisted
# with joblib so the same transforms can be re-applied later.
# NOTE(review): encoders and scaler are fitted on the full `bank` frame, i.e.
# before any train/test split, so held-out rows contribute to the fitted
# statistics.

categorical_cols = bank.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)
# `y` and `contact` are excluded from the feature set.
categorical_cols.remove('y')
categorical_cols.remove('contact')

# Encode categorical values. Each encoded frame is collected in a list and
# everything is concatenated once afterwards, instead of re-copying a growing
# DataFrame on every loop iteration.
onehot_encoders = {}
encoded_frames = []
for col in categorical_cols:
    encoder = OneHotEncoder(sparse_output=False)
    encoded_feature = encoder.fit_transform(bank[[col]])
    onehot_encoders[col] = encoder
    feature_names = encoder.get_feature_names_out([col])
    encoded_frames.append(
        pd.DataFrame(encoded_feature, columns=feature_names, index=bank.index)
    )

    print(f"Encoded {col} into {len(feature_names)} columns")

# The leading empty frame keeps the result well-defined (same index, zero
# columns) even when there is nothing to encode.
processed_df = pd.concat([pd.DataFrame(index=bank.index), *encoded_frames], axis=1)

joblib.dump(onehot_encoders, 'regression_onehot_encoders.joblib')

# Standardize numerical features (the target is left out so it stays in its
# original units).
numerical_cols = bank.select_dtypes(include=['number']).columns.tolist()
numerical_cols.remove('conspriceidx')
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(bank[numerical_cols])

# Save the scaler model
joblib.dump(scaler, 'regression_standard_scaler.joblib')

scaled_df = pd.DataFrame(scaled_numerical, columns=numerical_cols, index=bank.index)

# Final layout: one-hot columns, then scaled numeric columns, then the target.
final_df = pd.concat([processed_df, scaled_df], axis=1)
final_df["conspriceidx"] = bank["conspriceidx"]


['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']
Encoded job into 12 columns
Encoded marital into 4 columns
Encoded education into 8 columns
Encoded default into 3 columns
Encoded housing into 3 columns
Encoded loan into 3 columns
Encoded month into 10 columns
Encoded day_of_week into 5 columns
Encoded poutcome into 3 columns


In [None]:
# Separate the regression target from the feature matrix.
y = final_df['conspriceidx']
X = final_df.drop(columns=['conspriceidx'])

print(y)

0        93.994
1        93.994
2        93.994
3        93.994
4        93.994
          ...  
41183    94.767
41184    94.767
41185    94.767
41186    94.767
41187    94.767
Name: conspriceidx, Length: 41188, dtype: float64


In [None]:

# Hold out 40% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# No further scaling is applied here. The numeric columns were already
# standardised during preprocessing, and that persisted scaler is the only
# transform the inference cell applies. Fitting a second, unsaved
# StandardScaler on X_train made the model learn its split thresholds in a
# feature space that inference never reproduces (train/serve skew).
# Training on the DataFrames (rather than bare arrays) also lets the model
# keep the feature names.

# Create and train XGBoost Regressor
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.01,
    max_depth=6,
    random_state=42
)
xgb_reg.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = xgb_reg.predict(X_test)

# Calculate regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 0.0060
Mean Absolute Error (MAE): 0.0684
R-squared (R²): 0.9819


In [None]:
# Persist the fitted regressor; joblib.dump returns the list of files written,
# which is what the cell displays.
joblib.dump(value=xgb_reg, filename='XGB_reggression_model.joblib')

['XGB_reggression_model.joblib']

In [None]:
# --- Single-row inference check ----------------------------------------------
# Reload the persisted encoders, scaler and model, rebuild the feature row for
# one record of the CSV file, and compare the prediction with the stored value.
# The model must have been trained on features prepared exactly this way
# (persisted one-hot encoders + persisted scaler, nothing else on top).

# Load models
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
onehot_encoders = joblib.load('regression_onehot_encoders.joblib')
standard_scaler = joblib.load('regression_standard_scaler.joblib')
XGB_reg = joblib.load('XGB_reggression_model.joblib')

# Load the bank dataset and select a single instance (double brackets keep it
# a one-row DataFrame rather than a Series).
# NOTE(review): this rebinds `bank`, replacing whatever frame it held before.
bank = pd.read_csv('bank-additional-full.csv', delimiter=';')
X_single = bank.iloc[[4]]

# Define categorical and numerical columns (same exclusions as in training)
cat_cols = bank.select_dtypes(include=['object']).columns.tolist()
cat_cols.remove('y')
cat_cols.remove('contact')

num_cols = X_single.select_dtypes(include=['number']).columns.tolist()
num_cols.remove('conspriceidx')

# One-hot encode categorical features. Each column goes through its own fitted
# encoder; the pieces are collected and concatenated once rather than growing
# a DataFrame inside the loop.
encoded_parts = []
for col in cat_cols:
    encoder = onehot_encoders[col]
    # The one-row, one-column slice is already the 2-D input the encoder needs.
    encoded_col = encoder.transform(X_single[[col]])
    feature_names = encoder.get_feature_names_out([col])
    encoded_parts.append(
        pd.DataFrame(encoded_col, columns=feature_names, index=X_single.index)
    )

if encoded_parts:
    encoded_features_df = pd.concat(encoded_parts, axis=1)
else:
    # Nothing to encode: keep a zero-column frame aligned to the selected row.
    encoded_features_df = pd.DataFrame(index=X_single.index)

# Standardize numerical features
numerical_values = X_single[num_cols]
standardized_numerical = standard_scaler.transform(numerical_values)
standardized_numerical_df = pd.DataFrame(standardized_numerical, columns=num_cols, index=X_single.index)

# Combine features for inference (one-hot columns first, then numeric columns)
inference_new_df = pd.concat([encoded_features_df, standardized_numerical_df], axis=1)

# Print the shape of the inference DataFrame
print(f"Final inference shape: {inference_new_df.shape}")

# Make prediction
prediction = XGB_reg.predict(inference_new_df)[0]

# Get the actual value from the original dataset
actual_value = X_single['conspriceidx'].values[0]

# Print predicted and actual values
print(f"Predicted conspriceidx: {prediction:.4f}")
print(f"Actual conspriceidx: {actual_value:.4f}")

Final inference shape: (1, 60)
Predicted conspriceidx: 93.9398
Actual conspriceidx: 93.9940
