<a href="https://colab.research.google.com/github/i-ganza007/Multimodal-Data-Preprocessing/blob/main/Product_Recommendation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
import matplotlib.pyplot as plt

In [None]:
dataset = pd.read_csv('/content/merged_dataset.csv')
dataset.head()

Unnamed: 0,id_number,transaction_id,purchase_amount,purchase_date,product_category,customer_rating,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,151,1001,408,2024-01-01,Sports,2.3,A151,TikTok,61,1.3,Neutral
1,151,1001,408,2024-01-01,Sports,2.3,A151,Twitter,72,1.6,Neutral
2,151,1001,408,2024-01-01,Sports,2.3,A151,Twitter,82,3.6,Negative
3,192,1002,332,2024-01-02,Electronics,4.2,A192,Instagram,60,4.3,Positive
4,114,1003,442,2024-01-03,Electronics,2.1,A114,Facebook,87,4.8,Negative


In [12]:
# Preprocessing
reduced_dataset = dataset.drop(['id_number','transaction_id','customer_id_new'],axis=1) # Dropped these columns since useless

In [13]:
# Getting input and output features
X = reduced_dataset.drop('product_category',axis=1)
Y = reduced_dataset['product_category']

In [28]:
# Extracting date features
le = LabelEncoder()
X['purchase_date'] = pd.to_datetime(X['purchase_date'])
X['purchase_month'] = X['purchase_date'].dt.month
X['purchase_day'] = X['purchase_date'].dt.day
X['purchase_weekday'] = X['purchase_date'].dt.weekday

X = X.astype({
    'purchase_amount': float,
    'customer_rating': float,
    'engagement_score': float,
    'purchase_interest_score': float,
    'purchase_month': float,
    'purchase_day': float,
    'purchase_weekday': float
})
# X['product_category'] = X['product_category'].astype('category')
X['social_media_platform'] = X['social_media_platform'].astype('category')
X['review_sentiment'] = le.fit_transform(X['review_sentiment'])
X['social_media_platform'] = le.fit_transform(X['social_media_platform'])
X = X.drop('purchase_date',axis=1)
X.head()

Unnamed: 0,purchase_amount,customer_rating,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,purchase_month,purchase_day,purchase_weekday
0,0.992851,-0.645054,3,-0.965773,-1.549419,1,1.0,1.0,0.0
1,0.992851,-0.645054,4,-0.237948,-1.290236,1,1.0,1.0,0.0
2,0.992851,-0.645054,4,0.42371,0.43765,0,1.0,1.0,0.0
3,0.354429,0.930146,1,-1.031938,1.04241,2,1.0,2.0,1.0
4,1.278461,-0.810864,0,0.754539,1.474381,0,1.0,3.0,2.0


In [29]:
# Encoding Terms
Y = le.fit_transform(Y)

In [30]:
# Numerical Cols to scale
nums_cols = ['purchase_amount','customer_rating', 'engagement_score', 'purchase_interest_score']
scaler = StandardScaler()
X[nums_cols] = scaler.fit_transform(X[nums_cols])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [33]:

model = LGBMClassifier(learning_rate=0.05, num_leaves=31, max_depth=7, n_estimators=100,
                      lambda_l1=0.1, lambda_l2=0.1, objective='multiclass', num_class=5)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 170, number of used features: 9
[LightGBM] [Info] Start training from score -1.734601
[LightGBM] [Info] Start training from score -1.839962
[LightGBM] [Info] Start training from score -1.524881
[LightGBM] [Info] Start training from score -1.768503
[LightGBM] [Info] Start training from score -1.285651


In [34]:
y_pred = model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("F1-score", f1_score(y_test, y_pred,average='weighted'))
print("F1-score", f1_score(y_test, y_pred,average='macro'))
print("Precision", precision_score(y_test, y_pred, average='weighted'))
print("Recall", recall_score(y_test, y_pred, average='weighted'))

Accuracy 0.6976744186046512
F1-score 0.6990161001788908
F1-score 0.6991758241758242
Precision 0.7173311184939092
Recall 0.6976744186046512


In [37]:
# Using randomforest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [38]:
# Accuracy & Testing
y_pred = rf_model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("F1-score", f1_score(y_test, y_pred,average='weighted'))
print("F1-score", f1_score(y_test, y_pred,average='macro'))
print("Precision", precision_score(y_test, y_pred, average='weighted'))
print("Recall", recall_score(y_test, y_pred, average='weighted'))

Accuracy 0.7441860465116279
F1-score 0.7418990494229892
F1-score 0.7383559577677226
Precision 0.7437388193202147
Recall 0.7441860465116279


In [39]:
# Trying XGBoost
from xgboost import XGBClassifier
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

In [40]:
# Accuracy and testing for XGBoost
y_pred = xgb_model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("F1-score", f1_score(y_test, y_pred,average='weighted'))
print("F1-score", f1_score(y_test, y_pred,average='macro'))
print("Precision", precision_score(y_test, y_pred, average='weighted'))
print("Recall", recall_score(y_test, y_pred, average='weighted'))

Accuracy 0.7674418604651163
F1-score 0.7680732400294644
F1-score 0.7635520361990951
Precision 0.7732238691541017
Recall 0.7674418604651163


In [41]:
import joblib as jb
model_xgboost = jb.dump(xgb_model, 'xgb_model.joblib')
model_random = jb.dump(rf_model, 'rf_model.joblib')
model_lgbm = jb.dump(model, 'lgbm_model.joblib')
