<a href="https://colab.research.google.com/github/irakozej/CNN-activity/blob/main/multimodal_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Model 1: Facial Recognition Model (Random Forest)

#### Description:
This model classifies whether a face belongs to a known user or not based on the extracted image features (e.g., histogram, embeddings).


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [32]:
import os

# Set the folder path (adjust if your files are in a subfolder)
folder_path = '/content/drive/MyDrive/'




In [33]:
from google.colab import files
uploaded = files.upload()


Saving audio_features .csv to audio_features  (1).csv
Saving customer_social_profiles.csv to customer_social_profiles (1).csv
Saving customer_transactions.csv to customer_transactions (1).csv
Saving image_features (1).csv to image_features (1) (1).csv
Saving merged_engineered_data.csv to merged_engineered_data (1).csv


## 1. Facial Recognition Model (Random Forest)

In [34]:
%%writefile models/facial_model.py
# models/facial_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

def train_facial_model():
    df = pd.read_csv("image_features (1).csv")

    label_col = 'member_name'  # use actual column name from your dataset

    # Validate label column presence
    if label_col not in df.columns:
        raise KeyError(f"The '{label_col}' column is missing in the image_features.csv")

    # Drop rows with missing label values
    df = df.dropna(subset=[label_col])

    # Encode label column (target)
    label_encoder = LabelEncoder()
    df[label_col] = label_encoder.fit_transform(df[label_col])

    # Separate features and labels
    X = df.drop(columns=[label_col])
    y = df[label_col]

    # Encode any non-numeric feature columns
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = LabelEncoder().fit_transform(X[col])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Save model and label encoder
    os.makedirs("saved_models", exist_ok=True)
    joblib.dump(model, "saved_models/facial_model.pkl")
    joblib.dump(label_encoder, "saved_models/facial_label_encoder.pkl")

    print("✅ Facial recognition model trained and saved successfully!")

if __name__ == "__main__":
    train_facial_model()


Overwriting models/facial_model.py


In [35]:
!python3 models/facial_model.py

✅ Facial recognition model trained and saved successfully!


## 2. Audio Recognition Model (Random Forest)

In [36]:
%%writefile models/voice_model.py
# Voiceprint Verification Model - Random Forest

import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

def train_voice_model():
    df = pd.read_csv("/content/drive/MyDrive/audio_features.csv")

    # ✅ Extract label from filename (e.g. 'john_01.wav' -> 'john')
    if 'filename' not in df.columns:
        raise KeyError("The 'filename' column is missing in audio_features.csv")

    df['label'] = df['filename'].apply(lambda x: os.path.basename(x).split('_')[0])

    # ✅ Encode label
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['label'])

    # ✅ Features (drop filename and label)
    X = df.drop(columns=['filename', 'label'])

    # ✅ Handle non-numeric types in features
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = LabelEncoder().fit_transform(X[col])

    y = df['label']

    # ✅ Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ✅ Train
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # ✅ Report
    y_pred = model.predict(X_test)
    print("🎤 Voice Recognition Model Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # ✅ Save
    os.makedirs("saved_models", exist_ok=True)
    joblib.dump(model, "saved_models/voice_model.pkl")
    joblib.dump(le, "saved_models/voice_label_encoder.pkl")

    print("✅ Voice model trained and saved successfully.")

if __name__ == "__main__":
    train_voice_model()


Overwriting models/voice_model.py


In [37]:
!python3 models/voice_model.py

🎤 Voice Recognition Model Report:
              precision    recall  f1-score   support

        omar       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

✅ Voice model trained and saved successfully.


In [14]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/audio_features.csv")
print(df.columns.tolist())


['mfccs', 'rolloff', 'energy', 'filename']


## 3. Product Recognition Model (Random Forest)

In [38]:
%%writefile models/product_model.py

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import os

def train_product_model():
    # Load dataset
    df = pd.read_csv("merged_engineered_data.csv")  # Adjust path if needed

    label_col = "product_category"
    if label_col not in df.columns:
        raise KeyError(f"'{label_col}' column not found in dataset")

    # Drop rows with missing label
    df = df.dropna(subset=[label_col])

    # Encode all object (string) columns
    label_encoders = {}
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    # Split into features and label
    X = df.drop(label_col, axis=1)
    y = df[label_col]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train using Random Forest
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    print("✅ Product Recommendation Model Report:")
    print(classification_report(y_test, y_pred))

    # Save model and encoders
    os.makedirs("saved_models", exist_ok=True)
    joblib.dump(model, "saved_models/product_model.pkl")
    joblib.dump(label_encoders, "saved_models/product_label_encoders.pkl")

    print("✅ Model and encoders saved successfully.")

if __name__ == "__main__":
    train_product_model()


Overwriting models/product_model.py


In [39]:
!python3 models/product_model.py

✅ Product Recommendation Model Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        10
           1       0.50      0.40      0.44         5
           2       0.50      0.60      0.55        10
           3       0.70      1.00      0.82         7
           4       0.71      0.45      0.56        11

    accuracy                           0.65        43
   macro avg       0.64      0.65      0.63        43
weighted avg       0.66      0.65      0.64        43

✅ Model and encoders saved successfully.


In [47]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/merged_engineered_data.csv")
print(df.columns.tolist())


['customer_id_new', 'social_media_platform', 'engagement_score', 'purchase_interest_score', 'review_sentiment', 'customer_id_new_numeric', 'customer_id_legacy', 'transaction_id', 'purchase_amount', 'purchase_date', 'product_category', 'customer_rating', 'purchase_month', 'purchase_day_of_week', 'total_purchase_amount', 'number_of_transactions', 'average_customer_rating']


In [82]:
import pandas as pd

df = pd.read_csv("image_features.csv")
print(df.columns.tolist())


['member_name', 'expression', 'augmentation_type', 'image_path', 'height', 'width', 'aspect_ratio', 'total_pixels', 'blue_mean', 'blue_std', 'blue_min', 'blue_max', 'blue_median', 'green_mean', 'green_std', 'green_min', 'green_max', 'green_median', 'red_mean', 'red_std', 'red_min', 'red_max', 'red_median', 'hue_mean', 'hue_std', 'saturation_mean', 'saturation_std', 'value_mean', 'value_std', 'gray_mean', 'gray_std', 'gray_min', 'gray_max', 'gray_median', 'blue_hist_0', 'blue_hist_1', 'blue_hist_2', 'blue_hist_3', 'blue_hist_4', 'blue_hist_5', 'blue_hist_6', 'blue_hist_7', 'blue_hist_8', 'blue_hist_9', 'blue_hist_10', 'blue_hist_11', 'blue_hist_12', 'blue_hist_13', 'blue_hist_14', 'blue_hist_15', 'green_hist_0', 'green_hist_1', 'green_hist_2', 'green_hist_3', 'green_hist_4', 'green_hist_5', 'green_hist_6', 'green_hist_7', 'green_hist_8', 'green_hist_9', 'green_hist_10', 'green_hist_11', 'green_hist_12', 'green_hist_13', 'green_hist_14', 'green_hist_15', 'red_hist_0', 'red_hist_1', 'red_