<a href="https://colab.research.google.com/github/hargurjeet/LJMU_Thesis/blob/main/Mixtral_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training ML model with Mixtral

In [2]:
! pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.5/76.5 kB 1.3 MB/s eta 0:00:00
Installing collected packages: openai
Successfully installed openai-0.28.0


## 1. Importing libraries and the enriched dataset

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import openai
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Never commit the key. Reuse the one already exported in the environment and
# only prompt (without echoing) when it is missing. Unconditionally assigning ""
# to OPENAI_API_KEY would discard a key that was already configured.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
# Load the post-cleaning Mixtral output CSV from the `/content` directory and
# preview its first rows.
enriched_csv_path = '/content/mixtral_output_post_cleaning.csv'
cust_data = pd.read_csv(enriched_csv_path)
cust_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target,GeneratedText
0,0,0,34,No,Middle Income,6,No,Yes,0,The customer is 34 years old and does not have...
1,1,1,34,Yes,Low Income,5,Yes,No,1,The customer's age is 34. This is a significan...
2,2,2,37,No,Middle Income,3,Yes,No,0,The customer's age is 37. This is a significan...
3,3,3,30,No,Middle Income,2,No,No,0,The customer is a 30-year-old individual who d...
4,4,4,30,No,Low Income,1,No,No,0,The customer's age is 30. This could indicate ...


## 2. Generating embeddings with text-embedding-ada-002

In [6]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    Args:
        text: The text to embed. Missing values (``None``/NaN) and blank
            strings are rejected before any request is sent.
        model: Name of the OpenAI embedding model to use. Defaults to
            ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data`` list.

    Raises:
        ValueError: If ``text`` is ``None``, NaN, or an empty/whitespace-only string.
    """
    # NaN is the only float that compares unequal to itself; pandas uses it for
    # missing cells, so a column with gaps would otherwise trigger a failing request.
    is_missing = text is None or (isinstance(text, float) and text != text)
    is_blank = isinstance(text, str) and not text.strip()
    if is_missing or is_blank:
        raise ValueError(f"get_embedding() requires non-empty text, got {text!r}")

    response = openai.Embedding.create(model=model, input=text)
    return response['data'][0]['embedding']

# Embed every generated customer description (one get_embedding call per row)
# and store the resulting vectors in a new 'Embedding' column.
generated_texts = cust_data['GeneratedText']
cust_data['Embedding'] = generated_texts.apply(get_embedding)

## 3. Training ML model

In [7]:
# --- Hold-out split ---------------------------------------------------------
# 80/20 train/test partition of the embedding vectors and the 'Target' labels,
# seeded so the partition is repeatable.
# NOTE(review): the split is not stratified on 'Target' — confirm this is intended.
embedding_vectors = cust_data['Embedding'].tolist()
target_labels = cust_data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    embedding_vectors,
    target_labels,
    random_state=42,
    test_size=0.2,
)

# --- Feature scaling --------------------------------------------------------
# The scaler is fitted on the training rows only and then applied to both
# partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Dimensionality reduction -----------------------------------------------
# Project the scaled embeddings onto 50 principal components fitted on the
# training partition.
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# --- Classifier -------------------------------------------------------------
# Random forest with 100 trees and a fixed seed, trained on the PCA features.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_pca, y_train)

# --- Evaluation on the held-out partition -----------------------------------
y_pred = clf.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# average='weighted' is used for precision and recall; 'macro' or 'micro' are
# the alternatives, depending on what the analysis needs.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.82
Precision: 0.81
Recall: 0.82


## 4. Results

In [8]:
# Build the classification report for the held-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [9]:
# Print the report built in the previous cell.
print(report)

              precision    recall  f1-score   support

           0       0.83      0.99      0.90       153
           1       0.75      0.16      0.26        38

    accuracy                           0.82       191
   macro avg       0.79      0.57      0.58       191
weighted avg       0.81      0.82      0.77       191

