In [1]:
##Import required libraries
import pandas as pd 
import numpy as np
import tensorflow as tf
import re
import openai
import nltk
import optuna
import math
import optuna
import tensorflow as tf



# NLTK packages for text preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
##Scikit-learn packages for preprocessing, data cleaning, and other data preparation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, accuracy_score , f1_score, log_loss
from sklearn.utils import class_weight

#Import XGBoost and Random Forest Classifer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from collections import Counter

# Optuna packages for Hyperparameter optimization
from optuna.pruners import MedianPruner
from optuna.pruners import HyperbandPruner
from optuna.integration import TFKerasPruningCallback


#Import tensorflow keras packages for deep learning models
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU,Activation,BatchNormalization,GlobalMaxPooling1D,Conv1D,Reshape
from keras.regularizers import l1_l2
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
from keras.optimizers.legacy import Adam
from keras.layers import BatchNormalization
from keras.models import load_model



# Randomiztion control for tensorflow packages 
tf.random.set_seed(0)
np.random.seed(0)




In [2]:
#Create mapping for new categories
#These categories where manually made but with help of kmeans clustering 
category_mapping = { 

    'display broken': 'display issue',
    'broken display': 'display issue',
    'display problem': 'display issue',
    'fuzzy display': 'display issue',
    'flashing display': 'display issue',
    'display flipped': 'display issue',
    'camera damaged': 'hardware issue',
    'damage camera': 'hardware issue',
    'cca changes' : 'testing issue',
    'cca entry': 'testing issue',
    'failing all batteries': 'battery issue',
    'batteries drain quickly': 'battery issue',
    'internal battery error': 'battery issue',
    'leaking batteires':'battery issue',
    'low battery message': 'battery issue',
    'no voltage': 'power issue',
    'unstable voltage' : 'power issue',
    'no power on': 'power issue',
    'charging problem' : 'power issue',
    'printing issues': 'hardware issue',
    'printer roller': 'hardware issue',
    'out of paper': 'hardware issue',
    'cable problems': 'cable issue',
    'cables - melting': 'cable issue',
    'only system tests': 'testing issue',
    'no tes results': 'testing issue', 
    'will not test': 'testing issue',
    'wrong sw': 'software issue',
    'corrupted sw': 'software issue',
    'over heating': 'fan issue',
    'broken fan': 'fan issue',
    'partial function': 'testing issue',
    'en2 message': 'software issue',
    'corrupted': 'software issue',
    'date problems': 'software issue',
    'usb port': 'hardware issue',
    'mdx logo': 'freezes',
    'lost pin': 'software issue',
    'broken clamps': 'hardware issue',
    'location id': 'bmis',
    'button failed': 'hardware issue',
    'damaged': 'hardware issue',
    'auto testing': 'testing issue',
    'cca issue': 'testing issue',
    'reboots':'software issue',
}



# Data Cleaning

In [3]:
#Load the origina CPX excel data
original_df = pd.read_excel('CPX RMA analysis R2.xlsx')

In [4]:
#Copy the columns of interest into a new dataframe and rename columns
#This makes sure we have do not mutate the original data, rather we will be working with a copy

clean_df = original_df[['Customer Reason', 'Complaint']].copy()
clean_df.columns = ['Description','Category']
clean_df['Description'] = clean_df['Description'] .astype(str) #Set datatype to string


In [5]:
#Check for rows with empty values end drop
clean_df.isna().any()
clean_df = clean_df.dropna()


In [6]:
#Apply lowercasing to category column, then add New_Category column with the new mapped categories
clean_df['Category'] = clean_df['Category'].str.lower()
clean_df['New_Category'] = clean_df['Category'].replace(category_mapping)

In [7]:
#Get index of rows labled as 'multiple issues'
multiple_issues_idx = clean_df.query("New_Category == 'multiple issues'").index
#Get only the rows whose category is not 'multiple issues' 
reduced_df = clean_df.query("New_Category != 'multiple issues'").copy()

In [8]:
#Get list of new categories
new_categories = reduced_df.New_Category.value_counts().index.values

# Text preprocessing

Preprocessing for NLP is the process of cleaning and formatting raw text data before it's input to a machine learning model, which typically involves steps like tokenization, lowercasing, removing stop words, punctuation, and numbers, as well as stemming or lemmatization. This step is crucial as it transforms text into a form that can be represented as numerical features and understood by machine learning algorithms.Howerver, not all preprocessing steps are required for every natural language processing (NLP) task. The specific preprocessing steps one must choose to use will depend on the nature of the task, the specific requirements of your machine learning model, and the characteristics of the data. For that reason different preprocessing steps will be tried, tested, and evluated to see how different preprocessing steps affect the accuracy of the model

In [9]:
'''Tokenization is the process of breaking down text into individual words,
phrases, symbols, or other meaningful elements, which are known as tokens.
In the context of natural language processing (NLP), tokenization is typically
the first step in preparing text data for analysis or machine learning.'''
tokens = nltk.word_tokenize(' '.join(clean_df['Description']))

# Count the occurrences of each token
counter = Counter(tokens)

# Find the most common tokens
most_common_tokens = counter.most_common(100)  # Replace 20 with the number of tokens you want to find

print(most_common_tokens)

[(':', 3676), ('.', 2263), ('-', 1433), ('THE', 1415), ('#', 1371), ('TO', 1258), ('@', 1013), (',', 968), ('the', 764), ('AND', 638), ('UPDATE', 551), ('to', 534), ('SN', 528), ('NOT', 517), ('A', 461), ('IS', 427), ('WILL', 424), ('TOYOTA', 416), ('DLR', 414), ('PO', 357), ('and', 351), ('OF', 347), ('IT', 341), ('TESTER', 303), ('FOR', 303), ('ON', 299), ('*', 267), ('BAR', 265), ("N'T", 259), ('HE', 255), ('$', 251), ('is', 248), ('ISSUE', 245), ('UNIT', 242), ('I', 241), ('IN', 236), ('Toyota', 236), ('BATTERY', 226), ('TEST', 218), ('T', 200), ('CONNECT', 200), ('Contact', 198), ('unit', 195), ('S/N', 193), ('for', 190), ('REPAIR', 187), ('WIFI', 182), ('HAS', 181), ('WARRANTY', 179), ('KIT', 174), ('NISSAN', 174), ('WITH', 172), ('CUSTOMER', 168), ('EMAIL', 168), ('a', 162), ('BACK', 161), ('WAS', 161), ('it', 159), (')', 159), ('TOOL', 158), ('WHEN', 157), ('CPX-900P', 157), ('AN', 155), ('will', 153), ('(', 153), ('not', 151), ('BATTERIES', 149), ('RMA', 146), ('THAT', 145), (

Examining the most frequently occurring tokens can assist us in identifying those that are irrelevant or constitute noise. If a token appears very frequently across the dataset, it may not be a good indicator of a specific class category. The underlying concept is that words with high variance across documents are often better predictors, as they can distinguish between different classes, while words with low variance, which appear uniformly across documents, provide less discriminative power.

----------------------------

The following block performs several preprocessing steps on a given text. The function first converts the text to lowercase, then removes specific patterns and email addresses. If the openai_flag is not set, the function further cleans the text by removing punctuation, stop words, and numbers, and applies stemming to reduce words to their root form. 

In [10]:
# Function for preprocessing text in a dataframe
# The first 8 most commmon tokens are noise / irrelevant so we will remove them. 

ps = PorterStemmer()
all_steps = False
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
      ps = PorterStemmer()
      # Convert to lowercase
      text = text.lower()

      # Remove specific patterns
      text = re.sub(r'[a-zA-Z0-9]*_x001e_([a-zA-Z0-9]*_)*', '', text)

      text= text.replace("\n", " ")
    
      # Remove email addresses
      text = re.sub(r'\S+@\S+', '', text)
            
      if (all_steps):
          # Remove punctuation
          text = re.sub(r'[.:,@*$/#-]', ' ', text)
          # Remove stop words
          text = ' '.join([word for word in text.split() if word not in stop_words])
          # Apply stemming
          text = ' '.join(ps.stem(word) for word in text.split())
          #Remove numbers
          text = re.sub(r'\d+', '', text)
            
          remove_words = ['toyota', 'sn', 'po','nissan','ph','p','n','email']
          text = ' '.join([word for word in text.split() if word not in remove_words])

      return text


In [11]:
#Apply processing and store restult in a new column
all_steps = True
reduced_df['Processed_Description_All'] = reduced_df['Description'].apply(preprocess_text)
all_steps = False
reduced_df['Processed_Description_Some'] = reduced_df['Description'].apply(preprocess_text)

In [12]:
#look a preprocessing results
print(reduced_df['Processed_Description_All'].iloc[1])

print(reduced_df['Processed_Description_Some'].iloc[1])

burnsvil dlr koeun technic advisor burnsvil busi phone cpx tester current first one read batteri give warn clamp wrong termin switch clamp state clamp connect serial number one second broken posit clamp need new test cabl replac need sent also? serial number one let know next step


## Create Embeddings

Main methods of text representation: Transformer-based models like GPT, Word2Vec, and TF-IDF.

1. **Transformer-based models (like GPT)**: These models are based on the Transformer architecture, which was introduced in the paper "Attention is All You Need" by Vaswani et al. The key idea behind this architecture is the self-attention mechanism, which allows the model to weigh the importance of words in a sentence relative to each other. GPT (Generative Pretrained Transformer) is a specific implementation of this architecture that is trained to predict the next word in a sentence, given all the previous words. This allows it to learn a rich understanding of language, including grammar, semantics, and even some world knowledge. GPT and similar models are particularly good for tasks that require understanding of context, such as translation, summarization, and question answering.

2. **Word2Vec**: This is a method for learning vector representations of words, introduced by Mikolov et al. at Google. It works by training a neural network to predict a word given its context (Continuous Bag of Words, CBOW) or to predict the context given a word (Skip-Gram). The learned word vectors capture many linguistic regularities and patterns; for example, vector('King') - vector('Man') + vector('Woman') roughly equals vector('Queen'). Word2Vec is useful for tasks that require understanding of word similarity and analogy, but it doesn't capture the context of a sentence as well as transformer models.

3. **TF-IDF (Term Frequency-Inverse Document Frequency)**: This is a statistical method for representing text. It calculates the importance of a word in a document relative to a corpus, based on how often the word appears in the document (Term Frequency) and how rare the word is in the corpus (Inverse Document Frequency). TF-IDF is simple and efficient, and it's good for tasks like information retrieval and document classification where the exact wording isn't as important as the overall topic or theme. However, it doesn't capture the semantics of words or the structure of sentences.

As for which one is better, it depends on the specific task:

- If you need to understand the context of a sentence or generate human-like text, a transformer-based model like GPT would be the best choice.
- If you're working with a task that requires understanding of word similarity and analogy, and less about the context or order of words, Word2Vec would be a good choice.
- If you're trying to classify documents or retrieve information based on keywords, and you don't need to understand the semantics or syntax of the text, TF-IDF would be sufficient. 

These are general guidelines and the best choice can depend on many factors, including the size and nature of your dataset, the computational resources available, and the specific requirements of your task.

In [13]:
## Setup embedders

openai.api_key = "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"

#This function sends a request to create an embedding for text, and retreives the resposne
def create_embedding(text):
    response = openai.Embedding.create(
    model= "text-embedding-ada-002",
    input = text
    )

    return response['data'][0]['embedding']

vectorizer = TfidfVectorizer()

In [14]:
# Apply TF-IDF Vectorizer
X_tfidf_all = vectorizer.fit_transform(reduced_df['Processed_Description_All'])
X_tfidf_some = vectorizer.fit_transform(reduced_df['Processed_Description_Some'])

In [15]:
## Create embeddings and export to csv if embeddings have not been created and exported already

run_openai_embedder = False

if (run_openai_embedder): 
    X_openai_all = reduced_df['OpenAI_Embeddings_All'] = reduced_df['Processed_Description_All'] .apply(create_embedding)
    X_openai_some = reduced_df['OpenAI_Embeddings_Some'] = reduced_df['Processed_Description_Some'] .apply(create_embedding)

export_to_csv = False 

if (export_to_csv):
    reduced_df.to_csv('Processed_CPX_Data.csv',index = False)


In [16]:
#Assuming open ai embeddings have been created and stored in local folder, import csv into data frame
openai_embeddings = pd.read_csv('Processed_CPX_Data.csv')[['OpenAI_Embeddings_All','OpenAI_Embeddings_Some']]

In [17]:
''' Embeddings are stored as string represenations of lists. This block evaluates 
each row in OpenAI_Embeddings_All and OpenAI_Embeddings_Some as list of floats, then creates a 
2D matrix ''' 

X_openai_all = np.stack(openai_embeddings.OpenAI_Embeddings_All.apply(eval))
X_openai_some = np.stack(openai_embeddings.OpenAI_Embeddings_Some.apply(eval))

In [18]:
X_openai_all = normalize(X_openai_all, norm='l2')
X_openai_some = normalize(X_openai_some,norm = 'l2')

In [19]:
reduced_df.New_Category.value_counts()

update issues        273
cable issue          112
data access issue     97
wifi issues           91
no details            84
vin scan              49
fan issue             48
hardware issue        41
freezes               36
decision accuracy     35
battery issue         34
software issue        29
power issue           26
shuts off             26
testing issue         22
display issue         17
temp sensor error     16
bmis                  14
Name: New_Category, dtype: int64

In [20]:
''' Simple label encoding is appropriate for encoding the category or target label values'
This assigns a unique integer value to each category or label name '''
y = reduced_df.New_Category.values
labels = reduced_df.New_Category.values
le = LabelEncoder()
# Fit the encoder to your categories and transform
y = le.fit_transform(labels)

# Classification (Random Forest)

### --- OpenAI Embeddings Classification (All Preprocessing Methods)---

In [21]:
#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split (X_openai_all,y,stratify = y, random_state=42,test_size=.30)

In [22]:
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(X_train_all,y_train_all)

In [23]:
y_pred_train_all = rfclf.predict(X_train_all)  # Predictions for train set
y_pred_test_all = rfclf.predict(X_test_all)   #Predictions for the test set

#Print predicitons for the train set
target_names = le.classes_ 
print(f'''-----Train Set Performance (Random Forest All)-----
Embedding Method: OpenAI 
Total Accuracy: {accuracy_score(y_train_all, y_pred_train_all)}\n
{classification_report(y_train_all, y_pred_train_all,target_names=target_names)}''')


-----Train Set Performance (Random Forest All)-----
Embedding Method: OpenAI 
Total Accuracy: 0.9755102040816327

                   precision    recall  f1-score   support

    battery issue       0.88      0.96      0.92        24
             bmis       1.00      0.90      0.95        10
      cable issue       0.99      0.99      0.99        78
data access issue       0.99      0.99      0.99        68
decision accuracy       0.96      0.92      0.94        25
    display issue       0.92      1.00      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      1.00      0.98        25
   hardware issue       1.00      0.90      0.95        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.89      0.94      0.92        18
   software issue       1.00      0.95      0.97        20
temp sensor error       0.92      1.00      0.96        11


In [24]:
#Print predictions for the test set
print(f'''-----Test Set Performance (Random Forest All)-----
Embedding Method: OpenAI
Total Accuracy:{accuracy_score(y_test_all, y_pred_test_all)}\n
{classification_report(y_test_all, y_pred_test_all,zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (Random Forest All)-----
Embedding Method: OpenAI
Total Accuracy:0.546031746031746

                   precision    recall  f1-score   support

    battery issue       0.40      0.20      0.27        10
             bmis       1.00      0.00      0.00         4
      cable issue       0.53      0.68      0.60        34
data access issue       0.48      0.76      0.59        29
decision accuracy       0.00      0.00      0.00        10
    display issue       1.00      0.00      0.00         5
        fan issue       1.00      0.14      0.25        14
          freezes       1.00      0.09      0.17        11
   hardware issue       0.33      0.08      0.13        12
       no details       0.95      0.80      0.87        25
      power issue       1.00      0.12      0.22         8
        shuts off       0.50      0.12      0.20         8
   software issue       0.00      0.00      0.00         9
temp sensor error       1.00      0.00      0.00         5
    

### --- OpenAI Embeddings Classification (Some Preprocessing Methods)---

In [25]:
#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split (X_openai_some,y,stratify = y, random_state=42,test_size=.30)

In [26]:
rfclf2 = RandomForestClassifier(random_state=42)
rfclf2.fit(X_train_some,y_train_some)

In [27]:
y_pred_train_some = rfclf2.predict(X_train_some)  # Predictions for train set
y_pred_test_some = rfclf2.predict(X_test_some)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Test Set Performance (Random Forest Some)----- 
Embedding Method: OpenAI
Total Accuracy: {accuracy_score(y_train_some, y_pred_train_some)}\n
{classification_report(y_train_some, y_pred_train_some,zero_division = 1,target_names=target_names)}''')


-----Test Set Performance (Random Forest Some)----- 
Embedding Method: OpenAI
Total Accuracy: 0.9782312925170068

                   precision    recall  f1-score   support

    battery issue       0.96      0.92      0.94        24
             bmis       1.00      1.00      1.00        10
      cable issue       0.97      0.99      0.98        78
data access issue       0.99      0.99      0.99        68
decision accuracy       1.00      0.96      0.98        25
    display issue       0.92      1.00      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      0.92      0.94        25
   hardware issue       0.93      0.97      0.95        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.94      0.97        18
        shuts off       1.00      0.94      0.97        18
   software issue       1.00      0.95      0.97        20
temp sensor error       0.92      1.00      0.96        11


In [28]:
#Print predictions for the test set
print(f'''-----Test Set Performance (Random Forest Some)-----
Embedding Method: OpenAI 
Total Accuracy:{accuracy_score(y_test_some, y_pred_test_some)}\n
{classification_report(y_test_some, y_pred_test_some, zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (Random Forest Some)-----
Embedding Method: OpenAI 
Total Accuracy:0.6222222222222222

                   precision    recall  f1-score   support

    battery issue       0.62      0.50      0.56        10
             bmis       1.00      0.00      0.00         4
      cable issue       0.60      0.88      0.71        34
data access issue       0.44      0.79      0.57        29
decision accuracy       0.00      0.00      0.00        10
    display issue       1.00      0.00      0.00         5
        fan issue       0.60      0.21      0.32        14
          freezes       0.75      0.27      0.40        11
   hardware issue       0.25      0.08      0.12        12
       no details       0.96      0.92      0.94        25
      power issue       1.00      0.12      0.22         8
        shuts off       0.50      0.12      0.20         8
   software issue       0.00      0.00      0.00         9
temp sensor error       1.00      0.00      0.00         5
 

### --- TFIDF Embeddings Classification (All Preprocessing Methods)---

In [29]:
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split (X_tfidf_all,y,stratify = y, random_state=42,test_size=.30)

In [30]:
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(X_train_all,y_train_all)

In [31]:
y_pred_train_all = rfclf.predict(X_train_all)  # Predictions for train set
y_pred_test_all = rfclf.predict(X_test_all)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (Random Forest All)-----
Embedding Method: TFIDF 
Total Accuracy: {accuracy_score(y_train_all, y_pred_train_all)}\n
{classification_report(y_train_all, y_pred_train_all,target_names=target_names)}''')

-----Train Set Performance (Random Forest All)-----
Embedding Method: TFIDF 
Total Accuracy: 0.9700680272108844

                   precision    recall  f1-score   support

    battery issue       0.85      0.96      0.90        24
             bmis       1.00      0.90      0.95        10
      cable issue       0.95      0.99      0.97        78
data access issue       0.99      0.99      0.99        68
decision accuracy       1.00      0.88      0.94        25
    display issue       0.92      1.00      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      0.92      0.94        25
   hardware issue       1.00      0.90      0.95        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.89      0.94      0.92        18
   software issue       1.00      0.90      0.95        20
temp sensor error       0.92      1.00      0.96        11
 

In [32]:
#Print predictions for the test set
print(f'''-----Test Set Performance (Random Forest All)-----
Embedding Method: TFIDF
Total Accuracy:{accuracy_score(y_test_all, y_pred_test_all)}\n
{classification_report(y_test_all, y_pred_test_all,zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (Random Forest All)-----
Embedding Method: TFIDF
Total Accuracy:0.6793650793650794

                   precision    recall  f1-score   support

    battery issue       0.50      0.50      0.50        10
             bmis       1.00      0.00      0.00         4
      cable issue       0.58      0.85      0.69        34
data access issue       0.60      0.86      0.70        29
decision accuracy       0.33      0.10      0.15        10
    display issue       1.00      0.00      0.00         5
        fan issue       0.78      0.50      0.61        14
          freezes       0.80      0.36      0.50        11
   hardware issue       0.25      0.08      0.12        12
       no details       0.62      0.92      0.74        25
      power issue       1.00      0.25      0.40         8
        shuts off       0.50      0.12      0.20         8
   software issue       0.00      0.00      0.00         9
temp sensor error       1.00      0.60      0.75         5
    

### --- TFIDF Embeddings Classification (Some Preprocessing Methods)---

In [33]:
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split (X_tfidf_some,y,stratify = y, random_state=42,test_size=.30)

In [34]:
rfclf2 = RandomForestClassifier(random_state=42)
rfclf2.fit(X_train_some,y_train_some)

In [35]:
y_pred_train_some = rfclf2.predict(X_train_some)  # Predictions for train set
y_pred_test_some = rfclf2.predict(X_test_some)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (Random Forest Some)----
Embedding Method: TFIDF 
Total Accuracy: {accuracy_score(y_train_some, y_pred_train_some)}\n
{classification_report(y_train_some, y_pred_train_some,zero_division = 1,target_names=target_names)}''')

-----Train Set Performance (Random Forest Some)----
Embedding Method: TFIDF 
Total Accuracy: 0.9700680272108844

                   precision    recall  f1-score   support

    battery issue       0.85      0.96      0.90        24
             bmis       1.00      0.90      0.95        10
      cable issue       0.96      0.97      0.97        78
data access issue       0.99      0.99      0.99        68
decision accuracy       1.00      0.88      0.94        25
    display issue       0.92      1.00      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      0.92      0.94        25
   hardware issue       1.00      0.90      0.95        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.89      0.94      0.92        18
   software issue       0.95      0.95      0.95        20
temp sensor error       0.92      1.00      0.96        11
 

In [36]:
#Print predictions for the test set
print(f'''-----Test Set Performance(Random Forest Some)-----
Embedding Method: TFIDF
Total Accuracy:{accuracy_score(y_test_some, y_pred_test_some)}\n
{classification_report(y_test_some, y_pred_test_some, zero_division = 1,target_names=target_names)}''')

-----Test Set Performance(Random Forest Some)-----
Embedding Method: TFIDF
Total Accuracy:0.6285714285714286

                   precision    recall  f1-score   support

    battery issue       0.62      0.50      0.56        10
             bmis       1.00      0.00      0.00         4
      cable issue       0.55      0.76      0.64        34
data access issue       0.47      0.79      0.59        29
decision accuracy       0.50      0.10      0.17        10
    display issue       1.00      0.00      0.00         5
        fan issue       0.67      0.29      0.40        14
          freezes       1.00      0.18      0.31        11
   hardware issue       0.50      0.17      0.25        12
       no details       0.85      0.92      0.88        25
      power issue       1.00      0.12      0.22         8
        shuts off       0.33      0.12      0.18         8
   software issue       0.00      0.00      0.00         9
temp sensor error       1.00      0.00      0.00         5
    

# Classification (XGBoost)

### --- OpenAI Embeddings Classification (All Preprocessing Methods)---

In [37]:
#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split (X_openai_all,y,stratify = y, random_state=42,test_size=.30)

In [38]:
xgbclf = XGBClassifier(objective='multi:softmax', random_state=42, n_jobs = -1)
xgbclf.fit(X_train_all,y_train_all)

In [39]:
y_pred_train_all = xgbclf.predict(X_train_all)  # Predictions for train set
y_pred_test_all = xgbclf.predict(X_test_all)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (XGBoost All)-----
Embedding Method: OpenAI
Total Accuracy: {accuracy_score(y_train_all, y_pred_train_all)}\n
{classification_report(y_train_all, y_pred_train_all,target_names=target_names)}''')


-----Train Set Performance (XGBoost All)-----
Embedding Method: OpenAI
Total Accuracy: 0.9755102040816327

                   precision    recall  f1-score   support

    battery issue       1.00      0.92      0.96        24
             bmis       0.91      1.00      0.95        10
      cable issue       0.95      1.00      0.97        78
data access issue       0.99      0.99      0.99        68
decision accuracy       0.92      0.96      0.94        25
    display issue       1.00      0.92      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      1.00      0.98        25
   hardware issue       0.96      0.93      0.95        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.94      0.89      0.91        18
   software issue       0.95      1.00      0.98        20
temp sensor error       1.00      0.91      0.95        11
    tes

In [40]:
#Print predictions for the test set
print(f'''-----Test Set Performance (XGBoost All)-----
Embedding Method: OpenAI
Total Accuracy:{accuracy_score(y_test_all, y_pred_test_all)}\n
{classification_report(y_test_all, y_pred_test_all,zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (XGBoost All)-----
Embedding Method: OpenAI
Total Accuracy:0.6222222222222222

                   precision    recall  f1-score   support

    battery issue       0.36      0.50      0.42        10
             bmis       0.00      0.00      0.00         4
      cable issue       0.58      0.76      0.66        34
data access issue       0.52      0.79      0.63        29
decision accuracy       0.00      0.00      0.00        10
    display issue       0.50      0.20      0.29         5
        fan issue       0.33      0.14      0.20        14
          freezes       0.60      0.27      0.37        11
   hardware issue       0.33      0.17      0.22        12
       no details       0.72      0.84      0.78        25
      power issue       1.00      0.12      0.22         8
        shuts off       0.50      0.12      0.20         8
   software issue       0.25      0.11      0.15         9
temp sensor error       1.00      0.20      0.33         5
    testi

### --- OpenAI Embeddings Classification (Some Preprocessing Methods)---

In [41]:
#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split (X_openai_some,y,stratify = y, random_state=42,test_size=.30)

In [42]:
xgbclf2 = XGBClassifier(objective='multi:softmax',num_class = 20, random_state=42, n_jobs = -1)
xgbclf2.fit(X_train_some,y_train_some)

In [43]:
y_pred_train_some = xgbclf2.predict(X_train_some)  # Predictions for train set
y_pred_test_some = xgbclf2.predict(X_test_some)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (XGBoost Some)----- 
Embedding Method: OpenAI
Total Accuracy: {accuracy_score(y_train_some, y_pred_train_some)}\n
{classification_report(y_train_some, y_pred_train_some,zero_division = 1,target_names=target_names)}''')


-----Train Set Performance (XGBoost Some)----- 
Embedding Method: OpenAI
Total Accuracy: 0.9782312925170068

                   precision    recall  f1-score   support

    battery issue       0.96      0.92      0.94        24
             bmis       1.00      1.00      1.00        10
      cable issue       0.96      0.99      0.97        78
data access issue       1.00      0.97      0.99        68
decision accuracy       1.00      0.96      0.98        25
    display issue       1.00      0.92      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      0.92      0.94        25
   hardware issue       0.97      0.97      0.97        29
       no details       1.00      1.00      1.00        59
      power issue       0.95      1.00      0.97        18
        shuts off       0.95      1.00      0.97        18
   software issue       0.95      1.00      0.98        20
temp sensor error       1.00      0.91      0.95        11
    t

In [44]:
#Print predictions for the test set
print(f'''-----Test Set Performance (XGBoost Some)----- 
Embedding Method: OpenAI
Total Accuracy:{accuracy_score(y_test_some, y_pred_test_some)}\n
{classification_report(y_test_some, y_pred_test_some, zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (XGBoost Some)----- 
Embedding Method: OpenAI
Total Accuracy:0.6412698412698413

                   precision    recall  f1-score   support

    battery issue       0.50      0.50      0.50        10
             bmis       1.00      0.00      0.00         4
      cable issue       0.57      0.76      0.65        34
data access issue       0.46      0.66      0.54        29
decision accuracy       0.17      0.10      0.12        10
    display issue       0.00      0.00      0.00         5
        fan issue       0.57      0.29      0.38        14
          freezes       0.40      0.36      0.38        11
   hardware issue       0.25      0.25      0.25        12
       no details       0.74      0.92      0.82        25
      power issue       0.33      0.12      0.18         8
        shuts off       0.33      0.12      0.18         8
   software issue       0.20      0.11      0.14         9
temp sensor error       1.00      0.40      0.57         5
    tes

### --- TFIDF Embeddings Classification (All Preprocessing Methods)---

In [45]:
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split (X_tfidf_all,y,stratify = y, random_state=42,test_size=.30)

In [46]:
xgbclf = XGBClassifier(objective='multi:softmax', random_state=42, n_jobs = -1)
xgbclf.fit(X_train_all,y_train_all)

In [47]:
y_pred_train_all = xgbclf.predict(X_train_all)  # Predictions for train set
y_pred_test_all = xgbclf.predict(X_test_all)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (XGBoost All)----- 
Embedding Method: TFIDF
Total Accuracy: {accuracy_score(y_train_all, y_pred_train_all)}\n
{classification_report(y_train_all, y_pred_train_all,target_names=target_names)}''')

-----Train Set Performance (XGBoost All)----- 
Embedding Method: TFIDF
Total Accuracy: 0.9687074829931973

                   precision    recall  f1-score   support

    battery issue       1.00      0.88      0.93        24
             bmis       0.91      1.00      0.95        10
      cable issue       0.96      0.97      0.97        78
data access issue       0.99      0.99      0.99        68
decision accuracy       0.96      0.92      0.94        25
    display issue       0.92      1.00      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.89      1.00      0.94        25
   hardware issue       0.90      0.93      0.92        29
       no details       0.98      1.00      0.99        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.94      0.89      0.91        18
   software issue       0.95      0.95      0.95        20
temp sensor error       0.92      1.00      0.96        11
    tes

In [48]:
#Print predictions for the test set
print(f'''-----Test Set Performance (XGBoost All)----- 
Embedding Method: TFIDF
Total Accuracy:{accuracy_score(y_test_all, y_pred_test_all)}\n
{classification_report(y_test_all, y_pred_test_all,zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (XGBoost All)----- 
Embedding Method: TFIDF
Total Accuracy:0.6952380952380952

                   precision    recall  f1-score   support

    battery issue       0.45      0.50      0.48        10
             bmis       0.50      0.25      0.33         4
      cable issue       0.74      0.85      0.79        34
data access issue       0.58      0.76      0.66        29
decision accuracy       0.27      0.30      0.29        10
    display issue       0.50      0.20      0.29         5
        fan issue       0.73      0.57      0.64        14
          freezes       0.67      0.36      0.47        11
   hardware issue       0.27      0.25      0.26        12
       no details       0.79      0.92      0.85        25
      power issue       0.40      0.25      0.31         8
        shuts off       0.22      0.25      0.24         8
   software issue       0.17      0.11      0.13         9
temp sensor error       1.00      0.60      0.75         5
    testi

### --- TFIDF Embeddings Classification (Some Preprocessing Methods)---

In [49]:
X_train_some, X_test_some, y_train_some, y_test_some = train_test_split (X_tfidf_some,y,stratify = y, random_state=42,test_size=.30)

In [50]:
xgbclf2 = XGBClassifier(objective='multi:softmax', random_state=42, n_jobs = -1)
xgbclf2.fit(X_train_some,y_train_some)

In [51]:
y_pred_train_some = xgbclf2.predict(X_train_some)  # Predictions for train set
y_pred_test_some = xgbclf2.predict(X_test_some)   #Predictions for the test set

#Print predicitons for the train set
print(f'''-----Train Set Performance (XGBoost Some)----- 
Embedding Method: TFIDF 
Total Accuracy: {accuracy_score(y_train_some, y_pred_train_some)}\n
{classification_report(y_train_some, y_pred_train_some,zero_division = 1,target_names=target_names)}''')

-----Train Set Performance (XGBoost Some)----- 
Embedding Method: TFIDF 
Total Accuracy: 0.9700680272108844

                   precision    recall  f1-score   support

    battery issue       0.96      0.92      0.94        24
             bmis       1.00      0.90      0.95        10
      cable issue       0.95      0.99      0.97        78
data access issue       1.00      0.97      0.99        68
decision accuracy       0.96      0.92      0.94        25
    display issue       1.00      0.92      0.96        12
        fan issue       0.97      1.00      0.99        34
          freezes       0.96      0.92      0.94        25
   hardware issue       0.90      0.93      0.92        29
       no details       1.00      1.00      1.00        59
      power issue       1.00      0.89      0.94        18
        shuts off       0.89      0.94      0.92        18
   software issue       0.95      0.95      0.95        20
temp sensor error       0.92      1.00      0.96        11
    t

In [52]:
#Print predictions for the test set
print(f'''-----Test Set Performance (XGBoost Some)----- 
Embedding Method: TFIDF
Total Accuracy:{accuracy_score(y_test_some, y_pred_test_some)}\n
{classification_report(y_test_some, y_pred_test_some, zero_division = 1,target_names=target_names)}''')

-----Test Set Performance (XGBoost Some)----- 
Embedding Method: TFIDF
Total Accuracy:0.6412698412698413

                   precision    recall  f1-score   support

    battery issue       0.45      0.50      0.48        10
             bmis       0.00      0.00      0.00         4
      cable issue       0.62      0.74      0.68        34
data access issue       0.46      0.66      0.54        29
decision accuracy       0.11      0.10      0.11        10
    display issue       0.00      0.00      0.00         5
        fan issue       0.50      0.29      0.36        14
          freezes       0.83      0.45      0.59        11
   hardware issue       0.27      0.25      0.26        12
       no details       0.92      0.96      0.94        25
      power issue       0.67      0.25      0.36         8
        shuts off       0.14      0.12      0.13         8
   software issue       0.00      0.00      0.00         9
temp sensor error       0.75      0.60      0.67         5
    test

## Preliminary Results

After comparing the different classification reports of Random Forest and XGBoost with all and some of the preprocessing steps applied, the best combination of model and preprocessing steps was XGBoost with TFIDF using all preprocessing steps. XGBoost with TFIDF and all processing steps acheived a test set accuracy score of .70 and an f1-score of .69 without hyperparameter tuning. The next step is to tune the xgboost hyperparameters to potentially increase accruacy and f1 score. If hyperparameter tuning is not enough, other methods such as ensemble learning will be implementd. 

---

# Optimized Hyperparameter Tuning with Optuna

Hyperparameter tuning with Optuna involves automatically searching for the best combination of hyperparameters for a machine learning model. Optuna uses an optimization algorithm, such as the Tree-structured Parzen Estimator (TPE), to efficiently explore the hyperparameter search space and find the optimal values.

In the code, the `objective` function defines the model and the hyperparameters to be optimized. Optuna suggests different hyperparameter values during the optimization process, and the `objective` function evaluates the model's performance using these suggested values. The evaluation metric, such as F1 score, is used to assess the model's quality. Optuna tracks the evaluation metric and adjusts the suggested hyperparameters accordingly.

The `study.optimize` method initiates the hyperparameter optimization process. It runs a specified number of trials, evaluating different combinations of hyperparameters. Optuna keeps track of the best-performing set of hyperparameters based on the evaluation metric. After all trials are completed, the best trial is retrieved using `study.best_trial`, and the optimal hyperparameters and their corresponding evaluation metric value are printed.

Overall, Optuna automates the process of hyperparameter tuning, efficiently exploring the hyperparameter space and identifying the best configuration for the given machine learning task.

In [54]:
X_train_all, X_temp_all, y_train_all, y_temp_all = train_test_split (X_tfidf_all,y,stratify = y, random_state=42,test_size=.30)
X_val_all, X_test_all, y_val_all, y_test_all = train_test_split (X_temp_all,y_temp_all,stratify = y_temp_all, random_state=42,test_size=.50)

## XGBoost Optimization

In [55]:

def objective(trial):
    """
    Hyperparameter optimization objective function using Optuna.
    
    Args:
        trial: Optuna trial object.
    
    Returns:
        float: Weighted F1 score obtained using the XGBoost model with optimized hyperparameters.
    """
    # Define hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'max_depth': trial.suggest_int('max_depth', 1, 3),
        'learning_rate': trial.suggest_float('learning_rate', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
        'gamma': trial.suggest_float('gamma', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 2.0),  # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 2.0),  # L1 regularization
    }

    model = XGBClassifier(**params, eval_metric='mlogloss', objective='multi:softprob', random_state=42,
                        callbacks=[optuna.integration.XGBoostPruningCallback(trial, observation_key="validation_0-mlogloss")],
                        tree_method='hist')
    model.fit(X_train_all, y_train_all,eval_set= [(X_val_all,y_val_all)],verbose = 0)
    y_pred = model.predict(X_val_all)
    f1 = f1_score(y_val_all, y_pred, average='weighted')
    
    return f1

# Create a study with a pruner
study = optuna.create_study(direction='maximize', pruner=HyperbandPruner())
study.optimize(objective, n_trials=200, n_jobs=-1, show_progress_bar=True)

#Print trial, value, and parameters with the best result.
print('Best trial:')
trial = study.best_trial

print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2023-08-02 18:43:27,411] A new study created in memory with name: no-name-18bfa29a-6733-40ea-8715-c027b033c464


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-08-02 18:43:39,813] Trial 1 finished with value: 0.6655290088027407 and parameters: {'n_estimators': 130, 'max_depth': 2, 'learning_rate': 0.8774948823587133, 'subsample': 0.8236790507540539, 'colsample_bytree': 0.473768865829419, 'gamma': 0.7125207457824918, 'reg_lambda': 0.4122492656428716, 'reg_alpha': 0.14853349841907745}. Best is trial 1 with value: 0.6655290088027407.
[I 2023-08-02 18:43:39,875] Trial 0 pruned. Trial was pruned at iteration 126.
[I 2023-08-02 18:43:39,883] Trial 7 pruned. Trial was pruned at iteration 127.
[I 2023-08-02 18:43:39,890] Trial 5 pruned. Trial was pruned at iteration 131.
[I 2023-08-02 18:43:41,425] Trial 3 finished with value: 0.6436525077915176 and parameters: {'n_estimators': 172, 'max_depth': 1, 'learning_rate': 0.45931714831810627, 'subsample': 0.751023142226025, 'colsample_bytree': 0.5952824313732179, 'gamma': 0.6096271656764756, 'reg_lambda': 1.0241438508683265, 'reg_alpha': 1.7308089740114365}. Best is trial 1 with value: 0.66552900880

Exception ignored in: <function ZipFile.__del__ at 0x106326660>
Traceback (most recent call last):
  File "/Users/josevera/miniconda3/envs/mlenv/lib/python3.11/zipfile.py", line 1872, in __del__
KeyboardInterrupt: 


KeyboardInterrupt: 

## Best XGB Params

In [26]:
best_xgb_params = {
    'n_estimators': 499,
    'max_depth': 1,
    'learning_rate': 0.6308805211760938,
    'subsample': 0.6248763597930178,
    'colsample_bytree': 0.34055509207982654,
    'gamma': 0.8793020954978124,
    'reg_lambda': 1.7276802719908124,
    'reg_alpha': 0.2200904663014992}

In [35]:

# Train the model with the best hyperparameters
best_xgb = XGBClassifier(**best_xgb_params, eval_metric='merror', objective='multi:softprob', random_state=42, tree_method='hist')

In [34]:
best_xgb.fit(X_train_all, y_train_all,eval_set= [(X_val_all,y_val_all)],verbose=1)

# Now you can use best_model to make predictions
y_pred = best_xgb.predict(X_val_all)


[0]	validation_0-merror:0.57325


[1]	validation_0-merror:0.55414
[2]	validation_0-merror:0.52229
[3]	validation_0-merror:0.45860
[4]	validation_0-merror:0.43949
[5]	validation_0-merror:0.40764
[6]	validation_0-merror:0.38217
[7]	validation_0-merror:0.36943
[8]	validation_0-merror:0.35669
[9]	validation_0-merror:0.36306
[10]	validation_0-merror:0.33758
[11]	validation_0-merror:0.31847
[12]	validation_0-merror:0.33121
[13]	validation_0-merror:0.35032
[14]	validation_0-merror:0.35669
[15]	validation_0-merror:0.34395
[16]	validation_0-merror:0.32484
[17]	validation_0-merror:0.33758
[18]	validation_0-merror:0.32484
[19]	validation_0-merror:0.30573
[20]	validation_0-merror:0.31210
[21]	validation_0-merror:0.32484
[22]	validation_0-merror:0.29936
[23]	validation_0-merror:0.31210
[24]	validation_0-merror:0.33121
[25]	validation_0-merror:0.34395
[26]	validation_0-merror:0.33121
[27]	validation_0-merror:0.33121
[28]	validation_0-merror:0.32484
[29]	validation_0-merror:0.33758
[30]	validation_0-merror:0.33121
[31]	validation_0-m

In [37]:
print(classification_report(y_val_all,y_pred,target_names=le.classes_))

                   precision    recall  f1-score   support

    battery issue       0.12      0.20      0.15         5
             bmis       1.00      0.50      0.67         2
      cable issue       0.74      0.82      0.78        17
data access issue       0.71      0.71      0.71        14
decision accuracy       0.33      0.60      0.43         5
    display issue       1.00      0.33      0.50         3
        fan issue       0.75      0.43      0.55         7
          freezes       0.75      0.60      0.67         5
   hardware issue       0.40      0.33      0.36         6
       no details       0.75      1.00      0.86        12
      power issue       0.50      0.50      0.50         4
        shuts off       1.00      0.25      0.40         4
   software issue       0.40      0.40      0.40         5
temp sensor error       1.00      0.67      0.80         3
    testing issue       1.00      0.25      0.40         4
    update issues       0.88      0.90      0.89       

In [38]:

# Now you can use best_model to make predictions
y_pred_test = best_xgb.predict(X_test_all)


In [39]:
print(classification_report(y_test_all,y_pred_test,target_names=le.classes_))

                   precision    recall  f1-score   support

    battery issue       1.00      0.80      0.89         5
             bmis       0.50      0.50      0.50         2
      cable issue       0.67      0.82      0.74        17
data access issue       0.67      0.67      0.67        15
decision accuracy       0.25      0.20      0.22         5
    display issue       0.50      0.50      0.50         2
        fan issue       0.80      0.57      0.67         7
          freezes       0.20      0.17      0.18         6
   hardware issue       0.20      0.17      0.18         6
       no details       0.79      0.85      0.81        13
      power issue       1.00      0.25      0.40         4
        shuts off       0.40      0.50      0.44         4
   software issue       0.00      0.00      0.00         4
temp sensor error       1.00      0.50      0.67         2
    testing issue       1.00      0.67      0.80         3
    update issues       0.87      0.98      0.92       

# Deep Learning Method (OpenAI Some)

After trying to improve the xgboost model with other xgboost models, I saw that perfomrance did not go up significantlly. After doing more research on text embeddings and classificaiton models for text, I learned that xgboost models are not capable of understanding the full information encoded by transoformer embeddings (OpenAI Embeddings). Instead, models like GPT, BERT are able to understand complex relationships between words/characters, by using deep learning models. The following cells explore the classificaiton performanc of deep learning on OpenAI's embeddings.

In [61]:
#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_some, X_temp_some, y_train_some, y_temp_some = train_test_split (X_openai_some,y,stratify = y, random_state=42,test_size=.30)
## Training neural networks, and optimizing hyperparameters can be imporved by providing a validation set 
X_val_some, X_test_some, y_val_some, y_test_some = train_test_split (X_temp_some,y_temp_some,stratify = y_temp_some, random_state=42,test_size=.50)

### Deep Learning Hyperparameter Tunning

In [24]:

# One-hot encode the labels
y_train_encoded = to_categorical(y_train_some)
y_val_encoded = to_categorical(y_val_some)
y_test_encoded = to_categorical(y_test_some)

#Convert X's to Tensors
X_train_tensor = tf.constant(X_train_some, dtype=tf.float32)
X_val_tensor = tf.constant(X_val_some, dtype=tf.float32)
X_test_tensor = tf.constant(X_test_some, dtype=tf.float32)

# Convert y's to TensorFlow tensors
y_train_tensor = tf.constant(y_train_encoded, dtype=tf.float32)
y_val_tensor = tf.constant(y_val_encoded, dtype=tf.float32)
y_test_tensor = tf.constant(y_test_encoded, dtype=tf.float32)


In [None]:


class_weights = class_weight.compute_class_weight(class_weight='balanced',classes= np.unique(y_train_some), y=y_train_some)
class_weights = dict(enumerate(class_weights))


def objective(trial):
    # Define the hyperparameters to tune
    optimizer_name = trial.suggest_categorical('optimizer_name', ['adam', 'adamax', 'nadam'])

    if optimizer_name == 'sgd':
        learning_rate = trial.suggest_float('sgd_learning_rate', 1e-6, 1e-2)
        optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=learning_rate)
    elif optimizer_name == 'sgd_with_momentum':
        learning_rate = trial.suggest_float('momentum_learning_rate', 1e-6, 1e-2)
        momentum = trial.suggest_float('momentum_momentum', 0.1, 0.9)
        optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=learning_rate, momentum=momentum)
    elif optimizer_name == 'rmsprop':
        learning_rate = trial.suggest_float('rmsprop_learning_rate', 1e-6, 1e-2)
        rho = trial.suggest_float('rmsprop_rho', 0.1, 0.9)
        optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate=learning_rate, rho=rho)
    elif optimizer_name == 'adam':
        learning_rate = trial.suggest_float('adam_learning_rate', 1e-6, 1e-2)
        beta_1 = trial.suggest_float('adam_beta_1', 0.8, 0.99)
        beta_2 = trial.suggest_float('adam_beta_2', 0.9, 0.9999)
        optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)
    elif optimizer_name == 'adamax':
        learning_rate = trial.suggest_float('adamax_learning_rate', 1e-6, 1e-2)
        beta_1 = trial.suggest_float('adamax_beta_1', 0.8, 0.99)
        beta_2 = trial.suggest_float('adamax_beta_2', 0.9, 0.9999)
        optimizer = tf.keras.optimizers.legacy.Adamax(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)
    elif optimizer_name == 'nadam':
        learning_rate = trial.suggest_float('nadam_learning_rate', 1e-6, 1e-2)
        beta_1 = trial.suggest_float('nadam_beta_1', 0.8, 0.99)
        beta_2 = trial.suggest_float('nadam_beta_2', 0.9, 0.9999)
        optimizer = tf.keras.optimizers.legacy.Nadam(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2)

        
        '''{'dropout_rate': 0.10921940617643529,
 'learning_rate': 0.0004922542982014772,
 'dense_units': 762,
 'l2_regularization_exp': -5.378788139919697,
 'l1_regularization_exp': -5.325881561279877,
 'epochs': 26,
 'batch_size': 16,
 'weight_initializer': 'random_uniform',
 'activation_function': 'relu'}'''
    
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 1)
    dropout_rate2 = trial.suggest_float('dropout_rate2', 0.1, 1)
    dense_units = trial.suggest_int('dense_units', 8, 1024)
    l2_regularization_exp = trial.suggest_float('l2_regularization_exp', -6, 1)
    l1_regularization_exp = trial.suggest_float('l1_regularization_exp', -6, 1)
    batch_size = trial.suggest_categorical('batch_size', [4,8,16,32, 64, 128, 256])
    weight_initializer = trial.suggest_categorical('weight_initializer', ['random_uniform', 'random_normal', 'glorot_uniform'])
    activation_function = trial.suggest_categorical('activation_function', ['relu', 'sigmoid', 'tanh','leaky_relu'])
    epochs = trial.suggest_int('epochs',5,40)


    # Calculate l2_regularization from the exponent
    l2_regularization = math.pow(10, l2_regularization_exp)
    l1_regularization = math.pow(10, l1_regularization_exp)

    
    # Define the model
    def create_model():
        model = Sequential()
        model.add(Dense(dense_units, input_dim=1536, 
                        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=l1_regularization, l2=l2_regularization),
                        kernel_initializer=weight_initializer))
        model.add(Dropout(dropout_rate2))
        model.add(BatchNormalization())
        model.add(LeakyReLU() if activation_function == 'leaky_relu' else Activation(activation_function))
        model.add(Dropout(dropout_rate))
        model.add(BatchNormalization())
        model.add(Dense(18, activation='softmax'))
        return model

    # Create the model inside the objective function to avoid TensorFlow retracing warning
    model = create_model()
    model.compile(loss='categorical_crossentropy',optimizer=optimizer, metrics=[tf.keras.metrics.F1Score('weighted')])
    lr_scheduler = ReduceLROnPlateau(monitor='val_f1_score')


    # Train the model
    model.fit(X_train_tensor, y_train_tensor, validation_data=(X_val_tensor, y_val_tensor),
              epochs=epochs, batch_size=batch_size, callbacks=[lr_scheduler,TFKerasPruningCallback(trial, 'val_f1_score')],
              verbose=0,workers = 1,class_weight=class_weights)

    # Predict the classes on the validation data
    y_pred = model.predict(X_val_tensor)
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Calculate the weighted F1-score
    f1 = f1_score(y_val_some, y_pred_classes, average='weighted')

    return f1

# Create an Optuna study and optimize the objective function 
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.HyperbandPruner(min_resource = 20))
study.optimize(objective, n_trials=300, n_jobs= 8,show_progress_bar=True)  # Set n_jobs=1 to avoid the TensorFlow retracing warning

# Print the best hyperparameters and the corresponding accuracy
best_params = study.best_params
best_f1 = study.best_value

# Print the best hyperparameters and the corresponding accuracy
print("Best Hyperparameters:", best_params)
print("Best F1-Score:", best_f1)

In [None]:
best_params

In [None]:
#Best Neural Network Parameters
{
 'optimizer_name': 'adam',
 'learning_rate': 7.322790338548657e-07,
 'beta_1': 0.9446718096733093,
 'beta_2': 0.900862991809845,
 'dropout_rate': 0.11800425769664818, 
 'dropout_rate2': 0.564894261824532,
 'learning_rate': 'Not provided in the configuration',
 'dense_units': 125,
 'l2_regularization_exp': 6.953548108867835e-06,
 'l1_regularization_exp': 1.0231772193947108e-06,
 'epochs': 26,
 'batch_size': 16,
 'weight_initializer': 'random_normal',
 'activation_function': 'sigmoid'
}


In [98]:
def create_best_model():
    model = Sequential()
    model.add(Dense(best_params['dense_units'], input_dim=1536, 
                    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=math.pow(10, best_params['l1_regularization_exp']), 
                                                                  l2=math.pow(10, best_params['l2_regularization_exp'])),
                    kernel_initializer=best_params['weight_initializer']))
    model.add(Dropout(best_params['dropout_rate']))
    model.add(BatchNormalization())
    model.add(LeakyReLU() if best_params['activation_function'] == 'leaky_relu' else Activation(best_params['activation_function']))
    model.add(Dropout(best_params['dropout_rate2']))
    model.add(BatchNormalization())
    model.add(Dense(18, activation='softmax'))
    return model

# Create the best model
best_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=best_params['adam_learning_rate'], beta_1=best_params['adam_beta_1'], beta_2=best_params['adam_beta_2'])
best_model = create_best_model()

best_model.compile(loss='categorical_crossentropy', optimizer=best_optimizer, metrics=[tf.keras.metrics.F1Score('weighted')])

lr_scheduler = ReduceLROnPlateau(monitor='val_f1_score')

# Train the best model
best_model.fit(X_train_some, y_train_encoded, validation_data=(X_val_some, y_val_encoded),
          epochs=50, batch_size=best_params['batch_size'], callbacks=[lr_scheduler],
          verbose=1,workers = 8,class_weight=class_weights)


# Evaluate the model on the test dataset
_, f1 = best_model.evaluate(X_test_some, y_test_encoded, verbose=0)
print("Test F1_Score: ", f1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test F1_Score:  0.7496675252914429


In [101]:
best_model.save('best_neural_network')

INFO:tensorflow:Assets written to: nn5/assets


INFO:tensorflow:Assets written to: nn5/assets


In [103]:
# Assuming you have trained and evaluated your model on the test dataset
y_pred = best_model.predict(X_val_some)
y_pred_classes = np.argmax(y_pred, axis=1)

# Generate the classification report
classification_rep = classification_report(y_val_some, y_pred_classes, target_names=target_names)

# Print the classification report
print(classification_rep)


                   precision    recall  f1-score   support

    battery issue       0.17      0.20      0.18         5
             bmis       0.25      0.50      0.33         2
      cable issue       0.81      0.76      0.79        17
data access issue       0.69      0.64      0.67        14
decision accuracy       0.45      1.00      0.62         5
    display issue       0.75      1.00      0.86         3
        fan issue       0.71      0.71      0.71         7
          freezes       0.75      0.60      0.67         5
   hardware issue       0.50      0.67      0.57         6
       no details       0.92      1.00      0.96        12
      power issue       1.00      0.50      0.67         4
        shuts off       1.00      0.25      0.40         4
   software issue       0.20      0.20      0.20         5
temp sensor error       1.00      0.67      0.80         3
    testing issue       0.67      0.50      0.57         4
    update issues       0.94      0.80      0.87       

In [25]:
# Save the model to disk
# best_model.save('best_neural_network')

# Load the model from disk
loaded_model = tf.keras.models.load_model('best_neural_network')

In [26]:

# Assuming you have trained and evaluated your model on the test dataset
y_pred = loaded_model.predict(X_val_some)
y_pred_classes = np.argmax(y_pred, axis=1)

# Generate the classification report
target_names = le.classes_  # Replace with your actual class names
classification_rep = classification_report(y_val_some, y_pred_classes, target_names=target_names)

# Print the classification report
print(classification_rep)


                   precision    recall  f1-score   support

    battery issue       0.17      0.20      0.18         5
             bmis       0.25      0.50      0.33         2
      cable issue       0.81      0.76      0.79        17
data access issue       0.69      0.64      0.67        14
decision accuracy       0.45      1.00      0.62         5
    display issue       0.75      1.00      0.86         3
        fan issue       0.71      0.71      0.71         7
          freezes       0.75      0.60      0.67         5
   hardware issue       0.50      0.67      0.57         6
       no details       0.92      1.00      0.96        12
      power issue       1.00      0.50      0.67         4
        shuts off       1.00      0.25      0.40         4
   software issue       0.20      0.20      0.20         5
temp sensor error       1.00      0.67      0.80         3
    testing issue       0.67      0.50      0.57         4
    update issues       0.94      0.80      0.87       

In [29]:
loaded_model.optimizer.get_config()

{'name': 'Adam',
 'learning_rate': 7.322790338548657e-07,
 'decay': 0.0,
 'beta_1': 0.9446718096733093,
 'beta_2': 0.900862991809845,
 'epsilon': 1e-07,
 'amsgrad': False}

# Testing on more CPX data

In [438]:
new_descriptions = pd.read_excel('CPX RMA Data 2022-2023.xlsx')[['Customer Reason']].copy()
new_descriptions.columns = ['New_Description']
new_descriptions['New_Description'] = new_descriptions['New_Description'].astype(str)
new_descriptions = new_descriptions.dropna().drop_duplicates()

In [456]:
#Preprocess New Description and apply Embedding


'''new_descriptions['New_Processed_Description'] = new_descriptions['New_Description'].apply(preprocess_text)
new_desc_embeddings = new_descriptions['New_Processed_Description'].apply(create_embedding)
new_descriptions['Embedding'] = new_desc_embeddings
new_descriptions.to_csv('Processed_CPX_2022_2023.csv',index = False)'''

new_descriptions=pd.read_csv('Processed_CPX_2022_2023.csv')['New_Description'].astype(str)

In [459]:
new_embedded_descriptions = pd.read_csv('Processed_CPX_2022_2023.csv')[['Embedding']]

In [461]:
new_embedding_matrix = np.stack(new_embedded_descriptions.Embedding.apply(eval))

In [462]:
X_new_embeddings = normalize(new_embedding_matrix, norm='l2')


In [463]:
y_pred_proba = loaded_model.predict(X_new_embeddings)
y_pred_classes = np.argmax(y_pred_proba, axis=1)



In [464]:
predicted_classes = le.inverse_transform(y_pred_classes)

In [471]:
new_description_results = pd.DataFrame(new_descriptions)

In [473]:
new_description_results['Predicted_Category'] = predicted_classes

# Deep Learning + XGBoost

In [27]:
X_train_all, X_temp_all, y_train_all, y_temp_all = train_test_split (X_tfidf_all,y,stratify = y, random_state=42,test_size=.30)
X_val_all, X_test_all, y_val_all, y_test_all = train_test_split(X_temp_all,y_temp_all, stratify=y_temp_all, random_state=42, test_size=.50)

#Split data into testing and training data set. Set the testing size to 30% and stratify the data on y
X_train_some, X_temp_some, y_train_some, y_temp_some = train_test_split (X_openai_some,y,stratify = y, random_state=42,test_size=.30)
## Training neural networks, and optimizing hyperparameters can be imporved by providing a validation set 
X_val_some, X_test_some, y_val_some, y_test_some = train_test_split (X_temp_some,y_temp_some,stratify = y_temp_some, random_state=42,test_size=.50)

In [28]:
y_val_some.size

157

In [29]:
best_xgb = XGBClassifier(**best_xgb_params,eval_metric='mlogloss', objective='multi:softprob', tree_method='hist',n_jobs = -1,random_state = 42)

best_xgb.fit(X_train_all, y_train_all)

y_proba_xgb = best_xgb.predict_proba(X_val_all)
y_pred_xgb = best_xgb.predict(X_val_all)


In [30]:
target_names = le.classes_  # Replace with your actual class names
print(classification_report(y_val_all, y_pred_xgb,target_names = le.classes_))

                   precision    recall  f1-score   support

    battery issue       0.12      0.20      0.15         5
             bmis       1.00      0.50      0.67         2
      cable issue       0.74      0.82      0.78        17
data access issue       0.71      0.71      0.71        14
decision accuracy       0.33      0.60      0.43         5
    display issue       1.00      0.33      0.50         3
        fan issue       0.75      0.43      0.55         7
          freezes       0.75      0.60      0.67         5
   hardware issue       0.40      0.33      0.36         6
       no details       0.75      1.00      0.86        12
      power issue       0.50      0.50      0.50         4
        shuts off       1.00      0.25      0.40         4
   software issue       0.40      0.40      0.40         5
temp sensor error       1.00      0.67      0.80         3
    testing issue       1.00      0.25      0.40         4
    update issues       0.88      0.90      0.89       

In [31]:
y_proba_xgb_test = best_xgb.predict_proba(X_test_all)
y_pred_xgb_test = best_xgb.predict(X_test_all)
print(classification_report(y_test_all, y_pred_xgb_test,target_names = le.classes_))

                   precision    recall  f1-score   support

    battery issue       1.00      0.80      0.89         5
             bmis       0.50      0.50      0.50         2
      cable issue       0.67      0.82      0.74        17
data access issue       0.67      0.67      0.67        15
decision accuracy       0.25      0.20      0.22         5
    display issue       0.50      0.50      0.50         2
        fan issue       0.80      0.57      0.67         7
          freezes       0.20      0.17      0.18         6
   hardware issue       0.20      0.17      0.18         6
       no details       0.79      0.85      0.81        13
      power issue       1.00      0.25      0.40         4
        shuts off       0.40      0.50      0.44         4
   software issue       0.00      0.00      0.00         4
temp sensor error       1.00      0.50      0.67         2
    testing issue       1.00      0.67      0.80         3
    update issues       0.87      0.98      0.92       

## Load Deep Learning Model

In [59]:
# Load the model from disk
loaded_model = tf.keras.models.load_model('best_neural_network')
loaded_model2 = tf.keras.models.load_model('nn4')


In [60]:

##Best DL Model
y_proba_dlv = loaded_model.predict(X_val_some)
y_pred_dlv = np.argmax(y_proba_dlv, axis=1)

##Second Best DL Model
y_proba_dlv2 = loaded_model2.predict(X_val_some)
y_pred_dlv2 = np.argmax(y_proba_dlv2, axis=1)

# Generate the classification report
target_names = le.classes_  # Replace with your actual class names

print('Validation Performance:\n')
print(classification_report(y_val_some, y_pred_dlv, target_names=target_names))

Validation Performance:

                   precision    recall  f1-score   support

    battery issue       0.17      0.20      0.18         5
             bmis       0.25      0.50      0.33         2
      cable issue       0.81      0.76      0.79        17
data access issue       0.69      0.64      0.67        14
decision accuracy       0.45      1.00      0.62         5
    display issue       0.75      1.00      0.86         3
        fan issue       0.71      0.71      0.71         7
          freezes       0.75      0.60      0.67         5
   hardware issue       0.50      0.67      0.57         6
       no details       0.92      1.00      0.96        12
      power issue       1.00      0.50      0.67         4
        shuts off       1.00      0.25      0.40         4
   software issue       0.20      0.20      0.20         5
temp sensor error       1.00      0.67      0.80         3
    testing issue       0.67      0.50      0.57         4
    update issues       0.94  

In [65]:
# Assuming you have trained and evaluated your model on the test dataset
y_proba_dlt = loaded_model.predict(X_test_some)
y_pred_dlt = np.argmax(y_proba_dlt, axis=1)


##Second Best DL Model
y_proba_dlt2 = loaded_model2.predict(X_test_some)
y_pred_dlt2 = np.argmax(y_proba_dlt2, axis=1)

# Generate the classification report
target_names = le.classes_  # Replace with your actual class names
print('Test Performance:\n')
print(classification_report(y_test_some, y_pred_dlt, target_names=target_names))

Test Performance:

                   precision    recall  f1-score   support

    battery issue       0.83      1.00      0.91         5
             bmis       0.67      1.00      0.80         2
      cable issue       0.72      0.76      0.74        17
data access issue       0.63      0.80      0.71        15
decision accuracy       0.14      0.20      0.17         5
    display issue       0.50      0.50      0.50         2
        fan issue       0.71      0.71      0.71         7
          freezes       0.80      0.67      0.73         6
   hardware issue       0.20      0.17      0.18         6
       no details       0.86      0.92      0.89        13
      power issue       0.67      0.50      0.57         4
        shuts off       0.60      0.75      0.67         4
   software issue       0.33      0.25      0.29         4
temp sensor error       1.00      0.50      0.67         2
    testing issue       0.67      0.67      0.67         3
    update issues       0.97      0.

In [54]:
y_test_some

array([17, 15, 15,  9, 15, 15, 15,  0, 15,  9, 15,  7, 15, 15, 16, 17,  7,
       15,  8, 17, 15,  1,  9,  3, 15, 15,  4,  2,  2, 15,  2, 17,  3, 15,
       17,  8,  5, 12,  7, 11, 15,  6, 15,  0, 15,  6,  3,  2,  3, 12,  2,
        9, 15,  4,  9, 17,  9,  2,  7, 15, 17,  3,  6,  2, 15,  2,  3, 15,
       15,  7, 15,  3,  8, 16, 10, 10,  6,  8, 15,  9, 16, 15,  2,  3, 15,
        9,  7,  9,  1, 16, 15, 15,  3, 17,  2, 12, 10, 12, 15, 15,  5, 14,
       15,  2, 17,  4, 15,  9,  4, 15,  8,  9, 15, 16,  4,  6,  8, 15, 16,
        3,  6,  2, 15, 11, 13,  3, 13, 15,  0, 10,  2,  2,  0, 16,  0, 15,
        3, 17, 17,  2,  2, 15,  3, 15,  3, 11, 17, 14,  6, 17,  3, 11, 16,
       17,  9, 14,  2,  9])

In [62]:
from sklearn.linear_model import LogisticRegression
X_meta = np.concatenate([y_proba_dlv,y_proba_dlv2], axis=1)

# Train a meta-learner model on the predicted probabilities
# Here we use XGBoost as the meta-learner
meta_learner = XGBClassifier(random_state=42,eval_metric='mlogloss',objective = 'multi:softmax')
meta_learner.fit(X_meta, y_val_some)  # Assuming y_test_all and y_test_some are the same


In [63]:
y_proba_dlv = loaded_model.predict(X_val_some)
X_meta_val = np.concatenate([y_proba_dlv, y_proba_dlv2], axis=1)
y_pred_meta = meta_learner.predict(X_meta_val)

print(classification_report(y_val_some, y_pred_meta,target_names = le.classes_))

                   precision    recall  f1-score   support

    battery issue       1.00      1.00      1.00         5
             bmis       1.00      1.00      1.00         2
      cable issue       1.00      1.00      1.00        17
data access issue       1.00      1.00      1.00        14
decision accuracy       1.00      1.00      1.00         5
    display issue       1.00      1.00      1.00         3
        fan issue       1.00      1.00      1.00         7
          freezes       1.00      1.00      1.00         5
   hardware issue       1.00      1.00      1.00         6
       no details       1.00      1.00      1.00        12
      power issue       1.00      1.00      1.00         4
        shuts off       1.00      0.75      0.86         4
   software issue       1.00      1.00      1.00         5
temp sensor error       1.00      1.00      1.00         3
    testing issue       1.00      1.00      1.00         4
    update issues       0.98      1.00      0.99       

In [66]:
X_meta_test = np.concatenate([y_proba_dlt, y_proba_dlt2], axis=1)

In [69]:
y_pred_meta = meta_learner.predict(X_meta_test)

print(classification_report(y_test_some, y_pred_meta,target_names = le.classes_))

                   precision    recall  f1-score   support

    battery issue       0.83      1.00      0.91         5
             bmis       1.00      0.50      0.67         2
      cable issue       0.68      0.76      0.72        17
data access issue       0.67      0.67      0.67        15
decision accuracy       0.50      0.20      0.29         5
    display issue       0.50      0.50      0.50         2
        fan issue       0.60      0.86      0.71         7
          freezes       0.67      0.67      0.67         6
   hardware issue       0.29      0.33      0.31         6
       no details       1.00      0.85      0.92        13
      power issue       0.00      0.00      0.00         4
        shuts off       0.33      0.50      0.40         4
   software issue       0.00      0.00      0.00         4
temp sensor error       1.00      0.50      0.67         2
    testing issue       0.33      0.33      0.33         3
    update issues       0.91      0.95      0.93       

##  Meta Learner Hyper Parameter Tuning

In [72]:
import numpy as np

def objective(trial):
    # Sample hyperparameters for the meta-learner
    max_depth = trial.suggest_int("max_depth", 1, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    subsample = trial.suggest_float("subsample", 0.2, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 0.0, 1.0)
    reg_lambda = trial.suggest_float("reg_lambda", 0.0, 1.0)
    
    # Train the meta-learner on the predicted probabilities
    meta_learner = XGBClassifier(
        random_state=42,
        eval_metric='merror',
        objective='multi:softmax',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample = subsample,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        colsample_bytree=colsample_bytree,
        callbacks=[optuna.integration.XGBoostPruningCallback(trial, observation_key="validation_0-merror")]
    )



    # Get prediction for the val and test sets from learners
    #Val Set
    X_meta_val = np.concatenate([y_proba_dlv,y_proba_dlv2], axis=1)

    #Test Set
    y_proba_dlt = loaded_model.predict(X_test_some)
    X_meta_test = np.concatenate([y_proba_dlt,y_proba_dlt2], axis=1)
    #Build Meta Learner
    meta_learner.fit(X_meta_val, y_val_some,eval_set = [(X_meta_test,y_test_some)],verbose = 0)
    y_pred_meta = meta_learner.predict(X_meta_test)
    
    # Calculate and return the accuracy as the objective
    f1 = f1_score(y_test_some,y_pred_meta,average = 'weighted')
    return f1

# Run the optimization using Hyperband pruning
study = optuna.create_study(
    direction="maximize",
    pruner=HyperbandPruner(min_resource=25)
)

study.optimize(objective, n_trials=300,n_jobs = -1, show_progress_bar = True)

[I 2023-08-16 12:52:55,442] A new study created in memory with name: no-name-fd03715a-2f7c-4864-9791-1a40aeb797f9


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2023-08-16 12:52:58,040] Trial 0 finished with value: 0.6430878006455072 and parameters: {'max_depth': 10, 'learning_rate': 0.028894143326076368, 'n_estimators': 60, 'subsample': 0.3336614741312672, 'colsample_bytree': 0.6548827083255392, 'reg_alpha': 0.1530066144102149, 'reg_lambda': 0.8762655880515675}. Best is trial 0 with value: 0.6430878006455072.
[I 2023-08-16 12:52:58,071] Trial 6 pruned. Trial was pruned at iteration 44.
[I 2023-08-16 12:52:58,088] Trial 1 pruned. Trial was pruned at iteration 49.
[I 2023-08-16 12:52:58,818] Trial 10 pruned. Trial was pruned at iteration 25.
[I 2023-08-16 12:52:59,792] Trial 3 pruned. Trial was pruned at iteration 75.
[I 2023-08-16 12:52:59,876] Trial 2 pruned. Trial was pruned at iteration 75.
[I 2023-08-16 12:53:00,981] Trial 12 pruned. Trial was pruned at iteration 25.
[I 2023-08-16 12:53:01,134] Trial 9 pruned. Trial was pruned at iteration 75.
[I 2023-08-16 12:53:01,185] Trial 13 pruned. Trial was pruned at iteration 25.
[I 2023-08-16 1

In [73]:
study.best_params

{'max_depth': 7,
 'learning_rate': 0.037914583026431935,
 'n_estimators': 146,
 'subsample': 0.5736373943198284,
 'colsample_bytree': 0.8749771610457713,
 'reg_alpha': 0.45261287773891445,
 'reg_lambda': 0.8360368643167558}

In [74]:
best_meta_params = {'max_depth': 3,
 'learning_rate': 0.04097970899174354,
 'n_estimators': 148,
 'subsample': 0.5813429245642656,
 'colsample_bytree': 0.5029078289354505,
 'reg_alpha': 0.23997676193839024,
 'reg_lambda': 0.7623265939843972}


In [75]:
# Train the meta-learner on the predicted probabilities
meta_learner = XGBClassifier(
    random_state=42,
    eval_metric='merror',
    objective='multi:softmax',**best_meta_params)
    
meta_learner.fit(X_meta, y_val_some,eval_set = [(X_meta_test,y_test_some)],verbose = 2)
    
# Predict with the meta-learner on the test set
y_proba_dlt = loaded_model.predict(X_test_some)
    
X_meta_test = np.concatenate([y_proba_dlt,y_proba_dlt2], axis=1)
y_pred_meta = meta_learner.predict(X_meta_test)
    
# Calculate and return the accuracy as the objective
f1 = f1_score(y_test_some,y_pred_meta,average = 'weighted')
print(f1)
print(classification_report(y_test_some,y_pred_meta,target_names = le.classes_))

[0]	validation_0-merror:0.43671
[2]	validation_0-merror:0.37975
[4]	validation_0-merror:0.36709
[6]	validation_0-merror:0.32911
[8]	validation_0-merror:0.31646
[10]	validation_0-merror:0.31646
[12]	validation_0-merror:0.29747
[14]	validation_0-merror:0.27848
[16]	validation_0-merror:0.28481
[18]	validation_0-merror:0.28481
[20]	validation_0-merror:0.28481
[22]	validation_0-merror:0.27215
[24]	validation_0-merror:0.26582
[26]	validation_0-merror:0.26582
[28]	validation_0-merror:0.25949
[30]	validation_0-merror:0.26582
[32]	validation_0-merror:0.26582
[34]	validation_0-merror:0.26582
[36]	validation_0-merror:0.26582
[38]	validation_0-merror:0.27215
[40]	validation_0-merror:0.27215
[42]	validation_0-merror:0.26582
[44]	validation_0-merror:0.26582
[46]	validation_0-merror:0.27848
[48]	validation_0-merror:0.27215
[50]	validation_0-merror:0.27215
[52]	validation_0-merror:0.27215
[54]	validation_0-merror:0.27215
[56]	validation_0-merror:0.26582
[58]	validation_0-merror:0.27848
[60]	validation

## Conclusion

Ultimately, deep learning proved to be the best performing classifier. Using the predicted probabilies of the two best deep learning models to train a third xgboost model (meta learner) did not reproduce any imporvements in results. Other methods of stacking have not been fully explored, so there may still be other methods of improving model performance without additional training data.