In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import langdetect 
import spacy
from sklearn import feature_extraction, manifold
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the raw dataset from the working directory and preview its first rows.
raw_csv_path = '1.csv'
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Rename the label and headline columns to the short names ("y", "text")
# that the rest of the notebook reads.
column_aliases = {"category": "y", "headline": "text"}
df = df.rename(columns=column_aliases)

In [None]:
# Count the missing values in each column.
df.isna().sum()


In [None]:
# Drop every row that has at least one missing value.
df = df.dropna()

In [None]:
# Re-count the missing values per column after the drop.
df.isna().sum()

# Task 1. Data understanding 

In [None]:
# Horizontal bar chart of label frequencies (target variable distribution).
x = "y"
fig, ax = plt.subplots()
fig.suptitle(x, fontsize=12)

# reset_index() adds an "index" column; counting it per group gives the
# number of rows for each label.
label_counts = df[x].reset_index().groupby(x).count()
label_counts = label_counts.sort_values(by="index")

label_axes = label_counts.plot(kind="barh", legend=False, ax=ax)
label_axes.grid(axis='x')
plt.show()

### The bar chart shows that there are more TRAVEL headlines than HOME & LIVING headlines

In [None]:
# langdetect is probabilistic: without a fixed seed the same short text can be
# labelled differently from run to run.
langdetect.DetectorFactory.seed = 0


def detect_language(text):
    """Return the detected language code for ``text``.

    Returns "" when the text is blank or when langdetect cannot extract any
    features from it (for example text made only of digits or symbols), in
    which case ``langdetect.detect`` raises ``LangDetectException``.
    """
    text = str(text)
    if text.strip() == "":
        return ""
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        return ""


df['lang'] = df["text"].apply(detect_language)
df.head()

In [None]:
# Horizontal bar chart of how many headlines were detected per language.
x = "lang"
fig, ax = plt.subplots()
fig.suptitle(x, fontsize=12)

# reset_index() adds an "index" column; counting it per group gives the
# number of rows for each language code.
lang_counts = df[x].reset_index().groupby(x).count()
lang_counts = lang_counts.sort_values(by="index")

lang_axes = lang_counts.plot(kind="barh", legend=False, ax=ax)
lang_axes.grid(axis='x')
plt.show()

### English is clearly the primary language, so we keep only the English-language rows

In [None]:
# Keep only the rows detected as English. The explicit copy makes the result
# an independent frame, so the column assignments in later cells do not
# trigger SettingWithCopyWarning.
df = df[df["lang"] == "en"].copy()

## Text pre-processing

In [None]:
# Download the WordNet corpus; lemmatize_text below builds a WordNetLemmatizer.
nltk.download('wordnet')

def clean_text(text):
    """Lowercase and trim ``text``, then delete every character that is
    neither a word character (letter, digit, underscore) nor whitespace.

    ``text`` is converted with ``str()`` first, so non-string input is accepted.
    """
    normalized = str(text).lower().strip()
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', normalized)

def remove_stopwords(lst_text, lst_stopwords):
    """Return the tokens of ``lst_text`` that are not in ``lst_stopwords``,
    keeping their original order."""
    kept_tokens = []
    for token in lst_text:
        if token in lst_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def stem_text(lst_text, flg_stemm):
    """Return the Porter stem of every token when ``flg_stemm`` is truthy;
    otherwise return ``lst_text`` unchanged."""
    if not flg_stemm:
        return lst_text
    stemmer = nltk.stem.porter.PorterStemmer()
    return list(map(stemmer.stem, lst_text))

def lemmatize_text(lst_text, flg_lemm):
    """Return the WordNet lemma of every token when ``flg_lemm`` is truthy;
    otherwise return ``lst_text`` unchanged."""
    if not flg_lemm:
        return lst_text
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, lst_text))

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Clean ``text`` and return its processed tokens joined by single spaces.

    Steps, in order: clean_text, whitespace tokenisation, stopword removal
    (skipped when ``lst_stopwords`` is None), stem_text, lemmatize_text.
    """
    tokens = clean_text(text).split()
    if lst_stopwords is not None:
        tokens = remove_stopwords(tokens, lst_stopwords)
    tokens = lemmatize_text(stem_text(tokens, flg_stemm), flg_lemm)
    return " ".join(tokens)

In [None]:
# The stopword corpus has to be downloaded before it can be read; without
# this call a fresh environment raises LookupError on the next line.
nltk.download('stopwords')

# clean_text() strips apostrophes from the headlines before stopwords are
# removed, so entries such as "don't" could never match a cleaned token
# ("dont"). Normalise the stopwords with the same rule; dict.fromkeys keeps
# the original order while dropping duplicates.
lst_stopwords = list(dict.fromkeys(
    clean_text(word) for word in nltk.corpus.stopwords.words("english")))
lst_stopwords

In [None]:
# Add a cleaned version of every headline: stopwords removed and tokens
# lemmatized, with stemming switched off.
def _clean_headline(headline):
    return utils_preprocess_text(headline, flg_stemm=False, flg_lemm=True,
                                 lst_stopwords=lst_stopwords)


df["text_clean"] = df["text"].apply(_clean_headline)
df

In [None]:
# word_tokenize (used in the cells below) needs the Punkt tokenizer data.
# Recent NLTK releases (3.9 and later) look for the 'punkt_tab' resource
# instead of 'punkt', so fetch both; a resource unknown to the installed
# downloader is reported and skipped without raising.
nltk.download('punkt')
nltk.download('punkt_tab')


## Analysis of commonly used terms

In [None]:
y = "TRAVEL"
top = 10
corpus = df[df["y"] == y]["text_clean"]

# Tokenize each headline on its own. Joining the whole corpus into one string
# first would also count "bigrams" made of the last word of one headline and
# the first word of the next.
lst_doc_tokens = [nltk.tokenize.word_tokenize(doc) for doc in corpus]
lst_tokens = [token for doc_tokens in lst_doc_tokens for token in doc_tokens]

fig, ax = plt.subplots(nrows=1, ncols=2)
# Name the category so this figure can be told apart from the other one.
fig.suptitle("Most frequent words (%s)" % y, fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
dtf_uni = pd.DataFrame(dic_words_freq.most_common(), columns=["Word", "Freq"])
dtf_uni.set_index("Word").iloc[:top, :].sort_values(by="Freq").plot(
    kind="barh", title="Unigrams", ax=ax[0], legend=False).grid(axis='x')
ax[0].set(ylabel=None)

## bigrams (pairs of adjacent words within the same headline)
dic_words_freq = nltk.FreqDist(
    bigram for doc_tokens in lst_doc_tokens for bigram in nltk.ngrams(doc_tokens, 2))
dtf_bi = pd.DataFrame(dic_words_freq.most_common(), columns=["Word", "Freq"])
dtf_bi["Word"] = dtf_bi["Word"].apply(" ".join)
dtf_bi.set_index("Word").iloc[:top, :].sort_values(by="Freq").plot(
    kind="barh", title="Bigrams", ax=ax[1], legend=False).grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

In [None]:
y = "HOME & LIVING"
top = 10
corpus = df[df["y"] == y]["text_clean"]

# Tokenize each headline on its own. Joining the whole corpus into one string
# first would also count "bigrams" made of the last word of one headline and
# the first word of the next.
lst_doc_tokens = [nltk.tokenize.word_tokenize(doc) for doc in corpus]
lst_tokens = [token for doc_tokens in lst_doc_tokens for token in doc_tokens]

fig, ax = plt.subplots(nrows=1, ncols=2)
# Name the category so this figure can be told apart from the other one.
fig.suptitle("Most frequent words (%s)" % y, fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
dtf_uni = pd.DataFrame(dic_words_freq.most_common(), columns=["Word", "Freq"])
dtf_uni.set_index("Word").iloc[:top, :].sort_values(by="Freq").plot(
    kind="barh", title="Unigrams", ax=ax[0], legend=False).grid(axis='x')
ax[0].set(ylabel=None)

## bigrams (pairs of adjacent words within the same headline)
dic_words_freq = nltk.FreqDist(
    bigram for doc_tokens in lst_doc_tokens for bigram in nltk.ngrams(doc_tokens, 2))
dtf_bi = pd.DataFrame(dic_words_freq.most_common(), columns=["Word", "Freq"])
dtf_bi["Word"] = dtf_bi["Word"].apply(" ".join)
dtf_bi.set_index("Word").iloc[:top, :].sort_values(by="Freq").plot(
    kind="barh", title="Bigrams", ax=ax[1], legend=False).grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

## Analyse other features in the dataset and their relationship with the label 

In [None]:
# Replace empty values in the 'authors' field with 'Unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, emits a
# chained-assignment warning in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
# NOTE(review): the earlier df.dropna() call removes rows with any missing
# value, so there may be nothing left to fill here - confirm which behaviour
# is intended.
df['authors'] = df['authors'].fillna('Unknown')

# Calculate the number of articles for each author and plot a bar chart
author_counts = df['authors'].value_counts()
plt.figure(figsize=(10,5))
author_counts.head(20).plot(kind='bar')
plt.title('Top 20 authors with most articles')
plt.show()

# Convert the 'date' field to datetime format and extract the month and the
# day of the week (pandas numbers the days Monday=0 ... Sunday=6)
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek

# Plot the relationship between month and article categories
plt.figure(figsize=(10,5))
sns.countplot(data=df, x='month', hue='y')
plt.title('Article categories distribution by month')
plt.show()

# Plot the relationship between day of the week and article categories
plt.figure(figsize=(10,5))
sns.countplot(data=df, x='day_of_week', hue='y')
plt.title('Article categories distribution by day of the week')
plt.show()


## Length Analysis

In [None]:
# Length features computed from the raw headline text.
# Only non-empty pieces are counted: a plain split(".") reports two
# "sentences" for "Hello." and split(" ") counts the empty strings between
# repeated spaces as words.
# NOTE: the column name 'avg_sentence_lenght' (sic) is kept because a later
# cell reads it.
def _count_sentences(text):
    """Number of non-blank '.'-separated segments in ``text`` (at least 1)."""
    segments = [segment for segment in str(text).split(".") if segment.strip()]
    # Never return 0, so the average below cannot divide by zero.
    return max(len(segments), 1)


df['sentence_count'] = df["text"].apply(_count_sentences)
df['word_count'] = df["text"].apply(lambda x: len(str(x).split()))
df['avg_sentence_lenght'] = df['word_count'] / df['sentence_count']
df['word_count']

In [None]:
# Display the words-per-sentence ratio computed in the previous cell.
df['avg_sentence_lenght']

In [None]:
# Count the missing values per column after the feature columns were added.
df.isna().sum()


In [None]:
# Drop rows with any missing value. Rebinding the name avoids mutating in
# place a frame that was produced by boolean filtering, which is what raises
# SettingWithCopyWarning.
df = df.dropna()


In [None]:
# Preview the first rows of the frame after the missing-value drop.
df.head()

# Data Preparation & Modelling 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score
import tensorflow as tf
from tensorflow import keras
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing import text, sequence 
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import string
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import time
from sklearn.manifold import TSNE
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM,Dropout,GlobalAveragePooling1D,Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.utils import plot_model

In [None]:
# Reading training data from .CSV file
# NOTE(review): this notebook does not show how train.csv was produced.
train_df = pd.read_csv('train.csv')

# Extract features and labels.
# An empty cleaned headline is read back from CSV as NaN (a float), which the
# Keras Tokenizer cannot lower-case or split; turn such cells back into "".
X_train = train_df['text_clean'].fillna('').astype(str).values
y_train = train_df['y'].values


In [None]:

# Read Testing data From .CSV file
# NOTE(review): this notebook does not show how test.csv was produced.
test_df = pd.read_csv('test.csv')

# Extract features and labels.
# An empty cleaned headline is read back from CSV as NaN (a float), which the
# Keras Tokenizer cannot lower-case or split; turn such cells back into "".
X_test = test_df['text_clean'].fillna('').astype(str).values
y_test = test_df['y'].values

In [None]:
# Read Validation data from .CSV
# NOTE(review): this notebook does not show how valid.csv was produced.
val_df = pd.read_csv('valid.csv')

# Extract labels and features.
# An empty cleaned headline is read back from CSV as NaN (a float), which the
# Keras Tokenizer cannot lower-case or split; turn such cells back into "".
X_val = val_df['text_clean'].fillna('').astype(str).values
y_val = val_df['y'].values


In [None]:
# Settings shared by the tokenizer and the padding calls in the cells below.
vocab_size = 20_000    # passed to Tokenizer(num_words=...)
max_length = 150       # every sequence is padded/truncated to this length
padding_type = 'post'  # pad at the end of a sequence
trunc_type = 'post'    # truncate at the end of a sequence
oov_tok = "<OOV>"      # token used for out-of-vocabulary words

In [None]:
# Fit the vocabulary on the training text only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index


def _texts_to_padded(texts):
    """Turn raw texts into integer sequences padded/truncated to max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length,
                         padding=padding_type, truncating=trunc_type)


# The class order is fixed once from the training labels (np.unique returns
# them sorted, the same order pd.get_dummies produces) and reused for every
# split. Encoding each split independently would misalign the columns whenever
# a split is missing one of the classes.
class_names = np.unique(np.asarray(y_train))


def _one_hot(labels):
    """One-hot encode ``labels`` against class_names; reject unseen labels."""
    labels = np.asarray(labels)
    unknown = set(labels.tolist()) - set(class_names.tolist())
    if unknown:
        raise ValueError("Labels not present in the training set: %s" % sorted(unknown))
    return (labels[:, None] == class_names[None, :]).astype("float32")


# X_train / X_val / y_train / y_val are left untouched (raw text and raw
# labels) so that this cell gives the same result when it is run again.
train_set = _texts_to_padded(X_train)
val_set = _texts_to_padded(X_val)
train_label = _one_hot(y_train)
val_label = _one_hot(y_val)

# Test labels as class indices, in the same class order as above. They are
# read from test_df so that re-running the cell never re-encodes indices that
# were already encoded.
y_test = np.argmax(_one_hot(test_df['y'].values), axis=1)

# Print the shapes of the training and validation data and labels
print('Train set shape:', train_set.shape)
print('Train label shape:', train_label.shape)
print('Validation set shape:', val_set.shape)
print('Validation label shape:', val_label.shape)


## Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# # Create a Naive Bayes classifier
# nb_classifier = MultinomialNB()

# Convert the two-dimensional one-hot label arrays to one-dimensional arrays
# of class indices.
train_label_1 = train_label.argmax(axis=1)
val_label_1 = val_label.argmax(axis=1)

# # Train the model using the training set
# nb_classifier.fit(train_set, train_label_1)

# # Make predictions on the training set
# train_predictions = nb_classifier.predict(train_set)
# train_accuracy = accuracy_score(train_label_1, train_predictions)
# print("Training Set Accuracy:", train_accuracy)

# # Make predictions on the validation set
# val_predictions = nb_classifier.predict(val_set)
# val_accuracy = accuracy_score(val_label_1, val_predictions)
# print("Validation Set Accuracy:", val_accuracy)


In [None]:
# # Print all the parameters of the trained model
# print("Model Parameters:")
# print(nb_classifier.get_params())

In [None]:
import joblib

# # Save the trained model
# joblib.dump(nb_classifier, 'naive_bayes_model.pkl')



## DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# # Create a Decision Tree classifier
# dt_classifier = DecisionTreeClassifier()

# # Train the model using the training set
# dt_classifier.fit(train_set, train_label_1)

# # Make predictions on the training set
# train_predictions = dt_classifier.predict(train_set)
# train_accuracy = accuracy_score(train_label_1, train_predictions)
# print("Training Set Accuracy:", train_accuracy)

# # Make predictions on the validation set
# val_predictions = dt_classifier.predict(val_set)
# val_accuracy = accuracy_score(val_label_1, val_predictions)
# print("Validation Set Accuracy:", val_accuracy)


In [None]:
# # Print all the parameters of the trained model
# print("Model Parameters:")
# print(dt_classifier.get_params())

In [None]:
# joblib.dump(dt_classifier, 'dt_model.pkl')


In [None]:
# Relative path of the GloVe vectors file that the next cell opens.
# The 100d in the name should match embedding_dim = 100 used there.
path_to_glove_file =  'glove.6B.100d.txt/glove.6B.100d.txt'

In [None]:
embedding_dim = 100
# Two rows more than the number of words in the tokenizer's index.
num_tokens = len(tokenizer.word_index) + 2

# Parse the GloVe text file: every line is "<word> <coef> <coef> ...".
embeddings_index = {}
with open(path_to_glove_file, encoding='utf-8') as glove_file:
    for glove_line in glove_file:
        glove_word, coef_text = glove_line.split(maxsplit=1)
        embeddings_index[glove_word] = np.fromstring(coef_text, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

# Row i of the matrix holds the GloVe vector of the word whose tokenizer
# index is i; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))


##  end-to-end classifier using deep learning

In [None]:
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

# tf.keras.backend.clear_session()
# embed_size = 100

# model = keras.models.Sequential([
#     # Embedding layer with pre-trained word embeddings
#     Embedding(num_tokens,
#               embedding_dim,
#               embeddings_initializer=keras.initializers.Constant(embedding_matrix),
#               mask_zero=True,
#               input_shape=[None],
#               trainable=False),
#     # Bidirectional LSTM layer
#     keras.layers.Bidirectional(keras.layers.LSTM(256, dropout=0.4)),
#     # Output Dense layer with softmax activation for binary classification
#     keras.layers.Dense(2, activation="softmax")
# ])

# # Print the model summary
# model.summary()


In [None]:
# opt = keras.optimizers.Adam(learning_rate=0.001)
# model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
# history = model.fit( train_set,train_label,
#                      batch_size = 32,
#                      steps_per_epoch=len(X_train) // 32, 
#                      validation_data = (val_set , val_label),
#                      validation_steps = len(val_set)//32, epochs=20,
#                      callbacks=  early_stop )

In [None]:
# model.save("deeplearning_model.h5")


##  Evaluation

In [None]:
# Load the three previously saved models from the working directory.
from tensorflow.keras.models import load_model
import pickle

# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load model files that you created yourself or otherwise trust.
naive_bayes_path = 'naive_bayes_model.pkl'
decision_tree_path = 'dt_model.pkl'
deep_learning_path = "deeplearning_model.h5"

dl_model = load_model(deep_learning_path)
naive_bayes_model = joblib.load(naive_bayes_path)
decision_tree_model = joblib.load(decision_tree_path)

In [None]:
# Print the layer-by-layer summary of the loaded deep learning model.
dl_model.summary()

### I chose accuracy_score as the metric, for several reasons. Intuitiveness: accuracy is an intuitive, easy-to-understand metric that gives the percentage of samples the model predicts correctly, so it is easy to interpret and to compare with other metrics. Composite: accuracy combines the model's predictive power across all categories, taking into account its classification accuracy over the entire data set; this makes it a commonly used performance metric, especially when the categories are of similar importance. Generality: accuracy applies to many types of classification problems, whether binary or multi-class, so it can be used across a variety of domains and application scenarios.

### The larger the accuracy_score, the better

### Evaluate performance on validation and training sets

In [None]:
# Accuracy of the saved Naive Bayes model on the training and validation sets.
# NOTE(review): train_set / val_set hold padded token-id sequences, so the
# classifier treats the token id at each position as a feature value - confirm
# that this representation is intended; a bag-of-words matrix is the usual
# input for MultinomialNB.
train_predictions = naive_bayes_model.predict(train_set)
val_predictions = naive_bayes_model.predict(val_set)

train_accuracy = accuracy_score(train_label_1, train_predictions)
val_accuracy = accuracy_score(val_label_1, val_predictions)

print("Training Set Accuracy:", train_accuracy)
print("Validation Set Accuracy:", val_accuracy)


In [None]:
# Accuracy of the saved decision tree model on the training and validation sets.
train_predictions_dt = decision_tree_model.predict(train_set)
val_predictions_dt = decision_tree_model.predict(val_set)

train_accuracy_dt = accuracy_score(train_label_1, train_predictions_dt)
val_accuracy_dt = accuracy_score(val_label_1, val_predictions_dt)

print("Training Set Accuracy:", train_accuracy_dt)
print("Validation Set Accuracy:", val_accuracy_dt)


In [None]:
# Accuracy of the loaded deep learning model on the training and validation
# sets. predict() returns one score per class for every sample; argmax over
# axis 1 turns those scores (and the one-hot labels) into class indices.
train_scores_dl = dl_model.predict(train_set)
train_predictions_dl = train_scores_dl.argmax(axis=1)
train_accuracy_dl = accuracy_score(train_label.argmax(axis=1), train_predictions_dl)
print("Training Set Accuracy:", train_accuracy_dl)

val_scores_dl = dl_model.predict(val_set)
val_predictions_dl = val_scores_dl.argmax(axis=1)
val_accuracy_dl = accuracy_score(val_label.argmax(axis=1), val_predictions_dl)
print("Validation Set Accuracy:", val_accuracy_dl)


### Accuracy on the training set is generally higher than on the validation set. Model performance: deep learning model > decision tree > naive Bayes

### 11 Finding the optimal parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

# Hyperparameter grid for Multinomial Naive Bayes.
param_grid = {
    'alpha': [0.05, 0.1, 0.5, 1.0],
    'fit_prior': [True, False],
    'class_prior': [None, [0.2, 0.8], [0.5, 0.5]],
}

# Exhaustive 5-fold search over the grid, fitted on the training set.
nb_classifier1 = MultinomialNB()
grid_search = GridSearchCV(nb_classifier1, param_grid, cv=5)
grid_search.fit(train_set, train_label_1)

# Keep the winning settings and the corresponding estimator, then save the
# estimator to disk.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_
joblib.dump(best_model, 'optimized_naive_bayes_model.pkl')

# Accuracy of the selected model on the training and validation sets.
train_predictions_opt_bayes = best_model.predict(train_set)
val_predictions_opt_bayes = best_model.predict(val_set)
train_accuracy_opt_bayes = accuracy_score(train_label_1, train_predictions_opt_bayes)
val_accuracy_opt_bayes = accuracy_score(val_label_1, val_predictions_opt_bayes)
print("Training Set Accuracy (optimized):", train_accuracy_opt_bayes)
print("Validation Set Accuracy (optimized):", val_accuracy_opt_bayes)
best_params


In [None]:
# Create a Decision Tree classifier. A fixed random_state makes the fitted
# trees, and therefore the grid-search ranking and the reported accuracies,
# reproducible from run to run.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define the parameter grid to search over
param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': [None, 5, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

# Exhaustive 5-fold search over the grid, fitted on the training set
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5)
grid_search.fit(train_set, train_label_1)

# Get the best parameter values and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Save the optimized Decision Tree model
joblib.dump(best_model, 'optimized_decision_tree_model.pkl')

# Make predictions on the training set using the best model
train_predictions_opt_dt = best_model.predict(train_set)
train_accuracy_opt_dt = accuracy_score(train_label_1, train_predictions_opt_dt)
print("Training Set Accuracy (optimized):", train_accuracy_opt_dt)

# Make predictions on the validation set using the best model
val_predictions_opt_dt = best_model.predict(val_set)
val_accuracy_opt_dt = accuracy_score(val_label_1, val_predictions_opt_dt)
print("Validation Set Accuracy (optimized):", val_accuracy_opt_dt)


### Add an additional LSTM layer to the model

In [None]:
# # Define early stopping callback
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001)

# # Clear previous TensorFlow session
# tf.keras.backend.clear_session()

# # Define embedding size
# embed_size = 100

# # Create the optimized model
# model_opt = keras.models.Sequential([
#     # Embedding layer with pre-trained word embeddings
#     Embedding(num_tokens,
#               embedding_dim,
#               embeddings_initializer=keras.initializers.Constant(embedding_matrix),
#               mask_zero=True,
#               input_shape=[None],
#               trainable=False),
# # Bidirectional LSTM layers
#     keras.layers.Bidirectional(keras.layers.LSTM(256, dropout=0.4, return_sequences=True)),
#     keras.layers.Bidirectional(keras.layers.LSTM(128, dropout=0.3)),
#     keras.layers.Dense(2, activation="softmax")
# ])

# # Compile the model with the optimizer and loss function
# opt = keras.optimizers.Adam(learning_rate=0.001)
# model_opt.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# # Print the summary of the optimized model
# model_opt.summary()


In [None]:
# history = model_opt.fit( train_set,train_label,
#                      batch_size = 32,
#                      steps_per_epoch=len(X_train) // 32, 
#                      validation_data = (val_set , val_label),
#                      validation_steps = len(val_set)//32, epochs=20,
#                      callbacks=  early_stop )

In [None]:
# model_opt.save("deeplearning_model_opt.h5")
# Load the saved model from disk; the cells that build and train it are
# commented out above.
model_opt=load_model("deeplearning_model_opt.h5")

In [None]:
# Training-set accuracy of the loaded two-LSTM model: argmax over axis 1
# turns both the predicted class scores and the one-hot labels into class
# indices.
train_scores_dl_opt = model_opt.predict(train_set)
train_predictions_dl_opt = train_scores_dl_opt.argmax(axis=1)
train_accuracy_dl_opt = accuracy_score(train_label.argmax(axis=1), train_predictions_dl_opt)
print("Training Set Accuracy:", train_accuracy_dl_opt)

In [None]:
# Validation-set accuracy of the loaded two-LSTM model: argmax over axis 1
# turns both the predicted class scores and the one-hot labels into class
# indices.
val_scores_dl_opt = model_opt.predict(val_set)
val_predictions_dl_opt = val_scores_dl_opt.argmax(axis=1)
val_accuracy_dl_opt = accuracy_score(val_label.argmax(axis=1), val_predictions_dl_opt)
print("Validation Set Accuracy:", val_accuracy_dl_opt)

### The best of the three models is the deep learning model: its accuracy is above 90%, so it accomplishes the task very well.

### Cross-validation

In [None]:
# Load the optimized Naive Bayes model. cross_val_score clones and refits the
# estimator on every fold, so the loaded model only supplies its
# hyperparameters here.
optimized_naive_bayes_model = joblib.load('optimized_naive_bayes_model.pkl')

# Stack the training and validation sets (features and class indices).
combined_train_set = np.concatenate([train_set, val_set], axis=0)
combined_train_label = np.concatenate([train_label_1, val_label_1], axis=0)

# 5-fold cross-validated accuracy on the combined data.
scores = cross_val_score(
    optimized_naive_bayes_model,
    combined_train_set,
    combined_train_label,
    cv=5,
    scoring='accuracy',
)

print("Cross-Validation Scores:", scores)
print("Average Accuracy:", scores.mean())

In [None]:
# Load the optimized Decision Tree model. cross_val_score clones and refits
# the estimator on every fold, so the loaded model only supplies its
# hyperparameters here.
optimized_decision_tree_model = joblib.load('optimized_decision_tree_model.pkl')

# 5-fold cross-validated accuracy on the combined training + validation data
# built in the previous cell.
scores_dt = cross_val_score(
    optimized_decision_tree_model,
    combined_train_set,
    combined_train_label,
    cv=5,
    scoring='accuracy',
)

print("Cross-Validation Scores:", scores_dt)
print("Average Accuracy:", scores_dt.mean())

In [None]:
# Load the saved model under the name used by the fold loop below.
# NOTE(review): this is the same file that was already loaded as model_opt.
dl_model_opt=load_model("deeplearning_model_opt.h5")

In [None]:
# Stack training and validation rows: padded sequences and one-hot labels.
merged_data = np.concatenate([train_set, val_set], axis=0)
merged_labels = np.concatenate([train_label, val_label], axis=0)


In [None]:
from sklearn.model_selection import KFold

# NOTE(review): dl_model_opt is never re-fitted inside the loop, so this is
# not a true cross-validation. Each fold only scores the already-trained
# model on a random subset of merged_data, which presumably contains the data
# the model was trained and validated on. A real estimate would rebuild and
# train the model on the other folds before scoring each one.

# Define the number of folds; a fixed random_state makes the shuffled split,
# and therefore the per-fold scores, reproducible.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accuracy of the pre-trained model on each held-out fold
accuracy_scores = []
for _, test_index in kf.split(merged_data):
    # Only the held-out part is needed because no training happens here
    X_test_fold = merged_data[test_index]
    y_test_fold = merged_labels[test_index]

    # Predicted class index = position of the highest class score
    y_pred_fold = np.argmax(dl_model_opt.predict(X_test_fold), axis=1)

    accuracy_fold = accuracy_score(np.argmax(y_test_fold, axis=1), y_pred_fold)
    accuracy_scores.append(accuracy_fold)

# Print the evaluation scores for each fold
for fold_number, score in enumerate(accuracy_scores, start=1):
    print("Fold", fold_number, "Accuracy:", score)

# Calculate the average accuracy score
average_accuracy = np.mean(accuracy_scores)
print("Average Accuracy:", average_accuracy)



### Select the best model

In [None]:
# Load the saved model chosen as the best one.
# NOTE(review): this rebinds best_model, which earlier cells used for the
# grid-search estimators, and reloads the file already loaded as model_opt.
best_model=load_model("deeplearning_model_opt.h5")

In [None]:
# Tokenize and pad the raw test text. It is read from test_df and X_test is
# not overwritten, so re-running this cell never feeds already-tokenized
# sequences back into the tokenizer.
test_texts = test_df['text_clean'].fillna('').astype(str).values
test_set = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_length,
                         padding=padding_type, truncating=trunc_type)

# y_test already holds class indices (computed in the tokenization cell).
# One-hot encode them with the same number of columns as the training labels,
# so the encoding stays correct even if a class is absent from the test set.
test_class_ids = np.asarray(y_test)
test_label = np.eye(train_label.shape[1])[test_class_ids]

# Make predictions on the test set using the deep learning model
test_predictions = best_model.predict(test_set)
test_predictions = np.argmax(test_predictions, axis=1)
test_accuracy = accuracy_score(np.argmax(test_label, axis=1), test_predictions)
print("Test Set Accuracy:", test_accuracy)

### The test-set accuracy does not differ significantly from the validation-set accuracy

### Retraining the best model

In [None]:
# # Merge the training and validation sets
# merged_set = np.concatenate((train_set, val_set), axis=0)
# merged_label = np.concatenate((train_label, val_label), axis=0)

# # Compile the model with the optimizer and loss function
# opt = keras.optimizers.Adam(learning_rate=0.001)
# best_model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# # Print the summary of the best model
# best_model.summary()

# # Train the model on the merged set
# history = best_model.fit(merged_set, merged_label,
#                          batch_size=32,
#                          steps_per_epoch=len(merged_set) // 32,
#                          validation_data=(val_set, val_label),
#                          validation_steps=len(val_set) // 32,
#                          epochs=20,
#                          callbacks=early_stop)


In [None]:
# best_model.save("best_model.h5")

In [None]:
# Load the model saved as best_model.h5; the retraining cell that writes this
# file is commented out above.
best_model_all=load_model("best_model.h5")

In [None]:

# Test-set accuracy of the retrained model: argmax over axis 1 turns both the
# predicted class scores and the one-hot test labels into class indices.
test_scores = best_model_all.predict(test_set)
test_predictions = test_scores.argmax(axis=1)
test_accuracy = accuracy_score(test_label.argmax(axis=1), test_predictions)
print("Test Set Accuracy:", test_accuracy)