# Importing Libraries

In [None]:
# Data Visualization
import matplotlib.pyplot as plt  # Matplotlib is a data visualization library used for creating static, animated, and interactive visualizations in Python.
import seaborn as sns  # Seaborn is a Python data visualization library based on Matplotlib that provides a high-level interface for drawing attractive and informative statistical graphics.

# Text Processing
from string import punctuation  # A string of punctuation characters used for tokenizing and preprocessing text data.
from nltk.tokenize import word_tokenize  # A tokenizer that splits text into words and punctuation marks, removing whitespace and other formatting characters.
from nltk.corpus import stopwords  # A collection of common words that are often removed from text data before analysis, such as "the," "and," and "a."
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer  # Stemming algorithms used to reduce words to their base or root form.
from nltk.stem.wordnet import WordNetLemmatizer  # A lemmatization algorithm used to reduce words to their base or root form, similar to stemming.

# Data Processing
import re  # A module used for regular expression operations in Python.
import warnings  # A module used for handling warnings in Python.
import numpy as np  # NumPy is a library used for working with arrays and numerical operations in Python.
import pandas as pd  # Pandas is a library used for data manipulation and analysis, including reading and writing CSV files.

# Machine Learning
import pickle  # A module used for object serialization and deserialization in Python.
from sklearn.naive_bayes import MultinomialNB  # A Naive Bayes classifier used for text classification tasks.
from sklearn.svm import LinearSVC  # A Support Vector Machine classifier used for text classification tasks.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc, PrecisionRecallDisplay, roc_curve  # A collection of metrics used for evaluating machine learning models.
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # A Linear Discriminant Analysis classifier used for text classification tasks.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  # A vectorizer used to transform text data into numerical feature vectors.
from sklearn.linear_model import LogisticRegression  # A Logistic Regression classifier used for text classification tasks.
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold, GroupShuffleSplit, LeaveOneOut, learning_curve, cross_val_score, LearningCurveDisplay # A module used for splitting data into training and testing sets.
from wordcloud import WordCloud  # A data visualization technique used to display text data in a visual format, where the size of each word represents its frequency.
import scipy.stats as stats
from tqdm.auto import tqdm
from fitter import Fitter
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

warnings.filterwarnings('ignore')

# Reading csv file and analysis

In [None]:
# Load the raw dataset; the file is comma-separated and stored as ISO-8859-1
df = pd.read_csv(
    './definitive_dataset.csv',
    delimiter=',',
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first rows of the freshly loaded dataframe
df.head()

In [None]:
# Summary statistics of the dataframe as computed by DataFrame.describe()
df.describe()

In [None]:
# Variance of the numeric columns only: the dataframe also holds the free-text
# column, which recent pandas versions refuse to aggregate silently
df.var(numeric_only=True)

In [None]:
# Skewness (asymmetry of the distribution) of the numeric columns only; the
# free-text column is excluded explicitly so the call works on recent pandas
df.skew(numeric_only=True)

In [None]:
# Print the dtype and the non-null count of each column
df.info()

In [None]:
# Count the missing values per column
df.isnull().sum()

In [None]:
# Dataset shape as (rows, columns)
df.shape

## Dropping unnecessary columns

In [None]:
# Bar chart with the frequency of every distinct `media` value; a single bar
# confirms that the column is constant
media_counts = df['media'].value_counts()
media_counts.plot(kind='bar')
plt.show()

In [None]:
# Remove the unused column. errors='ignore' keeps the cell idempotent: running
# it a second time no longer raises KeyError because 'media' is already gone.
df = df.drop(columns=['media'], errors='ignore')
df.head()

## Analysis with graphs

In [None]:
# Helper column holding the number of characters of every comment
# (values are passed through str() first, so non-string entries get a length too)
df['length'] = df['text'].map(lambda value: len(str(value)))
df.head()

In [None]:
# Number of comments per length. value_counts() orders by frequency, so the
# result is sorted by length first; otherwise the line jumps back and forth
# along the x axis.
df['length'].value_counts().sort_index().plot()
plt.xlabel('Comment length (characters)')
plt.ylabel('Number of comments')
plt.show()

In [None]:
# Histogram of the comment lengths using 200 bins
df['length'].plot.hist(bins=200)
plt.show()

In [None]:
# Show the distribution of the comment length for each sentiment class
ax = df.hist(column='length', by='sentiment', bins=50, figsize=(8, 8))
plt.suptitle('Length via each Sentiment')
# Render explicitly, like the other plotting cells, instead of leaking the Text repr
plt.show()

In [None]:
# Mean comment length per class (sentiment 0 -> negative, sentiment 1 -> positive)
is_negative = df['sentiment'] == 0
is_positive = df['sentiment'] == 1
negative_mean = df.loc[is_negative, 'length'].mean()
positive_mean = df.loc[is_positive, 'length'].mean()

# Mean length over all the comments, regardless of the class
length_mean = df['length'].mean()

In [None]:
# Tags for the bars, in the same order as `values`
labels = ['Sentiment 0 length', 'Sentiment 1 length']

# Sentiment 0 is the negative class and sentiment 1 the positive one, so the
# negative mean has to come first to match the labels above
values = [negative_mean, positive_mean]

plt.bar(labels, values)
plt.axhline(length_mean, color='red', linestyle='--', label='Overall mean')

plt.xlabel('Sentiment')
plt.ylabel('Mean')
plt.title('Comments length mean by sentiment')
# Without a legend the 'Overall mean' label of the dashed line is never shown
plt.legend()

plt.show()

The mean length of the negative comments is well above the mean length of the positive ones, so negative comments tend to be longer.

In [None]:
# Show how many examples belong to each class
counts = df['sentiment'].value_counts()
print(counts)
plt.bar(counts.index, counts.values)
# Derive the tick labels from the index itself: value_counts() orders by
# frequency, so hard-coded labels would be wrong whenever class 0 is the majority
plt.xticks(counts.index, [str(label) for label in counts.index])
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.title('Sentiment distribution')
plt.show()

After looking at this graph I could expect a class-imbalance problem when training models, but since the split is 68% (class 1) to 32% (class 0) I won't treat it as a severe imbalance.

In [None]:
# Keep a reference to the text column of the dataframe (a pandas Series)
texts = df['text']

# Preprocess the data

In [None]:
# English stopwords followed by every punctuation character: tokens to remove or ignore
stuff_to_be_removed = [*stopwords.words('english'), *punctuation]

# Lemmatizer used in the examples below
lem = WordNetLemmatizer()

# Corpus: plain list with the text of every row of the dataframe
corpus = df['text'].tolist()

In [None]:
# Display the full list of stopwords and punctuation signs built above
stuff_to_be_removed

### WordNetLemmatizer usage example

In [None]:
# Sample words to show what the lemmatizer returns for each of them
list1 = [
    'kites', 'babies', 'dogs', 'flying', 'smiling',
    'driving', 'died', 'tried', 'feet', 'meeting',
]
for word in list1:
    print(f"{word} ---> {lem.lemmatize(word)}")

In [None]:
# Sentence lemmatization example
string = 'the cat is sitting with the bats on the striped mat under many flying geese'

# Convert the string into tokens. `word_tokenize` is already imported in the
# imports cell, so no extra `import nltk` is needed in the middle of the notebook.
list2 = word_tokenize(string)
print("Tokenized sentence: ",list2, "\n")

# Lemmatize token by token and rebuild the sentence
lemmatized_string = ' '.join(lem.lemmatize(token) for token in list2)

print("Lemmatized string: ",lemmatized_string)

## Preprocessing function

In [None]:
# Clean every comment and keep `df` aligned with the cleaned corpus:
#   * every run of digits / non-word characters becomes a single space
#   * a few accented characters are mapped to a plain ASCII letter
#   * rows whose text is not a string (e.g. NaN), or whose whole cleaned text is
#     exactly one of the stopword / punctuation tokens, are removed from `df`
# Stopwords inside a comment are NOT removed and no lemmatization is applied here.
removable_tokens = set(stuff_to_be_removed)  # built once instead of once per row
final_corpus = []
rows_to_drop = []
for i in df.index:
    try:
        text = re.sub("(\\d|\\W)+", " ", df['text'][i])
    except TypeError:
        # re.sub only accepts strings: missing / non-text rows are discarded
        rows_to_drop.append(i)
        continue
    text = re.sub(r'[ÂÃ]', 'A', text)
    text = re.sub(r"[şŝšś]", "s", text)
    text = re.sub(r"[ĤĦĥħ]", "H", text)
    text = re.sub(r"[ĆĈĊČćĉċč]", "c", text)
    if text in removable_tokens:
        rows_to_drop.append(i)
    else:
        final_corpus.append(text)

# Drop all the discarded rows in one go so that len(df) == len(final_corpus)
df.drop(index=rows_to_drop, inplace=True)

In [None]:
# Show only a sample of the cleaned corpus; dumping every comment floods the output
final_corpus[:10]

In [None]:
# Dataframe used for modelling: the cleaned text next to its sentiment label.
# `df` was filtered together with the corpus, so both have the same length.
data_cleaned = pd.DataFrame({
    "text": final_corpus,
    "sentiment": df["sentiment"].values,
})

In [None]:
# Dataframe used for the exploratory analysis: same content as the modelling
# dataframe (cleaned text plus sentiment label), kept as a separate object
data_eda = pd.DataFrame({
    'text': final_corpus,
    'sentiment': df['sentiment'].values,
})

In [None]:
# Split the comments by label (1 -> positive, 0 -> negative) and keep the
# texts of each group as plain Python lists
sentiment_column = data_eda['sentiment']

positive = data_eda[sentiment_column == 1]
negative = data_eda[sentiment_column == 0]

positive_list = positive['text'].tolist()
negative_list = negative['text'].tolist()

In [None]:
# One long string per class for the word clouds. The comments are joined with a
# space: concatenating them with "" glued the last word of a comment to the
# first word of the next one, creating words that do not exist.
positive_all = " ".join(positive_list)
negative_all = " ".join(negative_list)

## Word cloud positive data

In [None]:
# Generate and display a word cloud with the words of the sentiment-1
# (non-offensive) comments. The stray `WordCloud()` call that built an unused
# object has been removed.
wordcloud = WordCloud(width=1000,
                      height=500,
                      background_color='skyblue',
                      max_words=90).generate(positive_all)

plt.figure(figsize=(30, 20))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no information for a word cloud
plt.title("Non-offensive")
plt.show()

## Word cloud negative data

In [None]:
# Generate and display a word cloud with the words of the sentiment-0
# (offensive) comments. The stray `WordCloud()` call that built an unused
# object has been removed.
wordcloud = WordCloud(width=1000,
                      height=500,
                      background_color='skyblue',
                      max_words=90).generate(negative_all)

plt.figure(figsize=(30, 20))
plt.imshow(wordcloud)
plt.axis('off')  # pixel coordinates carry no information for a word cloud
plt.title("Offensive")
plt.show()

# Feature extraction

## TFIDF or CountVectorizer for sentiment analysis

In [None]:
# Unigram TF-IDF vectorizer
tfidf = TfidfVectorizer(use_idf=True)

# Unigram + bigram TF-IDF vectorizer: (1, 2) is used because (2, 2) would keep
# only the bigrams and discard the individual words
bigram_tfidf = TfidfVectorizer(use_idf=True, ngram_range=(1, 2))

# Sanity check of the n-gram generation: the cell displays True when the
# analyzer yields exactly the expected unigrams followed by the bigrams
analyze = bigram_tfidf.build_analyzer()
expected_ngrams = ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']
analyze('Bi-grams are cool!') == expected_ngrams

In [None]:
# Target labels and the text column that gets vectorized
y = data_cleaned['sentiment']
cleaned_text = data_cleaned["text"]

# Fit each vectorizer on the corpus and obtain the document-term matrices
xt = tfidf.fit_transform(cleaned_text)
xt_bi = bigram_tfidf.fit_transform(cleaned_text)

### Check what the vectorizers return

In [None]:
# Print the vocabulary learned by the unigram vectorizer: a dict that maps each
# term to the column index that represents it in the matrix
print(tfidf.vocabulary_)

In [None]:
# For example, look up the column index assigned to the word 'goat' (803 when
# this notebook was written).
# NOTE(review): the index depends on the fitted corpus, and this lookup raises
# KeyError if 'goat' is not part of the vocabulary.
print(tfidf.vocabulary_['goat'])

In [None]:
# Get the feature names (terms) of the unigram vectorizer and how many there are
feature_names = tfidf.get_feature_names_out()
print(feature_names)
print("Amount of features: ",len(feature_names))

In [None]:
# Map a column index back to its term. The index is looked up in the fitted
# vocabulary instead of being hard-coded (803), since it changes whenever the
# corpus or the preprocessing changes.
goat_index = tfidf.vocabulary_['goat']
feature_name = tfidf.get_feature_names_out()[goat_index]
print(feature_name)

In [None]:
# Print the IDF weight associated with each feature of the unigram vectorizer
print(tfidf.idf_)

In [None]:
# Get the feature names of the unigram + bigram vectorizer and how many there are
feature_names = bigram_tfidf.get_feature_names_out()
print(feature_names)
print("Amount of features: ",len(bigram_tfidf.idf_)) # The amount of features is almost triple the amount obtained with unigrams only

In [None]:
# Compare this vocabulary with the unigram one: here the keys include pairs of
# words (bigrams) in addition to the single words seen before
print(bigram_tfidf.vocabulary_)

## See how the amount of features affects vectorizers

In [None]:
# Vocabulary sizes to compare
num_features = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
# Mean cross-validated accuracy per model, one entry per vocabulary size
results = {'Logistic Regression': [], 'Decision Tree': [], 'Linear SVC': [], 'Multinomial NB': [], 'Random Forest': []}

# The splitter does not depend on the vocabulary size; with a fixed seed every
# split() call yields the same folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Positional indexing on a plain array works whatever the index of `y` is
y_values = np.asarray(y)

for n in num_features:

    bigram_tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_df=0.5, max_features=n)
    xt_bi = bigram_tfidf.fit_transform(data_cleaned["text"])

    # Fresh, untrained models for every vocabulary size
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Linear SVC', LinearSVC()),
        ('Multinomial NB', MultinomialNB()),
        ('Random Forest', RandomForestClassifier())
    ]

    for model_name, model in models:
        accuracies = []

        for train_index, test_index in kfold.split(xt_bi, y_values):
            X_train, X_test = xt_bi[train_index], xt_bi[test_index]
            y_train, y_test = y_values[train_index], y_values[test_index]

            model.fit(X_train, y_train)
            # Score on the held-out fold: scoring on the fold the model was fit
            # on measures memorisation, and cannot rank vocabulary sizes
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        average_accuracy = np.mean(accuracies)
        results[model_name].append(average_accuracy)

# Plot the mean accuracy of every model against the number of features
plt.figure(figsize=(10, 6))
plt.xticks(num_features)
for model_name, accuracies in results.items():
    plt.plot(num_features, accuracies, label=model_name)

plt.xlabel('Número de Características')
plt.ylabel('Precisión')
plt.title('Precisión en función del número de características')
plt.legend()
plt.show()

In [None]:
# Display names of the models, matching the keys of the results dictionaries
models_names = ['Logistic Regression', 'Decision Tree', 'Linear SVC', 'Multinomial NB', 'Random Forest']

In [None]:
# Maps the position (as a string) inside a results list to the number of
# features evaluated at that position
featuress = {'0': 1000, '1': 2000, '2': 3000, '3': 4000, '4': 5000, '5': 6000, '6': 7000, '7': 8000, '8': 9000, '9': 10000}

In [None]:
# For every model keep the vocabulary size with the highest accuracy.
# The rows are collected in a list and the dataframe is built once:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
best_rows = []

for model in models_names:
    model_scores = results[model]
    accuracy = max(model_scores)
    feature = model_scores.index(accuracy)  # position of the first best score
    feature_name = featuress[str(feature)]
    best_rows.append({'Model': model, 'Number of Features': feature_name, 'Accuracy': accuracy})

results_df = pd.DataFrame(best_rows, columns=['Model', 'Number of Features', 'Accuracy'])

print("TF-IDF vectorizer results")
results_df

In [None]:
# Vocabulary sizes to compare
num_features2 = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

# Mean cross-validated accuracy per model, one entry per vocabulary size
results2 = {'Logistic Regression': [], 'Decision Tree': [], 'Linear SVC': [], 'Multinomial NB': [], 'Random Forest': []}

# The splitter does not depend on the vocabulary size; with a fixed seed every
# split() call yields the same folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Positional indexing on a plain array works whatever the index of `y` is
y_values = np.asarray(y)

for n in num_features2:

    vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.5, max_features=n)
    xt_bi_countvec = vectorizer.fit_transform(data_cleaned["text"])

    # Fresh, untrained models for every vocabulary size
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Linear SVC', LinearSVC()),
        ('Multinomial NB', MultinomialNB()),
        ('Random Forest', RandomForestClassifier())
    ]

    for model_name, model in models:
        accuracies = []  # Accuracy obtained on each fold

        for train_index, test_index in kfold.split(xt_bi_countvec, y_values):
            X_train, X_test = xt_bi_countvec[train_index], xt_bi_countvec[test_index]
            y_train, y_test = y_values[train_index], y_values[test_index]

            model.fit(X_train, y_train)
            # Score on the held-out fold: scoring on the fold the model was fit
            # on measures memorisation, and cannot rank vocabulary sizes
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        average_accuracy = np.mean(accuracies)
        results2[model_name].append(average_accuracy)

# Plot the mean accuracy of every model against the number of features
# (using this cell's own list, not the one from the TF-IDF cell)
plt.figure(figsize=(10, 6))
plt.xticks(num_features2)
for model_name, accuracies in results2.items():
    plt.plot(num_features2, accuracies, label=model_name)

plt.xlabel('Número de Características')
plt.ylabel('Precisión')
plt.title('Precisión en función del número de características')
plt.legend()
plt.show()

In [None]:
# For every model keep the vocabulary size with the highest accuracy.
# The rows are collected in a list and the dataframe is built once:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
best_rows2 = []

for model in models_names:
    model_scores = results2[model]
    accuracy = max(model_scores)
    feature = model_scores.index(accuracy)  # position of the first best score
    feature_name = featuress[str(feature)]
    best_rows2.append({'Model': model, 'Number of Features': feature_name, 'Accuracy': accuracy})

results_df2 = pd.DataFrame(best_rows2, columns=['Model', 'Number of Features', 'Accuracy'])

print("CountVectorizer results")
results_df2

### Check if TFIDF vectorizer is better than CountVectorizer

In [None]:
# TF-IDF
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = LinearSVC()
model4 = MultinomialNB()
model5 = RandomForestClassifier()

bigram_tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_df=0.5)
xt_bi = bigram_tfidf.fit_transform(data_cleaned["text"])

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Positional indexing on a plain array works whatever the index of `y` is
y_values = np.asarray(y)

# Out-of-fold predictions and their true labels, accumulated over all the folds
all_predictions = []
all_true_labels = []

# Iterate over the folds generated by StratifiedKFold
for train_index, test_index in kfold.split(xt_bi, y_values):
    X_train, X_test = xt_bi[train_index], xt_bi[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]

    model1.fit(X_train, y_train)
    # The report below is labelled "testing accuracy", so the model has to be
    # evaluated on the held-out fold and not on the data it was fit on
    y_pred = model1.predict(X_test)

    all_predictions.extend(y_pred)
    all_true_labels.extend(y_test)

# Get the accuracy
accuracy = accuracy_score(all_true_labels, all_predictions)
print("testing accuracy = ", accuracy*100)
print(classification_report(all_true_labels, all_predictions))

# Get the confusion matrix
confusion = confusion_matrix(all_true_labels, all_predictions)

# Get the total amount of examples
total_examples = np.sum(confusion)

# Express every cell of the matrix as a fraction of the total amount of examples
confusion_percentage = confusion / total_examples

# Plot the confusion matrix with the success and failure fractions
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix (Percentage)")
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# COUNT VECTORIZER
# Candidate models; only model3 (LinearSVC) is trained in this cell
model1, model2, model3, model4, model5 = (
    LogisticRegression(),
    DecisionTreeClassifier(),
    LinearSVC(),
    MultinomialNB(),
    RandomForestClassifier(),
)

# Unigram + bigram counts limited to the 4000 most frequent terms
vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.5, max_features=4000)
xt_bi_countvec = vectorizer.fit_transform(data_cleaned["text"])

# 5-fold stratified split with a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Predictions and true labels accumulated over the folds.
# NOTE(review): the model is evaluated on the same fold it was fit on, so these
# are training scores (as the printed label says); X_test / y_test are unused.
all_predictions, all_true_labels = [], []

for train_index, test_index in kfold.split(xt_bi_countvec, y):
    X_train = xt_bi_countvec[train_index]
    X_test = xt_bi_countvec[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    model3.fit(X_train, y_train)
    y_pred = model3.predict(X_train)

    all_predictions.extend(y_pred)
    all_true_labels.extend(y_train)

# Overall accuracy and per-class report
accuracy = accuracy_score(all_true_labels, all_predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(all_true_labels, all_predictions))

# Confusion matrix, then each cell as a fraction of the total amount of examples
confusion = confusion_matrix(all_true_labels, all_predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix (Percentage)")
plt.show()

## Train Test Split / Splitter classes

In [None]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = LinearSVC()
model4 = MultinomialNB()
model5 = RandomForestClassifier()

# Model evaluated in this cell
classifier = model5

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Held-out accuracy of every fold. The list is created here so the cell does
# not depend on a leftover `accuracies` from a previous cell.
accuracies = []

# Iterate over the folds generated by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    classifier.fit(X_train, y_train)
    y_train_pred = classifier.predict(X_train)
    # Predict the held-out fold (it used to predict X_train a second time)
    y_test_pred = classifier.predict(X_test)

    # Score this fold's own predictions instead of a stale `y_pred`
    accuracy = accuracy_score(y_test, y_test_pred)
    accuracies.append(accuracy)

average_accuracy = sum(accuracies) / len(accuracies)
print("Average accuracy:", average_accuracy)

# Train / test report for the last fold. It is written inline because no
# `metrics` function is defined at this point of the notebook.
print("training accuracy = ", round(accuracy_score(y_train, y_train_pred) * 100, 2))
print(classification_report(y_train, y_train_pred))
print("testing accuracy = ", round(accuracy_score(y_test, y_test_pred) * 100, 2))
print(classification_report(y_test, y_test_pred))

### Metrics used to evaluate the models

In [None]:
def metrics(y_train, y_train_pred, y_test, y_test_pred):
    """Print and plot a train / test report from the predictions of a model.

    The function used to be wrapped in a triple-quoted string, so it was never
    actually defined. For each split it prints the accuracy as a percentage,
    prints the classification report and shows the confusion matrix normalised
    over all the samples.

    Parameters
    ----------
    y_train, y_train_pred : array-like
        True and predicted labels of the training split.
    y_test, y_test_pred : array-like
        True and predicted labels of the testing split.
    """
    splits = (
        ("training", y_train, y_train_pred),
        ("testing", y_test, y_test_pred),
    )
    for split_name, y_true, y_hat in splits:
        # Round after scaling: round(x, 2) * 100 printed values such as 56.99999999999999
        print(f"{split_name} accuracy = ", round(accuracy_score(y_true, y_hat) * 100, 2))
        ConfusionMatrixDisplay.from_predictions(y_true, y_hat, normalize='all')
        print(classification_report(y_true, y_hat))
        plt.show()

### To plot the model

In [None]:
## True labels against the predictions of the model on the last held-out fold
# `predictions` is not defined at this point of the notebook (later cells use
# that name for training-fold predictions of a different length), so the
# predictions for X_test are computed here.
plt.scatter(y_test, classifier.predict(X_test))
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.show()

In [None]:
# We can just change the model name and the object to pickle depending on which model we want to dump
# Saving the model to a pickle file in order to access it later
#with open('dtTFG.pickle', 'wb') as handle:
#    pickle.dump(dt, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Cross val score of the default models with the default data

In [None]:
# Default (untuned) instance of every model that will be analysed
model1, model2, model3, model4, model5 = (
    LogisticRegression(),
    DecisionTreeClassifier(),
    LinearSVC(),
    MultinomialNB(),
    RandomForestClassifier(),
)

# Same instances gathered in a list so they can be iterated over
models = [model1, model2, model3, model4, model5]

We will see the accuracy of the default models with the default dataset.

In [None]:
xt_bi = bigram_tfidf.fit_transform(data_cleaned["text"])
y = df['sentiment'].values

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate every default model with cross-validation on the whole dataset.
# The previous fold loop only kept the training part of the LAST split, and
# cross_val_score was then run on that subset alone; passing the splitter
# directly lets every sample be used for validation exactly once.
scores = []
for model in models:
    cv_scores = cross_val_score(model, xt_bi, y, cv=kfold)

    # Keep the mean score over the folds
    scores.append(cv_scores.mean())

# Dict with the scores of the models
first_results = {"Score": scores}

# Mapping from the position of each model in `models` to its display name
model_names = {0: 'LogisticRegression', 1: 'DecisionTree', 2: 'LinearSVC', 3: 'NB', 4: 'RandomForest'}

# Build the dataframe and replace the positional index with the model names
first_results_df = pd.DataFrame(first_results).rename(index=model_names)

# Store the dataframe in a CSV file
first_results_df.to_csv('first_results_df_results.csv')

In [None]:
# Display the table with the cross-validation score of every default model
first_results_df

# Precision-recall curve, ROC curve & Learning curve

These curves show each model's ability to distinguish between the positive and negative classes across different probability thresholds.

We will compute and plot the default scores of each model with the optimal vectorizer configuration but the default hyperparameter configuration, and finally we will see them all together.

## Linear SVC

In [None]:
# `count_vec` was used without ever being defined in the notebook, so it is
# created here. NOTE(review): the settings mirror the CountVectorizer used in
# the feature-count comparison (without a feature cap) - confirm the intended ones.
count_vec = CountVectorizer(ngram_range=(1, 2), max_df=0.5)
xt = count_vec.fit_transform(data_cleaned["text"])
y = data_cleaned["sentiment"]

In [None]:
# Rows of the document-term matrix are samples, columns are features
n_samples, n_features = xt.shape

In [None]:
# Print the number of samples and the number of features
print(n_samples, n_features)

In [None]:
linearSvc = LinearSVC()

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

# Hard predictions, true labels and continuous decision scores of every fold
predictions = []
true_labels = []
decision_scores = []

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds generated by StratifiedKFold.
# NOTE(review): the model is scored on the same fold it was fit on, so every
# figure below is a TRAINING score; X_test / y_test are not used.
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    linearSvc.fit(X_train, y_train)
    y_pred = linearSvc.predict(X_train)

    predictions.extend(y_pred)
    true_labels.extend(y_train)
    decision_scores.extend(linearSvc.decision_function(X_train))

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Get the confusion matrix
confusion = confusion_matrix(true_labels, predictions)

# Get the total amount of examples
total_examples = np.sum(confusion)

# Express every cell of the matrix as a fraction of the total amount of examples
confusion_percentage = confusion / total_examples

# Plot the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("LinearSVC confusion matrix")
plt.show()

# Precision-Recall curve. It needs continuous scores: with the hard 0/1
# predictions the "curve" collapsed to a single operating point.
precision1, recall1, _ = precision_recall_curve(true_labels, decision_scores)
pr_auc1 = auc(recall1, precision1)

# Area under the precision-recall curve
print('LinearSVC AUC = %0.2f' % pr_auc1)

disp = PrecisionRecallDisplay(precision=precision1, recall=recall1)

disp.plot()
plt.show()

## ROC curve

In [None]:
# Continuous decision scores of the model for the last training fold
y_scores = linearSvc.decision_function(X_train)

# Points of the ROC curve and the area under it (AUC-ROC)
fpr, tpr, thresholds = roc_curve(y_train, y_scores)
auc_score = roc_auc_score(y_train, y_scores)

# Plot the ROC curve next to the diagonal reference line
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label='LinearSVC (AUC = %0.2f)' % auc_score)
roc_ax.plot([0, 1], [0, 1], 'k--')  # Reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve  - LinearSVC')
roc_ax.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the LinearSVC model; score_type="both" requests the
# training and the test scores
LearningCurveDisplay.from_estimator(linearSvc, xt, y, score_type="both")
plt.show()

## Trying to improve the model:

In [None]:
# Hyperparameter search space for LinearSVC
parameters = dict(
    C=[0.2, 0.3],
    dual=[True, False],
    fit_intercept=[True],
    multi_class=['crammer_singer', 'ovr'],
)

# Number of cross-validation folds
cv = 5

# Exhaustive search over the grid, ranked by accuracy, using all the CPU cores
grid_search = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=parameters,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit on the training part of the last fold produced above
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

In the end, the only change the search suggests for this LinearSVC model is the value of the regularization parameter `C`.

In [None]:
linearSvc = LinearSVC(random_state=42, C=0.2, fit_intercept=True, dual=False, multi_class='crammer_singer',
                      class_weight={0:0.6, 1:0.4})

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

# Hard predictions, true labels and continuous decision scores of every fold
predictions = []
true_labels = []
decision_scores = []

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds generated by StratifiedKFold.
# NOTE(review): the model is scored on the same fold it was fit on, so every
# figure below is a TRAINING score; X_test / y_test are not used.
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    linearSvc.fit(X_train, y_train)
    y_pred = linearSvc.predict(X_train)

    predictions.extend(y_pred)
    true_labels.extend(y_train)
    decision_scores.extend(linearSvc.decision_function(X_train))

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Get the confusion matrix
confusion = confusion_matrix(true_labels, predictions)

# Get the total amount of examples
total_examples = np.sum(confusion)

# Express every cell of the matrix as a fraction of the total amount of examples
confusion_percentage = confusion / total_examples

# Plot the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("LinearSVC confusion matrix")
plt.show()

# Precision-Recall curve. It needs continuous scores: with the hard 0/1
# predictions the "curve" collapsed to a single operating point.
precision1, recall1, _ = precision_recall_curve(true_labels, decision_scores)
pr_auc1 = auc(recall1, precision1)

# Area under the precision-recall curve
print('LinearSVC AUC = %0.2f' % pr_auc1)

disp = PrecisionRecallDisplay(precision=precision1, recall=recall1)

disp.plot()
plt.show()

## ROC curve

In [None]:
# Continuous decision scores of the tuned model for the last training fold
y_scores = linearSvc.decision_function(X_train)

# Points of the ROC curve and the area under it (AUC-ROC)
fpr, tpr, thresholds = roc_curve(y_train, y_scores)
auc_score = roc_auc_score(y_train, y_scores)

# Plot the ROC curve next to the diagonal reference line
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label='LinearSVC (AUC = %0.2f)' % auc_score)
roc_ax.plot([0, 1], [0, 1], 'k--')  # Reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve  - LinearSVC')
roc_ax.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the tuned LinearSVC model; score_type="both" requests the
# training and the test scores
LearningCurveDisplay.from_estimator(linearSvc, xt, y, score_type="both")
plt.show()

## Logistic Regression

In [None]:
logisticRegression = LogisticRegression()

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=4000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

# Hard predictions, true labels and positive-class probabilities of every fold
predictions = []
true_labels = []
positive_probabilities = []

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds generated by StratifiedKFold.
# NOTE(review): the model is scored on the same fold it was fit on, so every
# figure below is a TRAINING score; X_test / y_test are not used.
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    logisticRegression.fit(X_train, y_train)
    y_pred = logisticRegression.predict(X_train)

    predictions.extend(y_pred)
    true_labels.extend(y_train)
    # Column 1 holds the probability of the positive class
    positive_probabilities.extend(logisticRegression.predict_proba(X_train)[:, 1])

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Get the confusion matrix
confusion = confusion_matrix(true_labels, predictions)

# Get the total amount of examples
total_examples = np.sum(confusion)

# Express every cell of the matrix as a fraction of the total amount of examples
confusion_percentage = confusion / total_examples

# Plot the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("LogisticRegression Confusion matrix")
plt.show()

# Precision-Recall curve. It needs continuous scores: with the hard 0/1
# predictions the "curve" collapsed to a single operating point.
precision2, recall2, _ = precision_recall_curve(true_labels, positive_probabilities)
pr_auc2 = auc(recall2, precision2)

# Area under the precision-recall curve
print('Logistic Regression AUC = %0.2f' % pr_auc2)

disp = PrecisionRecallDisplay(precision=precision2, recall=recall2)

disp.plot()
plt.show()

In [None]:
# Class probabilities predicted for the last training fold; column 1 is the
# probability of the positive class
proba = logisticRegression.predict_proba(X_train)
y_scores = proba[:, 1]

# Points of the ROC curve and the area under it (AUC-ROC)
fpr, tpr, thresholds = roc_curve(y_train, y_scores)
auc_score = roc_auc_score(y_train, y_scores)

# Plot the ROC curve next to the diagonal reference line
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label='Logistic Regression (AUC = %0.2f)' % auc_score)
roc_ax.plot([0, 1], [0, 1], 'k--')  # Reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Logistic Regression')
roc_ax.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the logistic regression model; score_type="both" requests
# the training and the test scores
LearningCurveDisplay.from_estimator(logisticRegression, xt, y, score_type="both")
plt.show()

## Trying to improve the model:

In [None]:
# Define the hyperparameter search space.
# NOTE(review): every entry except 'l1_ratio' has a single value, so 'l1_ratio'
# is the only thing that varies. As far as I know scikit-learn only uses
# 'l1_ratio' with the elastic-net penalty, which 'liblinear' does not support,
# so the nine candidates may be identical models - confirm against the
# LogisticRegression documentation and consider searching over 'C' instead.
parameters = {
    'dual': [True],
    'fit_intercept': [True],
    'solver': ['liblinear'],
    'random_state':[42],
    'l1_ratio':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

# Number of cross-validation folds
cv = 5

# Run the hyperparameter search with GridSearchCV, ranked by accuracy
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the search on X_train / y_train, i.e. the training part of the last fold
# left over from the cell above
grid_search.fit(X_train, y_train)

# Print the best parameters and the best accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

In [None]:
logisticRegression = LogisticRegression(random_state=42, fit_intercept=True, solver='liblinear', dual=True)

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=4000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

# Hard predictions, true labels and positive-class probabilities of every fold
predictions = []
true_labels = []
positive_probabilities = []

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds generated by StratifiedKFold.
# NOTE(review): the model is scored on the same fold it was fit on, so every
# figure below is a TRAINING score; X_test / y_test are not used.
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    logisticRegression.fit(X_train, y_train)
    y_pred = logisticRegression.predict(X_train)

    predictions.extend(y_pred)
    true_labels.extend(y_train)
    # Column 1 holds the probability of the positive class
    positive_probabilities.extend(logisticRegression.predict_proba(X_train)[:, 1])

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Get the confusion matrix
confusion = confusion_matrix(true_labels, predictions)

# Get the total amount of examples
total_examples = np.sum(confusion)

# Express every cell of the matrix as a fraction of the total amount of examples
confusion_percentage = confusion / total_examples

# Plot the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("LogisticRegression Confusion matrix")
plt.show()

# Precision-Recall curve. It needs continuous scores: with the hard 0/1
# predictions the "curve" collapsed to a single operating point.
precision2, recall2, _ = precision_recall_curve(true_labels, positive_probabilities)
pr_auc2 = auc(recall2, precision2)

# Area under the precision-recall curve
print('Logistic Regression AUC = %0.2f' % pr_auc2)

disp = PrecisionRecallDisplay(precision=precision2, recall=recall2)

disp.plot()
plt.show()

In [None]:
# Class probabilities predicted for the last training fold; column 1 is the
# probability of the positive class
proba = logisticRegression.predict_proba(X_train)
y_scores = proba[:, 1]

# Points of the ROC curve and the area under it (AUC-ROC)
fpr, tpr, thresholds = roc_curve(y_train, y_scores)
auc_score = roc_auc_score(y_train, y_scores)

# Plot the ROC curve next to the diagonal reference line
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label='Logistic Regression (AUC = %0.2f)' % auc_score)
roc_ax.plot([0, 1], [0, 1], 'k--')  # Reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Logistic Regression')
roc_ax.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the tuned logistic regression model; score_type="both"
# requests the training and the test scores
LearningCurveDisplay.from_estimator(logisticRegression, xt, y, score_type="both")
plt.show()

## (parenthesis) Checking if threshold affects auc accuracy

In [None]:
from sklearn.metrics import precision_score, recall_score

# Estimated class-membership probabilities for the current X_test
proba = logisticRegression.predict_proba(X_test)

# Re-classify with increasingly strict thresholds and report precision / recall
for position, threshold in enumerate((0.5, 0.6, 0.7, 0.8)):
    # An instance is positive when its positive-class probability reaches the threshold
    y_pred = (proba[:, 1] >= threshold).astype(int)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Every report after the first one is preceded by a blank line
    if position == 0:
        print(f"Umbral: {threshold}")
    else:
        print(f"\nUmbral: {threshold}")
    print(f"Precisión: {precision}")
    print(f"Recall: {recall}")
    print("Score:", precision-recall)

After this check, we can say that the decision threshold does affect the classification results.

## MultinomialNB

In [None]:
# Fit a default MultinomialNB on bigram TF-IDF features with stratified 5-fold splits.
# NOTE: every metric below is computed on the *training* split of each fold (each
# sample is therefore counted once per fold it trains in); it measures fit, not
# generalisation.
multinomialNB = MultinomialNB()

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=5000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

predictions = []   # hard class predictions, accumulated over folds
pred_scores = []   # positive-class probabilities, accumulated over folds
true_labels = []   # matching ground-truth labels

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the generated folds by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    # The fold indices are positional, so select rows by position (.iloc) rather
    # than by index label, which breaks when the index is not 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    multinomialNB.fit(X_train, y_train)
    y_pred = multinomialNB.predict(X_train)

    predictions.extend(y_pred)
    pred_scores.extend(multinomialNB.predict_proba(X_train)[:, 1])
    true_labels.extend(y_train)

# Accuracy and per-class report on the accumulated training predictions
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Confusion matrix, expressed as a fraction of all accumulated examples
confusion = confusion_matrix(true_labels, predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("MultinomialNB Confusion matrix")
plt.show()

# Precision-Recall curve built from probability scores; hard 0/1 predictions would
# collapse the curve to a single operating point.
precision3, recall3, _ = precision_recall_curve(true_labels, pred_scores)
pr_auc3 = auc(recall3, precision3)

print('MultinomialNB AUC = %0.2f' % pr_auc3)

disp = PrecisionRecallDisplay(precision=precision3, recall=recall3)
disp.plot()
plt.show()

In [None]:
# ROC curve of MultinomialNB on the training split left over from the last fold
# of the previous cell.
proba = multinomialNB.predict_proba(X_train)
y_scores = proba[:, 1]  # column 1 = probability of the positive class

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)
auc_score = roc_auc_score(y_true=y_train, y_score=y_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='MultinomialNB (AUC = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - MultinomialNB',
)
plt.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of MultinomialNB on the full TF-IDF matrix `xt`;
# score_type="both" draws the training and the test score curves.
LearningCurveDisplay.from_estimator(multinomialNB, xt, y, score_type="both")
plt.show()

In [None]:
# Number of features the classifier saw during fit
multinomialNB.n_features_in_

In [None]:
# Per-class, per-feature counts accumulated during fit
multinomialNB.feature_count_

In [None]:
# Number of training samples encountered for each class during fit
multinomialNB.class_count_

In [None]:
# Log prior probability of each class
multinomialNB.class_log_prior_

In [None]:
# Class labels known to the classifier (order matches the predict_proba columns)
multinomialNB.classes_

In [None]:
# Log probability of each feature given each class
multinomialNB.feature_log_prob_

## Trying to improve the model:

In [None]:
# Hyperparameter grid explored for MultinomialNB
parameters = dict(
    alpha=[0.1, 0.5, 1, 1.5, 2],
    fit_prior=[True, False],
    class_prior=[None, [0.2, 0.8], [0.5, 0.5], [0.8, 0.2], [0.3, 0.7]],
)

# Number of cross-validation folds
cv = 5

# Exhaustive search over the grid, scored by accuracy, using every available core
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=parameters,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Search on the training split left over from the last fold of the training cell
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

After trying to fit the model with different parameter combinations, the model does not improve, so it will keep its default parameters.

## DecisionTreeClassifier

In [None]:
# Fit a default DecisionTreeClassifier on bigram TF-IDF features with stratified
# 5-fold splits.
# NOTE: every metric below is computed on the *training* split of each fold (each
# sample is therefore counted once per fold it trains in); it measures fit, not
# generalisation.
decisionTreeClassifier = DecisionTreeClassifier()

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

predictions = []   # hard class predictions, accumulated over folds
pred_scores = []   # positive-class probabilities, accumulated over folds
true_labels = []   # matching ground-truth labels

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the generated folds by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    # The fold indices are positional, so select rows by position (.iloc) rather
    # than by index label, which breaks when the index is not 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    decisionTreeClassifier.fit(X_train, y_train)
    y_pred = decisionTreeClassifier.predict(X_train)

    predictions.extend(y_pred)
    pred_scores.extend(decisionTreeClassifier.predict_proba(X_train)[:, 1])
    true_labels.extend(y_train)

# Accuracy and per-class report on the accumulated training predictions
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Confusion matrix, expressed as a fraction of all accumulated examples
confusion = confusion_matrix(true_labels, predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("DecisionTreeClassifier Confusion matrix")
plt.show()

# Precision-Recall curve built from probability scores; hard 0/1 predictions would
# collapse the curve to a single operating point.
precision4, recall4, _ = precision_recall_curve(true_labels, pred_scores)
pr_auc4 = auc(recall4, precision4)

print('DecisionTreeClassifier AUC = %0.2f' % pr_auc4)

disp = PrecisionRecallDisplay(precision=precision4, recall=recall4)
disp.plot()
plt.show()

In [None]:
# ROC curve of the decision tree on the training split left over from the last
# fold of the previous cell.
proba = decisionTreeClassifier.predict_proba(X_train)
y_scores = proba[:, 1]  # column 1 = probability of the positive class

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)
auc_score = roc_auc_score(y_true=y_train, y_score=y_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='DecisionTreeClassifier (AUC = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - DecisionTreeClassifier',
)
plt.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the decision tree on the full TF-IDF matrix `xt`;
# score_type="both" draws the training and the test score curves.
LearningCurveDisplay.from_estimator(decisionTreeClassifier, xt, y, score_type="both")
plt.show()

## Trying to improve the model:

In [None]:
# Hyperparameter grid explored for the decision tree (depth and seed are fixed)
parameters = dict(
    max_depth=[6],
    min_samples_leaf=list(range(2, 11)),   # 2 .. 10
    max_leaf_nodes=list(range(10, 31)),    # 10 .. 30
    random_state=[42],
)

# Number of cross-validation folds
cv = 5

# Exhaustive search over the grid, scored by accuracy, using every available core
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Search on the training split left over from the last fold of the training cell
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

In [None]:
# Fit the tuned (size-limited) DecisionTreeClassifier on bigram TF-IDF features with
# stratified 5-fold splits.
# NOTE: every metric below is computed on the *training* split of each fold (each
# sample is therefore counted once per fold it trains in); it measures fit, not
# generalisation.
decisionTreeClassifier = DecisionTreeClassifier(random_state=42, max_depth=6, min_samples_leaf=4, max_leaf_nodes=14)

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

predictions = []   # hard class predictions, accumulated over folds
pred_scores = []   # positive-class probabilities, accumulated over folds
true_labels = []   # matching ground-truth labels

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the generated folds by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    # The fold indices are positional, so select rows by position (.iloc) rather
    # than by index label, which breaks when the index is not 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    decisionTreeClassifier.fit(X_train, y_train)
    y_pred = decisionTreeClassifier.predict(X_train)

    predictions.extend(y_pred)
    pred_scores.extend(decisionTreeClassifier.predict_proba(X_train)[:, 1])
    true_labels.extend(y_train)

# Accuracy and per-class report on the accumulated training predictions
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Confusion matrix, expressed as a fraction of all accumulated examples
confusion = confusion_matrix(true_labels, predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("DecisionTreeClassifier Confusion matrix")
plt.show()

# Precision-Recall curve built from probability scores; hard 0/1 predictions would
# collapse the curve to a single operating point.
precision4, recall4, _ = precision_recall_curve(true_labels, pred_scores)
pr_auc4 = auc(recall4, precision4)

print('DecisionTreeClassifier AUC = %0.2f' % pr_auc4)

disp = PrecisionRecallDisplay(precision=precision4, recall=recall4)
disp.plot()
plt.show()

In [None]:
# ROC curve of the tuned decision tree on the training split left over from the
# last fold of the previous cell.
proba = decisionTreeClassifier.predict_proba(X_train)
y_scores = proba[:, 1]  # column 1 = probability of the positive class

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)
auc_score = roc_auc_score(y_true=y_train, y_score=y_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='DecisionTreeClassifier (AUC = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - DecisionTreeClassifier',
)
plt.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the tuned decision tree on the full TF-IDF matrix `xt`;
# score_type="both" draws the training and the test score curves.
LearningCurveDisplay.from_estimator(decisionTreeClassifier, xt, y, score_type="both")
plt.show()

## RandomForestClassifier

In [None]:
# Fit a default RandomForestClassifier on bigram TF-IDF features with stratified
# 5-fold splits.
# NOTE: every metric below is computed on the *training* split of each fold (each
# sample is therefore counted once per fold it trains in); it measures fit, not
# generalisation.
randomForestClassifier = RandomForestClassifier()

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

predictions = []   # hard class predictions, accumulated over folds
pred_scores = []   # positive-class probabilities, accumulated over folds
true_labels = []   # matching ground-truth labels

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the generated folds by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    # The fold indices are positional, so select rows by position (.iloc) rather
    # than by index label, which breaks when the index is not 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    randomForestClassifier.fit(X_train, y_train)
    y_pred = randomForestClassifier.predict(X_train)

    predictions.extend(y_pred)
    pred_scores.extend(randomForestClassifier.predict_proba(X_train)[:, 1])
    true_labels.extend(y_train)

# Accuracy and per-class report on the accumulated training predictions
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Confusion matrix, expressed as a fraction of all accumulated examples
confusion = confusion_matrix(true_labels, predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("RandomForestClassifier Confusion matrix")
plt.show()

# Precision-Recall curve built from probability scores; hard 0/1 predictions would
# collapse the curve to a single operating point.
precision5, recall5, _ = precision_recall_curve(true_labels, pred_scores)
pr_auc5 = auc(recall5, precision5)

print('RandomForestClassifier AUC = %0.2f' % pr_auc5)

disp = PrecisionRecallDisplay(precision=precision5, recall=recall5)
disp.plot()
plt.show()

In [None]:
# ROC curve of the random forest on the training split left over from the last
# fold of the previous cell.
proba = randomForestClassifier.predict_proba(X_train)
y_scores = proba[:, 1]  # column 1 = probability of the positive class

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)
auc_score = roc_auc_score(y_true=y_train, y_score=y_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='RandomForestClassifier (AUC = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - RandomForestClassifier',
)
plt.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the random forest on the full TF-IDF matrix `xt`;
# score_type="both" draws the training and the test score curves, and
# n_jobs=-1 runs the fits on all available cores.
LearningCurveDisplay.from_estimator(randomForestClassifier, xt, y, score_type="both", n_jobs=-1)
plt.show()

## Trying to improve the model:

In [None]:
# Hyperparameter grid explored for the random forest (seed is fixed).
# This is a large grid, so the search is expensive.
parameters = dict(
    max_depth=[10, 12, 15, 16, 18, 20],
    n_estimators=[50, 100, 120, 150],
    min_samples_leaf=list(range(1, 4)),        # 1 .. 3
    max_leaf_nodes=[10, *range(13, 31)],       # 10, then 13 .. 30
    random_state=[42],
)

# Number of cross-validation folds
cv = 5

# Exhaustive search over the grid, scored by accuracy, using every available core
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Search on the training split left over from the last fold of the training cell
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

In [None]:
# Fit the tuned, class-weighted RandomForestClassifier on bigram TF-IDF features with
# stratified 5-fold splits.
# NOTE: every metric below is computed on the *training* split of each fold (each
# sample is therefore counted once per fold it trains in); it measures fit, not
# generalisation.
randomForestClassifier = RandomForestClassifier(random_state=42, n_jobs=3, max_depth=10, n_estimators=120, min_samples_leaf=2,
                                               class_weight={0:0.56, 1:0.44}, max_leaf_nodes=12)

tfidf = TfidfVectorizer(ngram_range=(1, 2), use_idf=True, max_features=1000)
xt = tfidf.fit_transform(data_cleaned["text"])
y = data_cleaned['sentiment']

predictions = []   # hard class predictions, accumulated over folds
pred_scores = []   # positive-class probabilities, accumulated over folds
true_labels = []   # matching ground-truth labels

# Initialize StratifiedKFold with the desired number of folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the generated folds by StratifiedKFold
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    # The fold indices are positional, so select rows by position (.iloc) rather
    # than by index label, which breaks when the index is not 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    randomForestClassifier.fit(X_train, y_train)
    y_pred = randomForestClassifier.predict(X_train)

    predictions.extend(y_pred)
    pred_scores.extend(randomForestClassifier.predict_proba(X_train)[:, 1])
    true_labels.extend(y_train)

# Accuracy and per-class report on the accumulated training predictions
accuracy = accuracy_score(true_labels, predictions)
print("Training accuracy = ", accuracy*100)
print(classification_report(true_labels, predictions))

# Confusion matrix, expressed as a fraction of all accumulated examples
confusion = confusion_matrix(true_labels, predictions)
total_examples = np.sum(confusion)
confusion_percentage = confusion / total_examples

# Heatmap of the normalised confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_percentage, annot=True, cmap="Blues", cbar=False)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("RandomForestClassifier Confusion matrix")
plt.show()

# Precision-Recall curve built from probability scores; hard 0/1 predictions would
# collapse the curve to a single operating point.
precision5, recall5, _ = precision_recall_curve(true_labels, pred_scores)
pr_auc5 = auc(recall5, precision5)

print('RandomForestClassifier AUC = %0.2f' % pr_auc5)

disp = PrecisionRecallDisplay(precision=precision5, recall=recall5)
disp.plot()
plt.show()

In [None]:
# ROC curve of the tuned random forest on the training split left over from the
# last fold of the previous cell.
proba = randomForestClassifier.predict_proba(X_train)
y_scores = proba[:, 1]  # column 1 = probability of the positive class

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(y_true=y_train, y_score=y_scores)
auc_score = roc_auc_score(y_true=y_train, y_score=y_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='RandomForestClassifier (AUC = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - RandomForestClassifier',
)
plt.legend(loc='lower right')
plt.show()

## Learning curve

In [None]:
# Learning curve of the tuned random forest on the full TF-IDF matrix `xt`;
# score_type="both" draws the training and the test score curves, and
# n_jobs=-1 runs the fits on all available cores.
LearningCurveDisplay.from_estimator(randomForestClassifier, xt, y, score_type="both", n_jobs=-1)
plt.show()

# Comparing models learning curves

In [None]:
# Compare the cross-validated learning curves of the five models.
# Order of `models` / `model_names` (the legend follows this order):
#   1. LinearSVC   2. LogisticRegression   3. MultinomialNB
#   4. DecisionTreeClassifier   5. RandomForestClassifier

# Fractions of the training data used for the successive points of each curve
train_sizes = np.linspace(0.1, 1.0, 10)

models = [linearSvc, logisticRegression, multinomialNB, decisionTreeClassifier, randomForestClassifier]
model_names = ['Linear SVC', 'Logistic Regression', 'Multinomial Naive Bayes', 'Decision Tree', 'Random Forest']

for model, model_name in zip(models, model_names):
    train_sizes_abs, train_scores, test_scores = learning_curve(model, X_train, y_train, train_sizes=train_sizes)

    # Mean and spread of the held-out (cross-validation) score at each training size
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Curve of the mean score with a +/- one standard deviation band
    plt.plot(train_sizes_abs, test_mean, 'o-', label='{}'.format(model_name + ' (Test)'))
    plt.fill_between(train_sizes_abs, test_mean - test_std, test_mean + test_std, alpha=0.1)

plt.legend(loc='best')
plt.title('Learning curves for models over the test set')
# The x values are the absolute numbers of training examples, not test sizes
plt.xlabel('Training set size')
plt.ylabel('Score')
plt.show()

*--------------------------------------------------------------------------------------------------------------------------------*

# Everything from this cell onwards is not considered: it was not included in the report and is left for future work.

The results and functions used here may not be up to date or correctly implemented, as this has been just exploratory research.

*--------------------------------------------------------------------------------------------------------------------------------*

## Analyzing errors

In [None]:
models = [linearSvc, logisticRegression, multinomialNB, decisionTreeClassifier, randomForestClassifier]
scores_per_model = []  # the raw 5-fold scores of every model, in `models` order
scores_mean = []
scores_std = []

# Run 5-fold cross-validation for each model
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    scores_per_model.append(scores)
    scores_mean.append(scores.mean())
    scores_std.append(scores.std())

# Print the scores obtained by each model (its own fold scores, not those of the
# last model evaluated)
for i, model_scores in enumerate(scores_per_model):
    print("Model", i+1)
    print("Cross validation scores:", model_scores)
    print("Scores mean:", scores_mean[i])
    print("Standard deviation of the score:", scores_std[i])
    print()

# Comparative plot: mean score per model with +/- one standard deviation error bars
plt.figure(figsize=(10, 6))
plt.errorbar(range(1, len(models)+1), scores_mean, yerr=scores_std, fmt='o-', capsize=5)
plt.xticks(range(1, len(models)+1))
plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Cross validation score comparative')
plt.savefig('grafico_cross_validation_5_models.png', dpi=300, pad_inches=0.5)
plt.show()

## Data augmentation

In [None]:
import nltk
from nltk.corpus import wordnet
import random

def get_synonyms(word):
    """Return the lemma names of every WordNet synset of `word`.

    Names are listed synset by synset, in the order WordNet returns them;
    duplicates are kept. The list is empty when the word has no synset.
    """
    return [
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    ]

def augment_with_synonyms(corpus):
    """Return a new corpus in which tokens are swapped for a WordNet synonym.

    Each sentence is tokenised with nltk; every token for which get_synonyms
    finds at least one entry is replaced by the first entry, other tokens are
    kept as they are. Tokens are re-joined with single spaces, so the result
    has exactly one sentence per input sentence.
    """
    def first_synonym_or_self(token):
        candidates = get_synonyms(token)
        return candidates[0] if candidates else token

    return [
        ' '.join(first_synonym_or_self(token) for token in nltk.word_tokenize(sentence))
        for sentence in corpus
    ]


def augment_with_shuffle_words(corpus, seed=None):
    """Return a new corpus in which the tokens of every text are randomly reordered.

    Parameters
    ----------
    corpus : iterable of str
        Texts to augment; each one is tokenised with nltk.
    seed : int, optional
        When given, a private random generator seeded with this value is used, so
        the result is reproducible. When omitted, the module-level `random` state
        is used.

    Returns
    -------
    list of str
        One shuffled text per input text, tokens joined with single spaces.
    """
    rng = random if seed is None else random.Random(seed)
    augmented_corpus = []
    for text in corpus:
        tokens = nltk.word_tokenize(text)
        rng.shuffle(tokens)  # in-place shuffle of this text's tokens only
        augmented_corpus.append(' '.join(tokens))
    return augmented_corpus

In [None]:
def clean_corpus(corpus):
    """Clean the texts of the global `df` and return them as a list.

    NOTE(review): the `corpus` argument is not used; the function reads
    `df['text']` so that rows can be dropped from `df` and the labels stay
    aligned with the returned texts. Confirm this is intended.

    For every row, runs of digits / non-word characters are collapsed into a
    single space and a few accented letters are replaced. Rows whose text is
    not a string, or whose cleaned text is an entry of `stuff_to_be_removed`,
    are dropped from `df` in place.

    Returns
    -------
    list of str
        The cleaned texts of the rows that were kept, in index order.
    """
    # Built once; also lets a missing `stuff_to_be_removed` fail loudly instead
    # of silently dropping every row.
    removable = set(stuff_to_be_removed)

    final_corpus = []
    for i in df.index:
        try:
            # Remove digits and punctuation
            text = re.sub("(\\d|\\W)+", " ", df['text'][i])
        except TypeError:
            # Non-string text (e.g. a missing value): discard the row
            df.drop(i, axis=0, inplace=True)
            continue

        text = re.sub(r'[ÂÃ]', 'A', text)
        text = re.sub(r"[şŝšś]", "s", text)
        text = re.sub(r"[ĤĦĥħ]", "H", text)
        text = re.sub(r"[ĆĈĊČćĉċč]", "c", text)

        if text in removable:
            df.drop(i, axis=0, inplace=True)
        else:
            final_corpus.append(text)

    return final_corpus

In [None]:
# Build the cleaned corpus that will later be augmented. Rows of `df` whose text is
# not a string, or whose cleaned text is an entry of `stuff_to_be_removed`, are
# dropped from `df` in place.

# Built once; also lets a missing `stuff_to_be_removed` fail loudly instead of
# silently dropping every row.
removable = set(stuff_to_be_removed)

final_corpus_augmented = []
for i in df.index:
    try:
        # Remove digits and punctuation
        text = re.sub("(\\d|\\W)+", " ", df['text'][i])
    except TypeError:
        # Non-string text (e.g. a missing value): discard the row
        df.drop(i, axis=0, inplace=True)
        continue

    text = re.sub(r'[ÂÃ]', 'A', text)
    text = re.sub(r"[şŝšś]", "s", text)
    text = re.sub(r"[ĤĦĥħ]", "H", text)
    text = re.sub(r"[ĆĈĊČćĉċč]", "c", text)

    if text in removable:
        df.drop(i, axis=0, inplace=True)
    else:
        final_corpus_augmented.append(text)

In [None]:
# Inspect the cleaned corpus (displays the whole list)
final_corpus_augmented

In [None]:
# Replace every token that has a synonym with its first synonym; the corpus keeps
# the same number of texts, only their wording changes.
# NOTE: this overwrites its own input, so re-running the cell applies the
# replacement again on the already-augmented corpus.
final_corpus_augmented = augment_with_synonyms(final_corpus_augmented)

In [None]:
# Inspect the corpus after the synonym replacement (displays the whole list)
final_corpus_augmented

### Study the cross-validation score of the models using different augmented data sets

In [None]:
def apply_default(corpus, models):
    """Cross-validate every model on the un-augmented bigram TF-IDF features.

    Parameters
    ----------
    corpus
        Not used; the features are always built from `data_cleaned["text"]`.
    models : iterable of estimators
        Models to evaluate with 5-fold cross-validation.

    Returns
    -------
    dict
        ``{"default": [mean CV score of each model, in input order]}``.
    """
    # NOTE(review): features come from `data_cleaned` but labels from `df`;
    # this assumes both hold the same rows in the same order - confirm.
    xt_bi = bigram_tfidf.fit_transform(data_cleaned["text"])
    y = df['sentiment'].values

    # Mean 5-fold cross-validation score of every model on the default data set
    scores = [cross_val_score(model, xt_bi, y, cv=5).mean() for model in models]

    return {"default": scores}

def apply_data_augmentation(corpus, tecnica):
    """Build an augmented training split for one augmentation technique.

    The corpus is cleaned with clean_corpus (expected to return the list of
    cleaned texts), augmented, vectorised with `bigram_tfidf`, and split with
    a stratified 5-fold; the training part of the last fold is returned.

    Parameters
    ----------
    corpus
        Passed to clean_corpus.
    tecnica : str
        One of "Synonyms", "Shuffle", "1Syn2Shuff" (synonyms, then shuffle) or
        "2Syn1Shuff" (shuffle, then synonyms).

    Returns
    -------
    tuple
        ``(X_train, y_train)`` of the last fold.

    Raises
    ------
    ValueError
        If `tecnica` is not one of the supported techniques.
    """
    # Augmentation steps of each technique, in application order
    pipelines = {
        "Synonyms": (augment_with_synonyms,),
        "Shuffle": (augment_with_shuffle_words,),
        "1Syn2Shuff": (augment_with_synonyms, augment_with_shuffle_words),
        "2Syn1Shuff": (augment_with_shuffle_words, augment_with_synonyms),
    }
    if tecnica not in pipelines:
        raise ValueError(
            "Unknown augmentation technique %r; expected one of %s"
            % (tecnica, sorted(pipelines))
        )

    augmented_corpus = clean_corpus(corpus)
    for augment in pipelines[tecnica]:
        augmented_corpus = augment(augmented_corpus)

    xt_bi = bigram_tfidf.fit_transform(augmented_corpus)
    y = df['sentiment'].values

    # Stratified 5-fold split; only the last fold is kept
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(xt_bi, y):
        pass

    # Slice the freshly augmented features, not the stale global `xt`
    X_train = xt_bi[train_index]
    y_train = y[train_index]

    return X_train, y_train

In [None]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = LinearSVC()
model4 = MultinomialNB()
model5 = RandomForestClassifier()

# Evaluate these fresh models, in the same order as `model_names` below, instead of
# whatever `models` list an earlier cell left in the kernel (different order).
models = [model1, model2, model3, model4, model5]

# Data augmentation techniques to compare
tecnicas = ['Synonyms', 'Shuffle', '1Syn2Shuff', '2Syn1Shuff']

# Start the results with the scores on the default (non-augmented) data set
results = apply_default(final_corpus, models)

# Evaluate every model on every augmented data set
for tecnica in tecnicas:
    # The augmented data does not depend on the model: build it once per technique
    # so that all models are scored on exactly the same data.
    X_train_augmented, y_train_augmented = apply_data_augmentation(corpus, tecnica)

    scores = []
    for model in models:
        # Mean 5-fold cross-validation score of the model on the augmented data
        cv_scores = cross_val_score(model, X_train_augmented, y_train_augmented, cv=5)
        scores.append(cv_scores.mean())

    # One column of scores per technique
    results[tecnica] = scores

# One row per model, one column per data set
data_augmentation_results = pd.DataFrame(results)

# Map the positional row index to model names
model_names = {0: 'LogisticRegression', 1: 'DecisionTree', 2: 'LinearSVC', 3: 'NB', 4: 'RandomForest'}
data_augmentation_results.rename(index=model_names, inplace=True)

# Optionally persist the table as CSV
#data_augmentation_results.to_csv('data_augmentation_results.csv')

After using multiple data augmentation techniques (plus the default data set), I store the results and check which one is the best.

#### Data augmentation scores results

In [None]:
# Mean cross-validation score per model (rows) and data set / technique (columns)
data_augmentation_results

Just to be sure that I choose the best data set, I will plot the same graphs as before and look at the differences across the different metrics.

## Auxiliary cell

This is just an auxiliary cell that creates the X_train and y_train sets used to evaluate the models with different data (trying different combinations of data augmentation).

In [None]:
# Fresh, unfitted models; the individual names are kept for later cells
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    LinearSVC(),
    MultinomialNB(),
    RandomForestClassifier(),
]
model1, model2, model3, model4, model5 = models

# Pick ONE augmentation variant by (un)commenting the matching lines.

# Synonyms only
#syn_corpus = augment_with_synonyms(corpus)

# Shuffle only (the active variant)
shuffle_corpus = augment_with_shuffle_words(corpus)

# Synonyms, then shuffle
#syn_corpus = augment_with_synonyms(corpus)
#shuffle_corpus = augment_with_shuffle_words(syn_corpus)

# Shuffle, then synonyms
#shuffle_corpus = augment_with_shuffle_words(corpus)
#syn_corpus = augment_with_synonyms(shuffle_corpus)

# Vectorise the chosen corpus and fetch the labels
xt = bigram_tfidf.fit_transform(shuffle_corpus)
y = df['sentiment'].values

# Stratified 5-fold split; only the indices of the last fold are kept
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_index, test_index = list(kfold.split(xt, y))[-1]

X_train, X_test = xt[train_index], xt[test_index]
y_train, y_test = y[train_index], y[test_index]

# Comparing models learning curves

After applying these techniques we still see that models 1, 3 and 4 (the linear ones, whose algorithms partly compensate for the class imbalance) have a good score, whereas models 2 and 5 (the DecisionTree and the RandomForest), although their scores have improved and the imbalance is reduced, still have a high standard deviation and still suffer from the imbalance problem.\
To solve that, I will try other imbalance techniques to balance the data before evaluating the models, and then see whether the performance improves. This will be done for the models that have the problem, the DecisionTree and the RandomForest; as said before, we will use the augmented dataset that best fits our models (the shuffled dataset).

In [None]:
# Show the augmentation score table again for reference
data_augmentation_results

Here we will use the shuffled augmented dataset because in the pdf we have seen that it helps the models most affected by the imbalance cope with it better than the other augmented datasets.

In [None]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = LinearSVC()
model4 = MultinomialNB()
model5 = RandomForestClassifier()

models = [model1, model2, model3, model4, model5]

results = {}

techniques = [RandomOverSampler(), RandomUnderSampler(), SMOTEENN()]

# The data preparation depends on neither the sampler nor the model, so it is done
# once: every sampler/model pair is then compared on exactly the same data.
shuffle_corpus = augment_with_shuffle_words(final_corpus)

xt = tfidf.fit_transform(shuffle_corpus)
y = df['sentiment'].values

# Stratified 5-fold split; only the last fold's train/test split is kept
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kfold.split(xt, y):
    X_train, X_test = xt[train_index], xt[test_index]
    y_train, y_test = y[train_index], y[test_index]

for tech in techniques:
    # Balance the training split with the current sampler (once per sampler)
    # NOTE(review): resampling happens before cross-validation, so resampled copies
    # of a sample can land in both the training and validation parts of a CV fold,
    # which inflates the scores; an imblearn Pipeline would avoid that.
    X_train_resampled, y_train_resampled = tech.fit_resample(X_train, y_train)

    scores = []
    for model in models:
        # cross_val_score fits its own clones, so no separate fit is needed
        cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5)

        # Keep the mean score of this model
        scores.append(cv_scores.mean())

    # One column per sampler, labelled with the sampler's class name
    results[type(tech).__name__] = scores

# One row per model, one column per sampler
data_augmentation_results_sampling = pd.DataFrame(results)

# Map the positional row index to model names
model_names = {0: 'LogisticRegression', 1: 'DecisionTree', 2: 'LinearSVC', 3: 'NB', 4: 'RandomForest'}
data_augmentation_results_sampling.rename(index=model_names, inplace=True)

# Optionally persist the table as CSV
#data_augmentation_results.to_csv('data_augmentation_results.csv')

In [None]:
# Mean cross-validation score per model (rows) and resampling technique (columns)
data_augmentation_results_sampling

## Future work possible implementations

In [None]:
# Define the model architecture
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(tfidf_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(tfidf_train.toarray(), y_train, validation_data=(tfidf_val.toarray(), y_val), epochs=10, batch_size=32)