### <p style="text-align:left"><span style="color:green">Importing Required `Libraries`</span></p>

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Importing necessary libraries and functions :
import pandas as pd
import numpy as np
from math import sqrt
import time

# Text processing libraries :
import gensim 
import re # Regular Expression library
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize # Tokenizaion 
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import contractions

# Plotting libraries :
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# sklearn : 
import sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import roc_curve, auc

ModuleNotFoundError: No module named 'contractions'

In [None]:
!pip install contractions

In [None]:
# Data manipulation and analysis libraries:
import numpy as np
import pandas as pd

# Data visualization libraries:
import matplotlib.pyplot as plt
import seaborn as sns

# Other libraries:
#from tqdm import tqdm
import warnings
import os

# Data science imports:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
)
from sklearn.model_selection import KFold
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
    median_absolute_error
)

# Configure pandas to display all columns (wide frames are not truncated):
pd.set_option('display.max_columns', None)

# Render figures directly in the notebook:
%matplotlib inline

# Render higher resolution images:
%config InlineBackend.figure_format = 'retina'

# Ignore warning messages:
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings from pandas — consider narrowing the filter.
warnings.filterwarnings('ignore')

### <p style="text-align:left"><span style="color:green">Reading the `CSV File`</span></p>

In [None]:
# Load the raw tweets CSV (path is relative to the notebook's working directory);
# the file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
tweets_df = pd.read_csv('../data/tweets dataset.csv', encoding = "ISO-8859-1")

In [None]:
# Preview the first rows of the raw data
tweets_df.head()

In [None]:
# (number of rows, number of columns) of the raw data
tweets_df.shape

The dataset contains 9093 records and consists of three columns:

* **tweet_text :** The tweet's content
* **emotion_in_tweet_is_directed_at :** The brand mentioned in the tweet
* **is_there_an_emotion_directed_at_a_brand_or_product :** The emotion/neutrality expressed in the tweet   


### <p style="text-align:left"><span style="color:green">Exploring the dataset</span></p>

Let us first check whether our data is clean and whether there are any missing values.

In [None]:
# Count the missing (NaN) entries in each column of the raw data
print("The data columns contain the following missing values: ")
tweets_df.isna().sum()

As we can see, there is one missing tweet; we will delete it later since it is not relevant in our case. 
There are also 5802 missing values out of 9093 in the *emotion_in_tweet_is_directed_at* column. We will implement an approach to deal with them later, since they represent a large share of our data, which could bring us significant insights and help us in the analysis and visualization part.

In order to analyze the tweets' structure and find out more about its relationship with the column *emotion_in_tweet_is_directed_at*, we can display some samples in full:

In [None]:
# Print a random sample of 200 tweets (first column) together with the brand
# they were labelled with (second column). The seed is fixed so that the
# observations written below this cell refer to a reproducible sample.
random_df = tweets_df.sample(n=200, random_state=42)

# Select the two columns by position with `.iloc[:, k]`; the former
# `random_df.iloc[i][0]` relied on positional integer look-ups on a
# label-indexed row, which pandas has deprecated.
for tweet_text, brand in zip(random_df.iloc[:, 0], random_df.iloc[:, 1]):
    print(tweet_text)
    print(brand)
    print('----------')


By observing and analyzing the above result, we can make some preliminary hypotheses:
* The tweets don't follow a standard structure: they sometimes start with a mention, other times with a hashtag, and sometimes directly with plain words.
* The brands mentioned in the tweets aren't always easily identifiable, especially when they are not preceded by a hashtag or a mention, which makes the task of filling in the missing values of the brands column harder.
* The tweets contain some insignificant words/characters, which are not useful and will be deleted.

Let's take a look at our target column which is *is_there_an_emotion_directed_at_a_brand_or_product*, and explore the different classes:

In [None]:
# Number of tweets in each class of the target column
tweets_df['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

It seems that we have 4 classes that rate the emotion expressed in the tweets. However, these classes are not well balanced, especially the "I can't tell" one: there are very few of these values in comparison with the rest of the classes, and since it won't be useful for our purpose, we will delete it later.

### <p style="text-align:left"><span style="color:green">Cleaning and preprocessing</span></p>

In [None]:
# Shorten the verbose source column names to `tweets`, `brands` and `emotions`
# (modifies `tweets_df` in place).
tweets_df.rename(columns={'tweet_text': 'tweets', 'emotion_in_tweet_is_directed_at': 'brands', 'is_there_an_emotion_directed_at_a_brand_or_product': 'emotions'}, inplace=True)

In [None]:
# Index labels of the rows whose emotion label is "I can't tell"
drop_indexes = tweets_df[(tweets_df['emotions'] == "I can't tell")].index
# Remove those rows, then remove rows that have no tweet text (both in place)
tweets_df.drop(drop_indexes, inplace = True)
tweets_df.dropna(subset = ['tweets'], inplace=True)

In [None]:
# Re-count the missing values per column after the rows above were dropped
tweets_df.isna().sum()

In [None]:
# Number of tweets per remaining emotion class
tweets_df['emotions'].value_counts()

In [None]:
# Same counts expressed as fractions of the total (they sum to 1)
tweets_df['emotions'].value_counts(normalize = True)

In [None]:
# Single stacked bar showing the share of each emotion class among the tweets.
plt.figure(figsize=(3,5))
# Share of each emotion class as a fraction of all tweets (most frequent first)
percentages = tweets_df['emotions'].value_counts(normalize = True)

# Stack one segment per class. Iterating over (label, value) pairs ties every
# legend entry to its own class instead of relying on an assumed class order,
# and avoids positional `percentages[0]` look-ups on a label-indexed Series,
# which pandas has deprecated. Values are scaled to 0-100 so that they match
# the "Percentage" axis label.
bottom = 0.0
for emotion, share in percentages.items():
    plt.bar(0, share * 100, bottom=bottom, label=emotion)
    bottom += share * 100

# Place the legend just outside the right edge of the axes so that the class
# names do not cover the bar.
plt.legend(bbox_to_anchor=(1.05, 0.5), loc="center left")

# Set axis labels and title
plt.xlabel('Emotions')
plt.ylabel('Percentage')
plt.title('Percentage of Emotion Classes')

# Display the plot
plt.show()

In [None]:
# Fraction of tweets that falls into each emotion class
percentages = tweets_df['emotions'].value_counts(normalize=True)

# Switch to Seaborn's "whitegrid" theme before the figure is created
sns.set(style="whitegrid")

# Draw the pie chart on the axes of a new 8x6 figure
plt.figure(figsize=(8, 6))
plt.gca().pie(
    percentages,
    labels=percentages.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette("deep"),
)

# Title, then render
plt.gca().set_title('Percentage of Emotion Classes')
plt.show()

In [None]:
# Number of tweets labelled with each brand/product
tweets_df['brands'].value_counts()

In [None]:
# Count the tweets per brand within each emotion class; the result is a Series
# indexed first by emotion and then by brand.
grouped_by_emotions = tweets_df.groupby(['emotions'])['brands'].value_counts()
grouped_by_emotions

In [None]:
# Tweet counts per brand for the 'Positive emotion' class, sorted ascending.
# The title makes this plot distinguishable from the otherwise identical
# 'Negative emotion' plot.
grouped_by_emotions['Positive emotion'].sort_values().plot(
    kind='barh',
    stacked=True,
    title='Number of positive-emotion tweets per brand',
)

In [None]:
# Tweet counts per brand for the 'Negative emotion' class, sorted ascending.
# The title makes this plot distinguishable from the otherwise identical
# 'Positive emotion' plot.
grouped_by_emotions['Negative emotion'].sort_values().plot(
    kind='barh',
    stacked=True,
    title='Number of negative-emotion tweets per brand',
)

In [None]:
# Tweet counts for every (brand, emotion) pair, pivoted so that brands are
# rows and emotions are columns
grouped = tweets_df.groupby(['brands', 'emotions']).size().unstack()

# Number of tweets per brand (row totals)
total_counts = grouped.sum(axis='columns')

# Convert each row to percentages of its own total
percentage_data = grouped.div(total_counts, axis='index').mul(100)

# Brands with the highest share of positive emotion come first
sorted_percentage_data = percentage_data.sort_values(by='Positive emotion', ascending=False)

# Stacked bar chart: one bar per brand, one segment per emotion
ax = sorted_percentage_data.plot.bar(stacked=True, figsize=(12, 6))
ax.set(
    xlabel='Brand',
    ylabel='Percentage',
    title='Percentage of Positive Emotions for Each Brand (Sorted)',
)
# Tilt the brand names so that they stay readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Legend outside the plotting area, anchored at its upper-left corner
ax.legend(title='Emotions', bbox_to_anchor=(1.05, 1), loc='upper left')

ax.figure.tight_layout()
plt.show()

### 5️⃣  `Data Preprocessing ` :

> Our data generally comes from a variety of different sources and is often in a variety of different formats. For this reason, cleaning our raw data is an essential part of preparing our dataset. However, cleaning is not a simple process, as textual data often contains redundant and/or repetitive words.

> Before training the model, we will perform various pre-processing steps on the dataset such as: 
>- Converting the text document to lowercase for better generalization.
>- Removing stop words.
>- Removing emojis. 
>- Removing mentions & hashtags.
>- Removal of numbers.
>- Removal of whitespaces.
>- Cleaning the punctuation (to reduce unnecessary noise from the dataset).
>- Removing the repeating characters from the words along with removing the URLs/hyperlinks as they do not have any significant importance. <br>                          
and much more, we will see this in detail later...

> We will then perform:
>- **`Stemming`** : reducing the words to their derived stems.
>- **`Lemmatization`** : reducing the derived words to their root form known as lemma for better results.

> - **`Lowering Case`**:

Lowercasing is very important since it makes words that differ only in letter case identical. This is very useful for reducing the size of our vocabulary.

> - **`Removal of Mentions`**:

In social media, mentions are used to call/mention another user in a post. Generally, mentions don't add value to our model, so we will remove them.

A mention has the pattern **@UserName** and a hashtag has the pattern **#Tag**, so we will remove every string that starts with `@` or `#`.

 > - **`Removal of Special Characters`**:
 
Special characters are everywhere, since our tweets contain punctuation marks. In order to treat, for example, **hello!** and **hello** in the same way, we have to remove the punctuation mark **!**.

In [None]:
import re

def RemoveMentions(text):
    """Strip Twitter mentions and hashtags from ``text``.

    Every ``@`` or ``#`` character is deleted together with the run of
    non-whitespace characters that follows it. The surrounding whitespace is
    left untouched, so gaps may remain where something was removed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without its ``@...`` and ``#...`` parts.
    """
    text_ = re.sub(r"@\S+", "", text)
    # The second pass must run on the result of the first one; running it on
    # the original `text` would silently discard the mention removal.
    text_ = re.sub(r"#\S+", "", text_)
    return text_


# Punctuation characters that will be stripped from the tweets:
punctuations_list = string.punctuation

# Function that will be applied to our dataset:
def RemovePunctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` removed."""
    # Map each punctuation character to None so that `translate` deletes it;
    # the table is rebuilt on every call from the current `punctuations_list`.
    deletion_table = str.maketrans(dict.fromkeys(punctuations_list))
    return text.translate(deletion_table)

# English stop words from NLTK's `stopwords` corpus; `process` uses this list
# to filter the words of each tweet.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm.
stopWords = stopwords.words('english')


def RemoveLinks(text):
    """Delete every ``http`` occurrence that is followed by at least one
    non-whitespace character, together with that run of characters."""
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)

def RemoveNumbers(text):
    """Return ``text`` with all ASCII digits (0-9) removed."""
    # Cut the text at every run of digits and glue the remaining pieces together.
    pieces = re.split(r"[0-9]+", text)
    return "".join(pieces)

def RemoveWhitespaces(text):
    """Trim leading/trailing whitespace and collapse runs of spaces.

    Only runs of the space character are collapsed; tabs and newlines inside
    the text are left as they are.
    """
    trimmed = text.strip()
    space_runs = re.compile(r" +")
    return space_runs.sub(" ", trimmed)

def process(data):
    """Clean a pandas Series of tweets.

    The following steps are applied to every element, in this order:
    lowercase, remove mentions/hashtags (`RemoveMentions`), remove punctuation
    (`RemovePunctuations`), drop the words listed in `stopWords`, remove links
    (`RemoveLinks`), remove digits (`RemoveNumbers`) and normalise spaces
    (`RemoveWhitespaces`).

    Parameters
    ----------
    data : pandas.Series
        Series of strings (it is accessed through ``.str`` and ``.apply``).

    Returns
    -------
    pandas.Series
        A new Series holding the cleaned text; ``data`` is not modified.
    """
    # Build the lookup once per call: membership tests on a set are O(1),
    # whereas `stopWords` is a list that would be scanned for every word.
    stop_words = set(stopWords)

    def drop_stop_words(text):
        # Keep a word only if it is not a stop word. The previous condition
        # `(word not in stopWords) | (len(word)<=3)` let every stop word of
        # three characters or fewer ("the", "is", "to", ...) through.
        return ' '.join(word for word in text.split() if word not in stop_words)

    data = data.str.lower()
    data = data.apply(RemoveMentions)
    data = data.apply(RemovePunctuations)
    data = data.apply(drop_stop_words)
    data = data.apply(RemoveLinks)
    data = data.apply(RemoveNumbers)
    data = data.apply(RemoveWhitespaces)
    return data

In [None]:
# Store the cleaned version of every tweet (output of `process`) in a new column
tweets_df['tweets_processed'] = process(tweets_df['tweets'])

In [None]:
# Display the frame, which now includes the `tweets_processed` column
tweets_df