## Sentiment Analysis with Hugging Face

In this project we are going to use Sentiment analysis using text classification model to determine Covid information  based on the input features from the given dataset.

### Mounting The drive with datasets

In [1]:
!pip3 install -qU transformers[all] datasets accelerate

  error: subprocess-exited-with-error
  
  pip subprocess to install build dependencies did not run successfully.
  exit code: 1
  
  [8 lines of output]
  Ignoring numpy: markers 'python_version < "3.8"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.8"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.9"' don't match your environment
  Collecting cython>=0.29
    Downloading Cython-3.0.5-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
  ERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11
  ERROR: Could not find a version that satisfies the requirement numpy==1.21.3 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1

In [2]:
#Disable  W&B
import os
os.environ['WANDB_DISABLED'] = 'true'

In [3]:
#Account Authentication
#from google.colab import auth
auth.authenticate_user()

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Mounting the datasets/requirements text file from Drive
from google.colab import drive
drive.mount('/content/drive')

# 1.INSTALLATION OF REQUIRED LIBRARIES

In [None]:
#importations
!pip install datasets
!pip install transformers
!pip install transformers[torch] --upgrade
!pip install accelerate -u
!pip3 install huggingface_hub

In [None]:
#Creating a hugging face login to save our models
from huggingface_hub import notebook_login
notebook_login()

In [None]:
#Importation
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

#Data exploration
import matplotlib.pyplot as plt
import seaborn as sns

#Modelling
from datasets import load_dataset
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer


### II. LOADING THE DATASETS FROM DRIVE

In [None]:
# Navigating to the directory where the datasets are stored
%cd /content/drive/My Drive/Azubi-Africa--P5-Natural-Language-Processing-Project-Sentiment-Analysis/data

# Load the train and test data
train = pd.read_csv('Train.csv')
# A way to eliminate rows containing NaN values
train = train[~train.isna().any(axis=1)]
test = pd.read_csv('Test.csv')
# A way to eliminate rows containing NaN values
test = test[~test.isna().any(axis=1)]

## III. EDA ANALYSIS
Here we will be understanding and visualizing our train dataset

In [None]:
#Preview of the datasets
train.head(5)

In [None]:
train.columns

In [None]:
#checking shape
train.shape, test.shape

In [None]:
#Preview of test data
test.head(5)

In [None]:
#preview test data
test.columns

In [None]:
#checking statistical distribution
train.describe()

In [None]:
#checking info
train.info(), test.info()

In [None]:
#Exploring distribution of the dataset
# Selecting numerical and categorical columns
numerical_columns = train.select_dtypes(exclude=['object'])

# Histogram of a numerical column
plt.hist(numerical_columns['label'], bins=30)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Labels')
plt.show()

The labels 0 have higher frequency as shown in the histogram. Most of the tweets response is neutral responses.




In [None]:
#checking correlation analysis
corr_matrix = train.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

There is a positive relationship between labels and agreements of 0.14 which indicates there is no linear relationship between the variables.

In [None]:
#checking for outliers in the agreement and labels
sns.boxplot(data=train, x='label')
plt.title('Box Plot to Detect Outliers in the label')
plt.show()

sns.boxplot(data=train, x='agreement')
plt.title('Box Plot to Detect Outliers in the Agreement')
plt.show()

From the box plots there are not outliers in the datasets

In [None]:
#checking for null values
train.isnull().sum(), test.isnull().sum()

There are no missing values in the train and test datasets.

## 2. SPLITTING THE DATASETS FOR MODELLING
Using traintest split we are going to split the train to train set where the model can learn and evaluation set where we can compute the metric scores

In [None]:
# Split the train data => {train, eval}
train_set, eval = train_test_split(train, test_size=0.2, random_state=42, stratify=train['label'])

In [None]:
#preview
train_set.head(5)

In [None]:
eval.head(5)

In [None]:
#checking shapes of the splitted sets
train_set.shape, eval.shape


In [None]:
# Save splitted subsets
#train.to_csv("/content/drive/My Drive/Azubi-Africa--P5-Natural-Language-Processing-Project-Sentiment-Analysis/data/train_subset.csv", index=False)
#eval.to_csv("/content/drive/My Drive/Azubi-Africa--P5-Natural-Language-Processing-Project-Sentiment-Analysis/data/eval_subset.csv", index=False)

In [None]:
dataset = load_dataset('csv',
                        data_files={'train_set': '/content/drive/My Drive/Azubi-Africa--P5-Natural-Language-Processing-Project-Sentiment-Analysis/data/train_subset.csv',
                        'eval': '/content/drive/My Drive/Azubi-Africa--P5-Natural-Language-Processing-Project-Sentiment-Analysis/data/eval_subset.csv'}, encoding = "ISO-8859-1")


# 3. Modelling - Data Preposessing, Model training and evaluation

### Model 1: Bert-Base-Cased

In [None]:
#transforming the datasets
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def transform_labels(label):

    label = label['label']
    num = 0
    if label == -1: #'Negative'
        num = 0
    elif label == 0: #'Neutral'
        num = 1
    elif label == 1: #'Positive'
        num = 2

    return {'labels': num}

def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length')

# Change the tweets to tokens that the models can exploit
dataset = dataset.map(tokenize_data, batched=True)

# Transform	labels and remove the useless columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

In [None]:
#previewing the dataset
dataset

### Model Training

In [None]:
#Training argurments/parameters
batch_size = 16
# Configure the trianing parameters like `num_train_epochs`:
# the number of time the model will repeat the training loop over the dataset
training_args = TrainingArguments("test_trainer", num_train_epochs=3, load_best_model_at_end=True,push_to_hub=True,
                                  evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps = batch_size,
    save_steps= batch_size,
    save_strategy="steps",
    learning_rate=2e-5,
    weight_decay = 0.01
)

In [None]:
# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

In [None]:
#pushing model to hugging face
#model.push_to_hub()

In [None]:
#pushing model tokenizer


In [None]:
#reshuffling the train and eval sets to introduce randomness
train_dataset = dataset['train_set'].shuffle(seed=10)
eval_dataset = dataset['eval'].shuffle(seed=10)

In [None]:
# Instatiate The Trainer
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)

In [None]:
#trainer.push_to_hub()

In [None]:
#model training
trainer.train()

### Model evaluation

In [None]:
import numpy as np
from datasets import load_metric
#Compute the metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Launch the final evaluation
trainer.evaluate()

Here we are going to push our trainer to hugging face and save model and tokenizer and trainer to our local machine.

In [None]:
# Save the model to your local directory
#model.save_pretrained('C:/Users/Natural-Language-Processing-Project-Sentiment-Analysis/model')
#tokenizer.save_pretrained("C:/Users-Natural-Language-Processing-Project-Sentiment-Analysis/tokenizer")
#trainer.save_pretrained('C:/Users-Natural-Language-Processing-Project-Sentiment-Analysis/trainer')

In [None]:
#pushing trainer to hugging face
trainer.push_to_hub()