<a href="https://colab.research.google.com/github/leovidith/NLP-Sentiment_Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ⭐ Sentiment Analysis using NLP
[Kaggle dataset](https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset)

## 🔴 Set the Version


In [None]:
!pip install tensorflow==2.8.0
!pip install tensorflow-hub==0.12.0

Collecting tensorflow==2.8.0
  Downloading tensorflow-2.8.0-cp310-cp310-manylinux2010_x86_64.whl.metadata (2.9 kB)
Collecting keras-preprocessing>=1.1.1 (from tensorflow==2.8.0)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorboard<2.9,>=2.8 (from tensorflow==2.8.0)
  Downloading tensorboard-2.8.0-py3-none-any.whl.metadata (1.9 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109 (from tensorflow==2.8.0)
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting keras<2.9,>=2.8.0rc0 (from tensorflow==2.8.0)
  Downloading keras-2.8.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard<2.9,>=2.8->tensorflow==2.8.0)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0 (from tensorboard<2.9,>=2.8->tensorflow==2.8.0)
  Downloading tensorboard_data_server-0.6.1-py3-none-ma

## 🔴 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

## 🔴 Import Dataset

In [None]:
# Load the train/test CSVs. The paths are absolute (/content/..., the Colab
# working directory) and the files are decoded as ISO-8859-1 instead of
# pandas' default UTF-8.
train_df = pd.read_csv('/content/train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('/content/test.csv', encoding='ISO-8859-1')

# Preview the first two training rows.
train_df.head(2)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105


In [None]:
# Remove rows that contain NaN values.
# dropna() with its default arguments drops a row when *any* column is
# missing, and inplace=True mutates the frames loaded above.

train_df.dropna(inplace = True)
test_df.dropna(inplace = True)

# ⭐ Data Visualization

## 🔴 Understanding Data

In [None]:
# Class balance: number of training rows per sentiment label.
train_df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,11117
positive,8582
negative,7781


In [None]:
# Encode the sentiment strings as numeric class ids
# (negative -> 0, neutral -> 1, positive -> 2) and store them as float.
# Original author's note: the float cast was meant "to avoid
# SparseCategoricalCrossEntropy"; the labels are cast back to int and
# one-hot encoded further down before training.

train_df['sentiment'] = train_df['sentiment'].replace({'positive':2, 'neutral':1, 'negative':0}).astype(float)

In [None]:
# Spot-check the encoded training labels.
train_df['sentiment'].head()

Unnamed: 0,sentiment
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


In [None]:
# Encode the test-split sentiment strings as numeric class ids
# (negative -> 0, neutral -> 1, positive -> 2), stored as float.
test_df['sentiment'] = (
    test_df['sentiment']
    .replace({'positive':2, 'neutral':1, 'negative':0})
    .astype(float)
)

In [None]:
# Spot-check the encoded test labels.
test_df['sentiment'].head()

Unnamed: 0,sentiment
0,1.0
1,2.0
2,0.0
3,2.0
4,2.0


# ⭐ Preprocessing

## 🔴 Text Preprocessing

In [None]:
import re
def clean(text):
    """Normalize a text string.

    Drops every character that is not an ASCII letter or whitespace,
    lower-cases what remains, and squeezes each run of whitespace into a
    single space with no leading or trailing blanks.

    Args:
        text: The raw string to normalize.

    Returns:
        The cleaned string (possibly empty).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    lowered = letters_only.lower()
    # Squeeze whitespace runs to one space, then trim both ends.
    return re.sub(r'\s+', ' ', lowered).strip()

# Run clean() over every entry of the text column in both splits,
# overwriting the raw text with its normalized form.
train_df['text'] = train_df['text'].apply(clean)
test_df['text'] = test_df['text'].apply(clean)

## 🔴 Setting up train and test datasets

In [None]:
# Model inputs: the cleaned text of each split as a NumPy array of strings.
train_data = train_df['text'].astype(str).to_numpy()
test_data = test_df['text'].astype(str).to_numpy()

# Targets: the numeric sentiment codes cast to integer class ids.
train_labels = train_df['sentiment'].astype(int).to_numpy()
test_labels = test_df['sentiment'].astype(int).to_numpy()

In [None]:
# Sanity check: each split's feature array and label array must be the same
# length. Compare the arrays that are actually fed to the model (train_data),
# mirroring the test-side check, rather than the source DataFrame.
(len(train_data), len(train_labels)), (len(test_data), len(test_labels))

((27480, 27480), (3534, 3534))

## 🔴 One Hot Encoding the Labels

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer class ids into 3-wide vectors, the target format
# paired with the categorical cross-entropy loss used at compile time.
# NOTE: this overwrites train_labels / test_labels, so re-running the cell
# would encode labels that are already one-hot.
train_labels = to_categorical(train_labels, num_classes=3)
test_labels = to_categorical(test_labels, num_classes=3)

## 🔴 Pipelining

In [None]:
def create_dataset(data, labels, batch_size=32, shuffle=True):
    """Build a batched, prefetching ``tf.data.Dataset`` from in-memory arrays.

    Args:
        data: Array-like of input examples, sliced along its first axis.
        labels: Array-like of targets aligned element-wise with ``data``.
        batch_size: Number of examples per batch.
        shuffle: If True (the default), shuffle with a buffer that covers the
            whole dataset. Pass False for evaluation data whose order should
            stay stable.

    Returns:
        The configured ``tf.data.Dataset`` yielding ``(data, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if shuffle:
        # A buffer as large as the dataset gives a uniform shuffle over all
        # examples; max(..., 1) keeps the buffer size valid for empty input.
        dataset = dataset.shuffle(buffer_size=max(len(data), 1))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    # Hand the pipeline back to the caller; without an explicit return the
    # function would yield None.
    return dataset

# Wrap both splits with create_dataset().
# Note: the training cell further down passes the raw arrays
# (train_data / train_labels) to model.fit rather than these objects.
train_dataset = create_dataset(train_data, train_labels)
test_dataset = create_dataset(test_data, test_labels)

# ⭐ Modelling

## 🔴 Load the Model

In [None]:
# Load the Universal Sentence Encoder as a standalone object.
# NOTE(review): use_embed is not referenced again in the visible cells; the
# hub.KerasLayer in the next cell loads the encoder on its own -- confirm
# whether this extra load is still needed.
use_embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2")

In [None]:
# Wrap the Universal Sentence Encoder as a Keras layer that accepts a batch of
# scalar strings (input_shape=[], dtype=tf.string). trainable=False keeps the
# encoder weights fixed, so only layers stacked on top of it are trained.
use_encoder_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
                                  input_shape=[],
                                  dtype=tf.string,
                                  trainable=False)

## 🔴 Model Adjustments

In [None]:
# Classifier head on top of the non-trainable sentence encoder:
# four Dense(relu) + Dropout(0.3) stages (128 -> 64 -> 32 -> 16 units),
# one more 16-unit Dense(relu) layer, then a 3-way softmax output.
head_layers = []
for units in (128, 64, 32, 16):
    head_layers.append(layers.Dense(units, activation='relu'))
    head_layers.append(layers.Dropout(0.3))
head_layers.append(layers.Dense(16, activation='relu'))
head_layers.append(layers.Dense(3, activation='softmax'))

model = tf.keras.Sequential([use_encoder_layer] + head_layers)

## 🔴 Model Compiling

In [None]:
# One-hot targets + softmax output -> categorical cross-entropy.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
              metrics=['accuracy'])

# Stop when validation accuracy has not improved for 2 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2, restore_best_weights=True)

# Validate on a slice held out of the training data instead of the test set.
# Early stopping picks the final weights from the validation score, so
# validating on test_data would leak the test set into model selection and
# inflate the metrics reported on it afterwards.
# validation_split=0.1 reserves the last 10% of the training arrays
# (taken before shuffling) for validation.
history = model.fit(train_data,
                    train_labels,
                    validation_split=0.1,
                    epochs=10,
                    batch_size=32,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


# ⭐ Aftermath

## Making Predictions

In [None]:
# Predict per-class probabilities for the test set, then collapse each row to
# the index of its most likely class.
pred_probs = model.predict(test_data)
predictions = np.argmax(pred_probs, axis=1)

# Convert the one-hot test labels to class indices for scoring. The ndim guard
# makes this cell safe to re-run: once test_labels is already 1-D, a second
# argmax over axis=1 would raise.
if test_labels.ndim == 2:
    test_labels = np.argmax(test_labels, axis=1)
predictions

array([1, 2, 0, ..., 0, 2, 2])

## Calculating Accuracy

In [None]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
def cal_result(y_true,y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: 1-D array-like of ground-truth class labels.
        y_pred: 1-D array-like of predicted class labels.

    Returns:
        dict with keys "accuracy" (scaled by 100, i.e. a percentage) and
        "precision", "recall", "f1" (computed with average="weighted" and
        left unscaled).
    """
    mod_acc = accuracy_score(y_true, y_pred) * 100
    # precision_recall_fscore_support returns (precision, recall, f1, support).
    mod_prec, mod_rec, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {"accuracy": mod_acc,
                     # Report the precision value here, not the recall value.
                     "precision": mod_prec,
                     "recall": mod_rec,
                     "f1": model_f1
                     }

    return model_results

In [None]:
# Score the test-set predictions. test_labels must already hold class indices
# here (it is converted from one-hot in the prediction cell above).
results = cal_result(test_labels, predictions)
results

{'accuracy': 70.71307300509338,
 'precision': 0.7071307300509337,
 'recall': 0.7071307300509337,
 'f1': 0.7071063231645413}

## Making Predictions on User Defined Sentences

In [None]:
# Map a predicted class index to its display label.
sentiment_labels = {0: 'Negative 🤬', 1: 'Neutral 😐', 2: 'Positive 😊'}

# Read one sentence, normalize it with the same clean() used on the dataset,
# and wrap it in a length-1 array because the model predicts on batches.
input_sentence = np.array([clean(input("Enter your sentence: "))])

# Pick the most probable class for the single input row.
preds = model.predict(input_sentence)
index = preds.argmax(axis=1)[0]

sentiment = sentiment_labels[index]
print(f"Emotion: {sentiment}")

Enter your sentence: i feel anxious
Emotion: Negative 🤬
