In [1]:
import numpy as np
import pandas as pd
import regex as re
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
import plotly.express as px
import plotly.graph_objects as go
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OrdinalEncoder

In [2]:
df = pd.read_csv("twitter_training.csv")

In [3]:
# Label the columns
df.columns = ["tweet_id","place", "sentiments", "tweets"]
df = df.drop("place", axis=1)
df

Unnamed: 0,tweet_id,sentiments,tweets
0,2401,Positive,I am coming to the borders and I will kill you...
1,2401,Positive,im getting on borderlands and i will kill you ...
2,2401,Positive,im coming on borderlands and i will murder you...
3,2401,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Positive,im getting into borderlands and i can murder y...
...,...,...,...
74676,9200,Positive,Just realized that the Windows partition of my...
74677,9200,Positive,Just realized that my Mac window partition is ...
74678,9200,Positive,Just realized the windows partition of my Mac ...
74679,9200,Positive,Just realized between the windows partition of...


In [4]:
# df_twt = [df["tweets"]]
# df_twt


# Data Cleaning and Preprocessing

In [5]:
# Identify outliers
outliers = df.describe().loc["75%"] + 1.5 * (df.describe().loc["75%"] - df.describe().loc["25%"])
outliers

tweet_id    19210.0
dtype: float64

In [6]:
# Identify inconsistencies
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,tweet_id,sentiments,tweets
19,2404,Positive,that was the first borderlands session in a lo...
25,2405,Negative,The biggest disappointment of my life came a y...
50,2409,Neutral,Blaming Sight for Tardiness! A little bit of b...
63,2411,Neutral,.. [
145,2425,Negative,"""What a bitch!"""
...,...,...,...
74504,9171,Neutral,This benchmarking comparison between Oculus Qu...
74509,9172,Positive,@ NVIDIAGeForce @ nvidia
74588,9185,Neutral,Heard people are having issues with ordering t...
74618,9190,Positive,This news about the Nvidia 3000 series is ligi...


In [7]:
# Identify missing values
missing_values = df.isna().sum()
missing_values

tweet_id        0
sentiments      0
tweets        686
dtype: int64

In [8]:
# Remove the outlier value
df = df[df["tweet_id"] != 19210.0]

# removing duplicates
df.drop_duplicates(inplace = True)

# Remove the duplicate rows
df = df.drop_duplicates()

#This will remove any character in the ‘tweets’ column that is not a letter (upper or lower case), a number, a space, or a punctuation character (.,;?!$%^&*()-_+=[]{}'|"<>`~) with an empty string
df['tweets'] = df['tweets'].astype(str)
df['tweets'] = df['tweets'].str.replace('[^a-zA-Z0-9\s.,;?!$%^&*()-_+=[]{}\'"|<>`~]', '', regex=True)


#fill 0 in mum
df['tweets'] = df['tweets'].fillna(0)

# Save the dataframe
df.to_csv("tweet_clean.csv", index=False)


In [9]:
#check for duplicates
df.duplicated().sum()

0

In [10]:
#check null
df['tweets'].isnull().sum()

0

In [11]:
special_chars = df['tweets'].str.contains('[^a-zA-Z0-9\s.,;?!$%^&*()-_+=[]{}\'"|<>`~]')
print(any(special_chars))


False


In [12]:
# the number of missing values and the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71981 entries, 0 to 74680
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id    71981 non-null  int64 
 1   sentiments  71981 non-null  object
 2   tweets      71981 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


In [None]:
# df_train_POS = df[df["Positive"] == "Positive"]
# df_train_NEG = df_train[df_train["Positive"] == "Negative"]
# df_train_NEU = df_train[df_train["Positive"] == "Neutral"]

In [None]:
# df_train_POS = df_train_POS.sample(15000)
# df_train_NEG = df_train_NEG.sample(15000)
# df_train_NEU = df_train_NEU.sample(15000)

In [None]:
# df_train_POS.shape, df_train_NEG.shape, df_train_NEU.shape

# Exploratory data analysis (EDA)

Analysing different models

In [14]:
!pip install transformers



In [15]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [16]:
# Define the number of unique classes in your labels
# num_labels = len(np.unique(labels))

# Instantiate the BERT model and tokenizer
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:

# Instantiate the precision and recall metrics
precision = Precision()
recall = Recall()

# Prepare the inputs for the model
input_ids, attention_masks, encoded_labels = [], [], []  # Create a new list for your encoded labels

# Define labels as a list of sentiment labels from your DataFrame
labels = df['sentiments'].values.tolist()

# Prepare the inputs for the model
# input_ids, attention_masks, labels = [], [], []


In [23]:
for tweet, label in zip(df['tweets'], df['sentiments']):
    inputs = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=128, pad_to_max_length=True,
                                   return_attention_mask=True, return_token_type_ids=True)
    input_ids.append(inputs['input_ids'])
    attention_masks.append(inputs['attention_mask'])
    labels.append(label)

AttributeError: ignored

In [23]:
# Convert lists to tensors
input_ids = tf.convert_to_tensor(input_ids)
attention_masks = tf.convert_to_tensor(attention_masks)

# Convert string labels to integers
le = LabelEncoder()
labels = le.fit_transform(labels)

# Convert integer labels to one-hot encoded format
labels = to_categorical(labels)

In [None]:

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy'), precision, recall])

# Train the model
history = model.fit([input_ids, attention_masks], labels, batch_size=100, epochs=2)


Epoch 1/2


In [None]:
# Print the history
print(history.history)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2
   3/2250 [..............................] - ETA: 31:45:33 - loss: 1.3801 - accuracy: 0.3333 - precision_5: 0.5625 - recall_5: 0.0938

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Define labels as a list of sentiment labels from your DataFrame
labels = df['sentiments'].values.tolist()

# Define the number of unique classes in your labels
num_labels = len(np.unique(labels))

# Instantiate the BERT model and tokenizer
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Instantiate the precision and recall metrics
precision = Precision()
recall = Recall()

# Prepare the inputs for the model
input_ids, attention_masks, encoded_labels = [], [], []  # Create a new list for your encoded labels

for tweet, label in zip(df['tweets'], labels):
    inputs = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=128, pad_to_max_length=True,
                                   return_attention_mask=True, return_token_type_ids=True)
    input_ids.append(inputs['input_ids'])
    attention_masks.append(inputs['attention_mask'])
    encoded_labels.append(label)  # Append to encoded_labels instead of labels

# Convert lists to tensors
input_ids = tf.convert_to_tensor(input_ids)
attention_masks = tf.convert_to_tensor(attention_masks)

# Convert string labels to integers
le = LabelEncoder()
encoded_labels = le.fit_transform(encoded_labels)  # Use encoded_labels here

# Convert integer labels to one-hot encoded format
encoded_labels = to_categorical(encoded_labels)  # And here

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy'), precision, recall])


#Calculate batch size
batch_size = len(labels) // 25


# Train the model
history = model.fit([input_ids, attention_masks], encoded_labels, batch_size=batch_size, epochs=2)

# # Train the model
# history = model.fit([input_ids, attention_masks], encoded_labels, batch_size=32, epochs=2)  # And here

# Print the history
print(history.history)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2


##BARD2

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall, SparseCategoricalAccuracy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Define the number of unique classes in your labels
num_labels = len(np.unique(labels))

# Instantiate the BERT model and tokenizer
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Instantiate the precision and recall metrics
precision = Precision()
recall = Recall()

# Prepare the inputs for the model
input_ids, attention_masks, labels = [], [], []

for tweet, label in zip(df['tweets'], df['sentiments']):
    inputs = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=128, pad_to_max_length=True,
                                   return_attention_mask=True, return_token_type_ids=True)
    input_ids.append(inputs['input_ids'])
    attention_masks.append(inputs['attention_mask'])
    labels.append(label)

# Convert lists to tensors
input_ids = tf.convert_to_tensor(input_ids)
attention_masks = tf.convert_to_tensor(attention_masks)

# Convert string labels to integers
le = LabelEncoder()
labels = le.fit_transform(labels)

# Convert integer labels to one-hot encoded format
# (not needed with sparse_categorical_accuracy)
# labels = to_categorical(labels)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[SparseCategoricalAccuracy('accuracy'), precision, recall])

# Train the model
history = model.fit([input_ids, attention_masks], labels, batch_size=32, epochs=2)

# Print the history
print(history.history)
