In [None]:
# Importing Libraries
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import string
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt 
warnings.filterwarnings('ignore')

#Renders images as PNG
%matplotlib inline 



In [None]:
# Load the tweets CSV into a DataFrame.
# Dataset source: https://www.kaggle.com/datasets/kazanova/sentiment140/code
# The column names are supplied explicitly through `names=`.
DATASET_ENCODING = 'ISO-8859-1'
DATASET_COLUMNS = ['target', 'ids', 'date', 'query', 'user', 'text']
df = pd.read_csv(
    'Twitter.tweets.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info() to check
# how many rows and columns were loaded.
df.info()

Preprocessing

Here I will be cleaning the data set.

In [None]:
# Remove duplicate rows and then store the result in a new dataframe.
#df_nd=df.drop_duplicates(subset=['text'],keep='last').copy()
#print(len(df_nd))
#print(df_nd.head())
#print(df_nd.tail())
#df_nd.to_csv('df_noduplicates.csv',header=False)

In [None]:
# According to the dataset description, positive sentiment is labelled 4.
# Map that label to 1 so the target column holds 0 / 1.

df['target'] = df['target'].replace({4: 1})

In [None]:
# Count the tweets per sentiment label, print the counts and draw them as a bar chart.
sentiment = {0:"Negative", 1:"Positive"}
sentiment_counts = df.target.apply(lambda label: sentiment[label]).value_counts()
print(sentiment_counts)
sentiment_counts.plot(kind = 'bar')
plt.show()

In [None]:
# Shrink the dataset to one hundred thousand rows: the first 50,000 positive
# tweets followed by the first 50,000 negative tweets.
# A smaller dataset also shortens the time the model takes to run.
pos_sentiment = df[df['target'] == 1]
neg_sentiment = df[df['target'] == 0]

df_pos = pos_sentiment.head(50000)
df_neg = neg_sentiment.head(50000)

df = pd.concat([df_pos, df_neg])

print(len(df))
df.head(10)

In [None]:
# Only the sentiment of the tweet is analysed, so just the target and text columns are kept.
# .copy() makes new_df an independent DataFrame: the column assignments made on
# new_df afterwards then write to a real frame instead of a slice of df, which
# would otherwise raise pandas' SettingWithCopyWarning.
new_df = df[['target', 'text']].copy()
new_df.head(10)

Here I am importing the dependencies required for analysis. The following dependencies will be imported.
# Natural Language Toolkit (NLTK) - This will help with natural language understanding. It also provides the resources necessary for tokenization and lemmatizing.


In [None]:
# Download NLTK's stop-word corpus and build the English stop-word set.
# The set is later used to drop these words from the text column.
import nltk
nltk.download('stopwords')  # fetch the 'stopwords' resource read on the lines below
from nltk.corpus import stopwords
# Stored as a set; remove_stopwords() checks each word for membership in it.
# NOTE(review): the English list presumably contains negations such as "not" —
# confirm whether dropping them is acceptable for sentiment analysis.
stop_words = set(stopwords.words('english'))
print(stop_words)

In [None]:
# I will use this cell to remove words in the stopwords dictionary.
#stop_words.remove('')

In [None]:
# Convert the tweet text to lower case.
# This makes it easier to remove stopwords (presumably the stop-word list is
# lower case — TODO confirm), because the membership test is case-sensitive.

new_df['text']=new_df['text'].str.lower()
new_df['text'].head(10)

In [None]:
# Download the NLTK resources 'punkt', 'wordnet', 'omw-1.4' and
# 'averaged_perceptron_tagger' (nothing is imported in this cell).
# NOTE(review): presumably wordnet / omw-1.4 back the WordNet lemmatizer and the
# perceptron tagger backs nltk.pos_tag — confirm against the NLTK documentation.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

In [None]:
# convert stringified list to list
def strg_list_to_list(strg_list):
  """Turn a stringified list such as "['a', 'b']" back into a list of words.

  Leading/trailing square brackets are stripped, every single quote, double
  quote and comma is deleted, and the remainder is split on whitespace.
  """
  bare = strg_list.strip("[]")
  for unwanted in ("'", '"', ","):
    bare = bare.replace(unwanted, "")
  return bare.split()

Preprocessing Task
Here I will be removing the following from the dataframe
1. Retweet (RT @user:) label
2. VIDEO: label
3. Hyperlink
4. Twitter Handle
5. Escape Sequence
6. Extra Spaces
7. Short words of 3 characters or fewer

In [None]:
# Remove short words: only whitespace-separated words longer than 3 characters
# are kept, i.e. every word of 3 characters or fewer is dropped.

new_df['text'] = new_df['text'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
new_df.head(10)

In [None]:
# The pre-tokenization task functions will receive strings as input parametres
#and return strings as output

import re
import contractions

def remove_retweet_label(text):
  """Strip retweet labels of the form ``RT @user:`` from ``text``.

  The match is case-insensitive because the notebook lower-cases the tweet
  text before the cleaning functions run, so the label may arrive as
  ``rt @user:``. The pattern is a raw string so the word-character class is
  not read as an (invalid) string escape.

  Args:
    text: tweet text as a string.

  Returns:
    The text with every retweet label removed.
  """
  return re.sub(r'RT @\w+:', '', text, flags=re.IGNORECASE)

def remove_video_label(text):
  """Strip every ``VIDEO:`` label from ``text``.

  The match is case-insensitive because the notebook lower-cases the tweet
  text before the cleaning functions run, so the label may arrive as
  ``video:``.

  Args:
    text: tweet text as a string.

  Returns:
    The text with every video label removed.
  """
  return re.sub(r'VIDEO:', '', text, flags=re.IGNORECASE)

def remove_hyperlink(text):
  """Delete every link from ``text``.

  A link is the literal ``http`` followed by one or more non-whitespace
  characters.
  """
  link_pattern = re.compile(r'http\S+')  # raw string; \S = any non-whitespace character
  return link_pattern.sub('', text)

def remove_twitterhandle(text):
  """Delete Twitter handles (``@name``, optionally followed by a colon) from ``text``."""
  handle_pattern = re.compile('@[A-Za-z0-9_]+(:)?')
  return handle_pattern.sub('', text)

def remove_escape_sequence(text):
  """Replace line breaks in ``text`` with a single space.

  Each run of newline / carriage-return characters becomes one space rather
  than being deleted, so the words on either side of a line break are not
  glued together into one token.

  Args:
    text: tweet text as a string.

  Returns:
    The text with line breaks turned into spaces.
  """
  return re.sub(r'[\r\n]+', ' ', text)

def remove_extra_spaces(text):
  """Collapse every run of whitespace in ``text`` into a single space."""
  whitespace_run = re.compile(r"\s+")
  return whitespace_run.sub(" ", text)

def remove_contraction(text):
  """Pass every whitespace-separated word of ``text`` through ``contractions.fix``.

  The fixed words are joined back together with single spaces.
  """
  fixed_words = []
  for piece in text.split():
    fixed_words.append(contractions.fix(piece))
  return ' '.join(fixed_words)
  
def remove_stopwords(text):
  """Drop every whitespace-separated word of ``text`` that is in ``stop_words``.

  The surviving words are joined back together with single spaces.
  """
  kept = []
  for word in text.split():
    if word in stop_words:
      continue
    kept.append(word)
  return " ".join(kept)

def pretokenization_cleaning(text):
  """Run all pre-tokenization cleaning steps on ``text`` and return the result.

  The steps are applied in this fixed order: retweet label, video label,
  hyperlink, Twitter handle, escape sequence, extra spaces, contractions,
  stop words. Each step receives the string returned by the previous one.
  """
  cleaning_steps = (
    remove_retweet_label,
    remove_video_label,
    remove_hyperlink,
    remove_twitterhandle,
    remove_escape_sequence,
    remove_extra_spaces,
    remove_contraction,
    remove_stopwords,
  )
  for step in cleaning_steps:
    text = step(text)
  return text

Pre-Tokenization task

In [None]:
# defining tokenizing function
from nltk.tokenize import TweetTokenizer
def tokenize(text):
  """Split ``text`` into a list of tokens using a ``TweetTokenizer`` created with ``reduce_len=True``."""
  return TweetTokenizer(reduce_len=True).tokenize(text)

In [None]:
# defining Normalizing task using Stemmer
import nltk
def stemming(unkn_input):
  """Stem every word of the input with NLTK's Porter stemmer.

  Args:
    unkn_input: either a list of tokens or a stringified list
      (e.g. "['good', 'day']"), which is parsed with strg_list_to_list.

  Returns:
    A single string holding the stemmed words separated by spaces.

  Raises:
    TypeError: if ``unkn_input`` is neither a list nor a str. Previously this
      case failed later with an unbound local variable.
  """
  # Validate the input first so an unsupported type fails with a clear message.
  if isinstance(unkn_input, list):
    list_input = unkn_input
  elif isinstance(unkn_input, str):
    list_input = strg_list_to_list(unkn_input)
  else:
    raise TypeError(
        "stemming() expects a list or a str, got %s" % type(unkn_input).__name__)
  porter = nltk.PorterStemmer()
  return " ".join(porter.stem(word) for word in list_input)

In [None]:
#defining normalizing task using Lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# One WordNetLemmatizer instance, created once and reused by lemmatize().
lemmatizer = WordNetLemmatizer()

def nltk_pos_tagger(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields None.
    """
    initial = nltk_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return None

#lemmatize requires list input
def lemmatize(unkn_input):
    """Lemmatize every word of the input using its part-of-speech tag.

    The words are lower-cased, tagged with nltk.pos_tag, and each tag is
    converted with nltk_pos_tagger. Words whose tag has no WordNet
    counterpart are kept unchanged; the rest go through the shared
    ``lemmatizer``.

    Args:
        unkn_input: either a list of tokens or a stringified list
            (e.g. "['good', 'day']"), which is parsed with strg_list_to_list.

    Returns:
        A list of lemmatized words, in the input order.

    Raises:
        TypeError: if ``unkn_input`` is neither a list nor a str. Previously
            this case failed later with an unbound local variable.
    """
    # Validate the input first so an unsupported type fails with a clear message.
    if isinstance(unkn_input, list):
        list_input = unkn_input
    elif isinstance(unkn_input, str):
        list_input = strg_list_to_list(unkn_input)
    else:
        raise TypeError(
            "lemmatize() expects a list or a str, got %s" % type(unkn_input).__name__)

    list_sentence = [item.lower() for item in list_input]

    lemmatized_sentence = []
    for word, nltk_tag in nltk.pos_tag(list_sentence):
        tag = nltk_pos_tagger(nltk_tag)
        if tag is None:
            # No WordNet part of speech for this tag: keep the word as it is.
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    # The discarded " ".join(...) that used to run on every loop iteration is
    # gone: its result was never used and it made the loop quadratic.
    return lemmatized_sentence

Post Tokenization Task


In [None]:
#the following post-tokenization receives list as input parameter
#and returns list as output

def remove_punc(list_token):
  """Blank out tokens that are numbers, single punctuation marks or dot runs.

  Each token is mapped to an output token of the same position:
    * an empty token stays empty;
    * a token whose first character is an ASCII digit (which includes every
      all-digit token) becomes '';
    * a single character found in the punctuation set becomes '';
    * a token starting with '#' (hashtag) is lower-cased;
    * '..' and '...' become '';
    * anything else is returned unchanged, since punctuation can be part of
      a word (e.g. key-in).

  Args:
    list_token: list of string tokens.

  Returns:
    A list of the same length; removed tokens are represented by ''.
  """
  strg_numb = '0123456789'
  # The backslash is written as an explicit escape; the previous literal relied
  # on the invalid escape sequence "\," which newer Python versions warn about.
  strg_punc = '''!()+-[]{}|;:'"\\,<>./?@#$£%^&*_~“”…‘’'''
  dot_runs = ('..', '...')

  def process(strg_token):
    if not strg_token:  # empty token
      return ''
    if strg_token[0] in strg_numb:  # number, or token starting with a digit
      return ''
    if len(strg_token) == 1 and strg_token in strg_punc:  # single punctuation
      return ''
    if strg_token[0] == '#':  # hashtag
      return strg_token.lower()
    if strg_token in dot_runs:
      return ''
    return strg_token

  return [process(token) for token in list_token]


def remove_empty_item(list_item):
  """Return a new list holding only the items of ``list_item`` whose length is greater than zero."""
  kept = []
  for entry in list_item:
    if len(entry) > 0:
      kept.append(entry)
  return kept

def lowercase_alpha(list_token):
  """Lower-case the tokens that are purely alphabetic or that start with '#'.

  All other tokens are returned unchanged. ``startswith`` is used for the
  hashtag test so an empty token is passed through instead of raising
  IndexError.

  Args:
    list_token: list of string tokens.

  Returns:
    A new list of the same length.
  """
  return [token.lower() if (token.isalpha() or token.startswith('#')) else token
          for token in list_token]

def posttokenization_cleaning(unkn_input):
  """Clean a tokenized tweet after tokenization.

  A list is used as given and a string is first parsed with
  strg_list_to_list; any other input is treated as an empty token list.
  The tokens then go through remove_punc followed by remove_empty_item.
  (lowercase_alpha is intentionally not applied.)
  """
  if isinstance(unkn_input, str):
    tokens = strg_list_to_list(unkn_input)
  elif isinstance(unkn_input, list):
    tokens = unkn_input
  else:
    tokens = []
  return remove_empty_item(remove_punc(tokens))

In [None]:
# Apply pretokenization_cleaning to every tweet text (Series.map style)
new_df['pretoken'] = new_df['text'].map(pretokenization_cleaning)
new_df.head()

In [None]:
# Apply tokenize to every cleaned tweet (Series.map style)
new_df['token'] = new_df['pretoken'].map(tokenize)
new_df.head()


Stemming and Lemmatizing the dataframe

In [None]:
# Calling stemming (list comprehension style).
# The 'token' column already holds tokenize(pretoken) for every row, so it is
# reused here instead of tokenizing each tweet a second time.
new_df['stemmed']=[stemming(tokens) for tokens in new_df['token']]
new_df.head()

In [None]:
# Calling lemmatize (list comprehension style).
# The 'token' column already holds tokenize(pretoken) for every row, so it is
# reused here instead of tokenizing each tweet again.
new_df['lemmatized']=[lemmatize(tokens) for tokens in new_df['token']]
new_df.head()

Post tokenization cleaning. Here I will be removing numbers, empty tokens, single punctuations etc.

In [None]:
# Apply posttokenization_cleaning to every lemmatized token list (Series.map style)
new_df['posttoken'] = new_df['lemmatized'].map(posttokenization_cleaning)
new_df.head()

In [None]:
def create_wordcloud(posttoken):
    """Draw a word cloud from an iterable of token lists.

    Args:
        posttoken: iterable (e.g. a DataFrame column) whose items are lists
            of string tokens. The argument itself is used; the function no
            longer reads the global ``new_df``.

    Returns:
        None. The figure is shown with matplotlib.
    """
    # Flatten the per-tweet token lists into one space-separated string.
    text = " ".join(token for sublist in posttoken for token in sublist)
    # Generate a word cloud image
    wordcloud = WordCloud(width=800, height=400, max_words=200, background_color="white").generate(text)
    # Display the generated image:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud of the post-tokenization tokens
create_wordcloud(new_df['posttoken'])

In [None]:
#word_counts = pd.Series(' '.join(new_df['posttoken']).split()).value_counts()
#word_counts[:20].plot(kind='bar', figsize=(10,5))
#plt.title('20 Most Frequent Words')
#plt.xlabel('Words')
#plt.ylabel('Frequency')
#plt.show()

Model Building

In [None]:
# Input feature (X: the cleaned token lists in 'posttoken')
# and label (y: the 0/1 sentiment in 'target')

X = new_df.posttoken
y = new_df.target

In [None]:
# Splitting our data into Train and Test
# 25% of the rows are held out as the validation split (test_size = 0.25);
# random_state = 42 keeps the split reproducible between runs.

from sklearn.model_selection import train_test_split


X_train,X_val,y_train,y_val = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [None]:
# TFIDF : Extracting Tf-idf features

from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer needs strings, so every token list is converted with str().
X_train = [str(x) for x in X_train]
X_val = [str(x) for x in X_val]

tfidf = TfidfVectorizer(max_df=0.90, min_df=0.02, max_features=1000, stop_words='english')

# Fit the vocabulary and IDF weights on the training split ONLY.
# Fitting on train + validation text leaks information from the held-out data
# into the features and makes the validation score optimistic.
X_train_tfidf = tfidf.fit_transform(X_train)
# The validation split is only transformed with the statistics learned above.
X_val_tfidf = tfidf.transform(X_val)

In [None]:
# Accuracy of each model trained on the Tf-idf features, keyed by model name
scores_dict_tfid = {}

In [None]:
 # Naive bayes using Tf-idf features

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB


# Fit a multinomial Naive Bayes classifier on the training Tf-idf features
# and predict the labels of the validation split.
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_val_tfidf)
# accuracy_score takes (y_true, y_pred); the true labels are passed first so
# the call matches the documented signature and the confusion_matrix call.
print('naive bayes tfidf accuracy %s' % accuracy_score(y_val, y_pred_nb))

In [None]:
# Record the validation accuracy; accuracy_score takes (y_true, y_pred).
scores_dict_tfid['Multinomial Naive Bayes(Tfid)'] = accuracy_score(y_val, y_pred_nb)

In [None]:
# Creating a confusion matrix.
from sklearn.metrics import confusion_matrix


# Confusion matrix of the Naive Bayes predictions, drawn as a heatmap whose
# cells are annotated with the group name, the raw count and the share of all
# validation samples.
cf_matrix_model = confusion_matrix(y_val, y_pred_nb)
group_names = ['True Neg','False Pos','False Neg','True Pos']
flat_counts = cf_matrix_model.flatten()
group_counts = ["{0:0.0f}".format(count) for count in flat_counts]
group_percentages = ["{0:.2%}".format(share)
                     for share in flat_counts/np.sum(cf_matrix_model)]
labels = np.asarray([
    f"{v1}\n{v2}\n{v3}"
    for v1, v2, v3 in zip(group_names, group_counts, group_percentages)
]).reshape(2,2)
sns.heatmap(cf_matrix_model, annot=labels, fmt='', cmap='binary')