# 0. Pip Installations

In [238]:
# Install all requirements
%pip install opendatasets
%pip install polars
%pip install nltk
%pip install wordcloud
%pip install stop-words


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# 1. Gather data

In [239]:
import opendatasets as od
import polars as pl

# Enter your Kaggle username and API key when prompted to download the dataset.
# The API key can be found by following this link:
# https://www.kaggle.com/settings/account
dataset_url = "https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset"
od.download(dataset_url)

# Read the downloaded CSV; the file is read with the ISO-8859-1 encoding.
file = 'sentiment-analysis-dataset/training.1600000.processed.noemoticon.csv'
df = pl.read_csv(file, separator=',', encoding='ISO-8859-1')

Skipping, found downloaded files in ".\sentiment-analysis-dataset" (use force=True to force download)


In [240]:
# Preview the first rows of the raw frame to check the column names and dtypes.
df.head()

polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
i64,i64,str,str,str,str
0,1467810672,"""Mon Apr 06 22:…","""NO_QUERY""","""scotthamilton""","""is upset that …"
0,1467810917,"""Mon Apr 06 22:…","""NO_QUERY""","""mattycus""","""@Kenichan I di…"
0,1467811184,"""Mon Apr 06 22:…","""NO_QUERY""","""ElleCTF""","""my whole body …"
0,1467811193,"""Mon Apr 06 22:…","""NO_QUERY""","""Karoli""","""@nationwidecla…"
0,1467811372,"""Mon Apr 06 22:…","""NO_QUERY""","""joy_wolf""","""@Kwesidei not …"


# 2. Data Preprocessing

## 2.1 Data cleaning

In [241]:
# Null check (no null data was found)
df.null_count()

# Polarity codes of the tweets: 0 = negative, 2 = neutral, 4 = positive
mapping = {0: 'negative', 2: 'neutral', 4: 'positive'}

# Give the columns short names, keep only the label and the tweet text, and
# convert the numeric polarity into a readable sentiment label.
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = (
    df.select(['sentiment', 'text'])
    .with_columns(pl.col('sentiment').map_dict(mapping).alias('sentiment'))
)
df.head()

sentiment,text
str,str
"""negative""","""is upset that …"
"""negative""","""@Kenichan I di…"
"""negative""","""my whole body …"
"""negative""","""@nationwidecla…"
"""negative""","""@Kwesidei not …"


## 2.2 Remove URLs

In [242]:
import re

# Strip every token starting with "http" using polars' native regex replace.
# This avoids calling a Python lambda once per row, which is slow on a frame
# of this size.
df = df.with_columns(pl.col('text').str.replace_all(r"http\S+", ""))

## 2.3 Remove punctuation and special characters

In [243]:
import string
# Build the deletion table once; it was previously rebuilt for every row
# inside the lambda even though it never changes.
punctuation_table = str.maketrans('', '', string.punctuation)
df = df.with_columns(pl.col('text').map_elements(lambda x: x.translate(punctuation_table)))

## 2.4 Lowercase all text

In [244]:
# Lowercase the tweet text so that differently-cased spellings of a word
# become the same token.
lowercased_text = pl.col('text').str.to_lowercase()
df = df.with_columns(lowercased_text)

## 2.5 Tokenization

In [245]:
# Split each tweet on single spaces into a list column of tokens.
tokenized = pl.col('text').str.split(by=" ").alias('tokenized_text')
df = df.with_columns(tokenized)

# The raw text is no longer needed once the tokens exist.
cleaned_df = df.drop('text')

## 2.6 Stemming and stop-word removal

In [None]:
from nltk.stem import SnowballStemmer
from stop_words import get_stop_words

stemmer = SnowballStemmer('english')

# Look the stop words up once and keep them in a set. The previous version
# called get_stop_words() for every single token and tested membership
# against its result each time.
english_stop_words = frozenset(get_stop_words('english'))
# NOTE(review): punctuation was stripped from the tweets in an earlier cell, so
# any stop-word entry containing an apostrophe can no longer match its token.
# Confirm whether that is acceptable.

# Drop empty tokens and stop words, then stem what is left.
stemmed_df = cleaned_df.with_columns(
    pl.col('tokenized_text')
    .map_elements(
        lambda tokens: [
            stemmer.stem(token)
            for token in tokens
            if token and token not in english_stop_words
        ]
    )
    .alias('stemmed_text')
)

In [None]:
# Keep only the label and the stemmed tokens; the unstemmed token column is
# no longer needed.
stemmed_df = stemmed_df.select(['sentiment', 'stemmed_text'])


# 3. Data Exploration

In [None]:
# Preview the preprocessed frame (sentiment label and stemmed tokens).
stemmed_df.head()

In [None]:
import matplotlib.pyplot as plt

# Count the tweets per sentiment label (value_counts is called with sort=True).
sentiment_count = stemmed_df.select(pl.col('sentiment').value_counts(sort=True)).unnest('sentiment')

# Use explicit axes and add a title and axis labels so the chart can be read
# on its own.
fig, ax = plt.subplots()
ax.barh(y=sentiment_count['sentiment'], width=sentiment_count['counts'])
ax.set_title('Number of tweets per sentiment')
ax.set_xlabel('Number of tweets')
ax.set_ylabel('Sentiment')
plt.show()

This indicates that there are more tweets with negative sentiment than with positive sentiment. No tweets with neutral sentiment were found.

In [None]:
# Frequency of tokenized words
from collections import Counter
import numpy as np

def count_word_freq(column):
    """Count how often each token occurs across all rows of a list column.

    Args:
        column: An object exposing ``to_list()`` that returns a list of token
            lists (one inner list per row).

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    counts = Counter()
    for tokens in column.to_list():
        for token in tokens:
            counts[token] += 1
    return counts

# Token frequencies over the whole corpus
all_tokens = stemmed_df.get_column('stemmed_text')
freq = count_word_freq(all_tokens)

# Token frequencies per sentiment class
negative_tweets = stemmed_df.filter(pl.col('sentiment') == 'negative')
positive_tweets = stemmed_df.filter(pl.col('sentiment') == 'positive')
freq_negative_sentiment = count_word_freq(negative_tweets.get_column('stemmed_text'))
freq_positive_sentiment = count_word_freq(positive_tweets.get_column('stemmed_text'))

In [None]:
# Visualize top 50 frequently used words
top_words = freq.most_common(n=50)
word, frequency = zip(*top_words)
indices = np.arange(len(top_words))

# One bar per word, with the words themselves as vertical tick labels
plt.bar(indices, frequency, color='b')
plt.xticks(indices, word, rotation='vertical')
plt.title('Top 50 frequently used words')
plt.tight_layout()
plt.show()

In [None]:
# Create wordcloud
import wordcloud

# Set the figure size when it is created so that tight_layout() below is
# computed for the final dimensions. Previously the figure was resized only
# after the layout had been calculated.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15))

# One cloud per sentiment class, built from the token frequencies
pos_cloud = wordcloud.WordCloud(max_words=150, random_state=30, collocations=True)
neg_cloud = wordcloud.WordCloud(max_words=150, random_state=30, collocations=True)
negative_sentiment_cloud = neg_cloud.generate_from_frequencies(dict(freq_negative_sentiment.most_common()))
positive_sentiment_cloud = pos_cloud.generate_from_frequencies(dict(freq_positive_sentiment.most_common()))

# Draw both clouds the same way: image only, titled, without axis ticks
panels = (
    (ax1, negative_sentiment_cloud, 'Negative Sentiment Words'),
    (ax2, positive_sentiment_cloud, 'Positive Sentiment Words'),
)
for ax, cloud, title in panels:
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title)
    ax.axis('off')

fig.tight_layout()

# 4. Text Vectorization

In [None]:
from gensim.models import FastText
import pickle as pk

import os

# Features are the stemmed token lists; labels are the sentiment strings
X = stemmed_df.select('stemmed_text')
y = stemmed_df.select('sentiment')
vector_size = 100

# Build the vocabulary and train FastText (sg=1) on the token lists
sentences = X.to_series()
fasttext_model = FastText(vector_size=vector_size, window=3, min_count=1, sg=1)
fasttext_model.build_vocab(sentences)
fasttext_model.train(sentences, total_examples=fasttext_model.corpus_count, epochs=5)

# Create the output folder first so saving does not fail when it is missing
os.makedirs("fasttext", exist_ok=True)
fasttext_model.save("fasttext/fasttext_model")

# Persist the embedding size. The context manager closes the file handle,
# which the previous bare open() call never did.
# NOTE(review): the target folder is named 'word2vec' although the model is
# FastText. The path is kept unchanged in case other code reads it; confirm.
os.makedirs("word2vec", exist_ok=True)
with open('word2vec/vector_size_w2v_metric.pkl', 'wb') as size_file:
    pk.dump(vector_size, size_file)