# 0. Pip Installations

In [164]:
!pip install opendatasets
!pip install polars



# 1. Load dataset from kaggle

In [165]:
import opendatasets as od
import polars as pl

od.download("https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset")
file =('sentiment-analysis-dataset/training.1600000.processed.noemoticon.csv')

# Enter kaggle username and api key to download dataset. Api key can be found by following this link:
# https://www.kaggle.com/settings/account

df = pl.read_csv(file,separator=',', encoding='ISO-8859-1')

Skipping, found downloaded files in ".\sentiment-analysis-dataset" (use force=True to force download)


In [166]:
df.head()

polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
i64,i64,str,str,str,str
0,1467810672,"""Mon Apr 06 22:…","""NO_QUERY""","""scotthamilton""","""is upset that …"
0,1467810917,"""Mon Apr 06 22:…","""NO_QUERY""","""mattycus""","""@Kenichan I di…"
0,1467811184,"""Mon Apr 06 22:…","""NO_QUERY""","""ElleCTF""","""my whole body …"
0,1467811193,"""Mon Apr 06 22:…","""NO_QUERY""","""Karoli""","""@nationwidecla…"
0,1467811372,"""Mon Apr 06 22:…","""NO_QUERY""","""joy_wolf""","""@Kwesidei not …"


# 2. Data Preprocessing

## 2.1 Data cleaning

In [167]:
df.null_count()
# No null data

# Rename columns
df.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']

# Drop columns that are not needed
df = df.select(['polarity', 'text'])

## 2.2 Remove punctuations and special characters 

In [168]:
import string
df = df.with_columns(pl.col('text').map_elements(lambda x: x.translate(str.maketrans('', '', string.punctuation))))
df.head()

polarity,text
i64,str
0,"""is upset that …"
0,"""Kenichan I div…"
0,"""my whole body …"
0,"""nationwideclas…"
0,"""Kwesidei not t…"


## 2.3 Lowercase all text

In [169]:
df = df.with_columns(pl.col('text').str.to_lowercase())

In [170]:
df = df.with_columns(pl.col('text')
                     .str.split(by=" ")
                    .alias('tokenized_text'))
df.head()

polarity,text,tokenized_text
i64,str,list[str]
0,"""is upset that …","[""is"", ""upset"", … ""blah""]"
0,"""kenichan i div…","[""kenichan"", ""i"", … ""bounds""]"
0,"""my whole body …","[""my"", ""whole"", … """"]"
0,"""nationwideclas…","[""nationwideclass"", ""no"", … """"]"
0,"""kwesidei not t…","[""kwesidei"", ""not"", … """"]"
