In [None]:
from datasets import load_dataset
import pandas as pd
import re
import html
import contractions
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer 
from huggingface_hub import list_datasets

In [58]:
emoticon_map = {
    ":)": "😊",
    "=)": "😊",
    ":]": "😊",
    ":(": "😞",
    "=(": "😞",
    ";)": "😉",
    ":D": "😁",
    ":P": "😜",
    "=d": "😜",
    "<3": "❤️"
}

In [59]:
dt = load_dataset('stanfordnlp/sentiment140', trust_remote_code=True)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /home/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
def replace_emoticons(text):
    for emote_text, emote in emoticon_map.items():
        text = text.replace(emote_text, emote)
    return text

In [61]:
def preprocess(text):
    text =  html.unescape(text)                                                     # decode html emojis
    text = replace_emoticons(text)                                                  # convert html emojis into actual emojis
    text = emoji.demojize(text, language="en")                                      # convert emojis into words
    text = text.lower()                                                             # lower case all words
    text = contractions.fix(text)                                                   # de-contract words
    text = re.sub(r'http\S+', '', text)                                             # remove links
    text = re.sub(r'@\S+', '', text)                                                # remove usernames
    text = re.sub(r"[,.@`\"'-]",'', text)                                            # remove punctuation
    text = text.strip()                                                             # remove surrounding whitespace if any
    tokens = text.split()                                                           # tokenize
    tokens = [word for word in tokens if word not in stop_words]    # remove stopwords
    tokens = [ps.stem(word) for word in tokens]                                     # stem words
    return ' '.join(tokens)

In [62]:
df_train = pd.DataFrame(dt['train']).sample(n=30000, random_state=42).reset_index(drop=True)
df_test = pd.DataFrame(dt['test'])
df_train['text'] = df_train['text'].apply(preprocess)
df_test['text'] = df_test['text'].apply(preprocess)

In [None]:
cv = TfidfVectorizer(max_features=10000)
X_train = cv.fit_transform(df_train['text']).toarray()
X_test = cv.transform(df_test['text']).toarray()
y_train = df_train['sentiment'].values
y_test = df_test['sentiment'].values

In [64]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [65]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.5461847389558233