In [2]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('average_perceptron_tagger')
nltk.download('stopwords')
nltk.download('brown')
nltk.download('movie_reviews')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jiarongli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jiarongli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jiarongli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading average_perceptron_tagger: Package
[nltk_data]     'average_perceptron_tagger' not found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiarongli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /Users/jiarongli/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/jiarongli/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [5]:
# Construct dataset for activity
# We'll load movie statement dataset from nltk into a pandas dataframe for ease of use

from nltk.corpus import movie_reviews
import pandas as pd
from sklearn.model_selection import train_test_split

pos_ids = movie_reviews.fileids('pos')
neg_ids = movie_reviews.fileids('neg')

pos_reviews = []
for id in pos_ids:
    pos_reviews.append(movie_reviews.raw(fileids=id))

neg_reviews = []
for id in neg_ids:
    neg_reviews.append(movie_reviews.raw(fileids=id))
    
pos_df = pd.DataFrame({"text": pos_reviews, "label": [1] * len(pos_reviews)})
neg_df = pd.DataFrame({"text": neg_reviews, "label": [0] * len(neg_reviews)})

df = pd.concat([pos_df, neg_df])

train, test = train_test_split(df, random_state=2023, stratify=df["label"], test_size=.20)
train, val = train_test_split(train, random_state=2023, stratify=train["label"], test_size=.15)

print(f"final datasets, train examples: {len(train)}, val examples: {len(val)}, test examples: {len(test)}")

final datasets, train examples: 1360, val examples: 240, test examples: 400


In [6]:
def preprocess_text(text:str) -> str:
    """
    Method as input a essy string and return a clean and normalised string
    The clean processs is as follows:
        1. lowercase string
        2. remove all non-alpha characters
    """
    
    # 1. lowercase the string
    text = text.lower()
    
    # 2. Strip all non alpha characters
    text = "".join([char for char in text if char.isalpha() or char == ' '])
    
    return text

In [7]:
messy_text = 'The 3)quick bro*&wn foxes ju#$mped over t<>he l"~azy dogs.'
clean_text = "the quick brown foxes jumped over the lazy dogs"

print(f"Function output: {preprocess_text(messy_text)}")
print(f"Function test: {preprocess_text(messy_text) == clean_text}")

Function output: the quick brown foxes jumped over the lazy dogs
Function test: True


In [9]:
train["cleaned_text"] = train["text"].apply(preprocess_text)
test["cleaned_text"] = test["text"].apply(preprocess_text)

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(text):
    return " ".join(lemmatizer.lemmatize(tok) for tok in text.split())

train["cleaned_text"] = train["cleaned_text"].apply(lemmatize_sentence)
test["cleaned_text"] = test["cleaned_text"].apply(lemmatize_sentence)

In [14]:
train["cleaned_text"]

874    ingredient man with amnesia who wake up wanted...
542    dont let the following quirk of this review fo...
788    you leave little note on my pillow i told you ...
403    my fellow american is a movie that at first gl...
808    did claus von bulow try to kill his wife sunny...
                             ...                        
426    by now i figured id seen every alfred hitchcoc...
399    the year is and the military is conducting nuc...
486    where do i begin okay how about with this star...
476    a backdrop of new year eve in would seem to le...
126    plot a separated glamorous hollywood couple mu...
Name: cleaned_text, Length: 1360, dtype: object

In [17]:
# generate Frequencies for the positive labels
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

positive_examples = train.query("label == 1")["cleaned_text"].tolist()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), max_df=.6)
cv.fit(positive_examples)

In [42]:
id2vocab = {v:k for k,v in cv.vocabulary_.items()}

pos_matrix = cv.transform(positive_examples)
counts = np.array(np.sum(pos_matrix, axis=0))[0]
sorted_mat = np.argsort(counts)[::-1]

# # top 15 tokens
for val in sorted_mat[:15]:
    print(f"{id2vocab[val]}: {counts[val]}")

story: 939
good: 846
life: 824
way: 665
year: 631
doe: 595
performance: 584
best: 560
little: 559
thing: 557
people: 553
come: 552
really: 525
great: 517
work: 500


In [44]:
# generate input features
from sklearn.feature_extraction.text import TfidfVectorizer

# We'll use the TFIDF instead of pure counts as it provides the model additional information
tfidf = TfidfVectorizer(
    stop_words = "english", # use english stopwords
    ngram_range = (1, 3), # generate ngrams
    max_df = .6, # corpus level stopwords: words that frequently occur in corpus
    max_features = 20000, # maximum vocabulary size
)

# Fit the tfidf vectorizer on the cleaned text column
tfidf.fit(train["cleaned_text"])

# Generate feature for train
train_X = tfidf.transform(train["cleaned_text"])
train_y = train["label"].tolist() # labels are already numericalised

# generate features for test
test_X = tfidf.transform(test["cleaned_text"])
test_y = test["label"].tolist()

In [45]:
train_X.shape

(1360, 20000)

In [None]:
# Train a Gaussian Naive Bayes Model
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

nb = GaussianNB() # initialise model
nb.fit(train_X.toarray())