In [1]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.7.24-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 41.5/41.5 kB 2.0 MB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 1.5 MB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 7.0 MB/s eta 0:00:01
   ----- ---------------------------------- 0.2/1.5 MB 3.4


[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: C:\Users\Nguyen Hai Duong\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
# 1.1a
# grammar generates 3 strings ["hello", "hi", "good to see you"]

# need '' around the words to make them strings
greetGrammar = '''
S -> 'hi' | 'hello' | 'good to see you'
'''

# 1.1b
# load grammar into nltk

from nltk import CFG
from nltk.parse.generate import generate

grammar = CFG.fromstring(greetGrammar)
print(grammar)

print([g for g in generate(grammar)])

Grammar with 3 productions (start state = S)
    S -> 'hi'
    S -> 'hello'
    S -> 'good to see you'
[['hi'], ['hello'], ['good to see you']]


In [10]:
# 1.1c
# modify to accept names

greetGrammar = '''
S -> 'hi' NAME | 'hello' NAME | 'good to see you' NAME
NAME -> 'Alice' | 'Bob' | 'Carla'
'''

grammar = CFG.fromstring(greetGrammar)
print([g for g in generate(grammar)])


[['hi', 'Alice'], ['hi', 'Bob'], ['hi', 'Carla'], ['hello', 'Alice'], ['hello', 'Bob'], ['hello', 'Carla'], ['good to see you', 'Alice'], ['good to see you', 'Bob'], ['good to see you', 'Carla']]


In [11]:
import random
import nltk

positive_film_grammar = nltk.CFG.fromstring("""
S -> NPS VPS | NPP VPP | PR VPR
NPS -> DetS NS
NPP -> DetP NP
NS -> 'director' | 'screenplay' | 'plot' | 'story' | 'atmosphere'
NP -> 'scenes' | 'special effects' | 'costumes' | 'actors' | 'dialogues' | 'characters'

VPS -> VerbS Adj
VPP -> VerbP Adj

VPR -> VerbPR NPS | VerbPR NPP
DetS -> 'the' | 'this'
DetP -> 'the' | 'these' | 'those'

VerbS -> 'is' | 'looks' | 'was'
VerbP -> 'are' | 'look' | 'were'

VerbPR -> 'love' | 'loved' | 'enjoy' | 'enjoyed' | 'fell in love with' | 'adore' | 'adored'
PR -> 'I'

Adj -> 'great' | 'cool' | 'amazing' | 'fantastic' | 'very nice'
""")

positive_reviews = [' '.join(s) for s in generate(positive_film_grammar)]
for i in range(5):
  print(random.choice(positive_reviews))

negative_film_grammar = nltk.CFG.fromstring("""
S -> NPS VPS | NPP VPP | PR VPR
NPS -> DetS NS
NPP -> DetP NP
NS -> 'director' | 'screenplay' | 'plot' | 'story' | 'atmosphere'
NP -> 'scenes' | 'special effects' | 'costumes' | 'actors' | 'dialogues' | 'characters'

VPS -> VerbS Adj
VPP -> VerbP Adj

VPR -> VerbPR NPS | VerbPR NPP
DetS -> 'the' | 'this'
DetP -> 'the' | 'these' | 'those'

VerbS -> 'is' | 'looks' | 'was'
VerbP -> 'are' | 'look' | 'were'

VerbPR -> 'hate' | 'hated' | 'do not like' | 'did not enjoy' | 'got bored with' | 'despise' | 'despised'
PR -> 'I'

Adj -> 'mediocre' | 'dull' | 'terrible' | 'boring' | 'lame' | 'dumb'
""")

# join the strings by space
negative_reviews = [' '.join(s) for s in generate(negative_film_grammar)]
for i in range(5):
  print(random.choice(negative_reviews))  

these actors look fantastic
the scenes are great
the plot was cool
I loved this atmosphere
the atmosphere is very nice
those characters look dull
the characters were lame
I despised those scenes
the actors look dumb
I despised the scenes


Generate a training dataset for the Movie Reviews Sentiment Analysis
task. Your dataset should have 1,000 positive reviews obtained by sampling three random utterances from the positive_reviews language and
concatenating them together, and 1,000 negative reviews obtained by applying the same method to the negative_reviews language. Each datapoint should be a tuple T= (utterance, label) , where label can be
either "neg" or "pos" depending on the sentiment of the generated datapoint.


In [12]:
grammar_training_dataset = []
for i in range(1000):
  positive_utterance = f"{random.choice(positive_reviews)}. {random.choice(positive_reviews)}. {random.choice(positive_reviews)}"
  negative_utterance = f"{random.choice(negative_reviews)}. {random.choice(negative_reviews)}. {random.choice(negative_reviews)}"
  grammar_training_dataset.append((positive_utterance, "pos"))
  grammar_training_dataset.append((negative_utterance, "neg"))

for i in range(5):
  print(random.choice(grammar_training_dataset))

('these characters look dull. the atmosphere looks dull. these dialogues were dull', 'neg')
('the scenes are mediocre. I hated those costumes. those actors look terrible', 'neg')
('these characters look cool. these costumes look great. I adore these costumes', 'pos')
('the atmosphere was dumb. the dialogues were dull. these costumes look mediocre', 'neg')
('the story is amazing. I enjoy these scenes. the characters are great', 'pos')


In [13]:
# 2.1a

import regex as re

text = "We hold these truths to be self-evident, that all men are created equal, that all men are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness and mental stability."

# replace 'men' with 'people'
# \b is a word boundary to not modify 'mentalily'

text = re.sub(r'\bmen\b', 'people', text)
print(text)

We hold these truths to be self-evident, that all people are created equal, that all people are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness and mental stability.


In [15]:
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to C:\Users\Nguyen Hai
[nltk_data]     Duong\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [17]:
def cleanup_review(review):

  # 1. Replace all URLs with URLTOKEN
  review = re.sub(r'http\S+', 'URLTOKEN', review)

  # 2. Replace all dates with DATETOKEN
  review = re.sub(r"\d{4}-\d{2}-\d{2}", "DATETOKEN", review)
  review = re.sub(r"\d{4}/\d{2}/\d{2}", "DATETOKEN", review)

  review = re.sub(r"\d{2}-\d{2}-\d{4}", "DATETOKEN", review)
  review = re.sub(r"\d{2}/\d{2}/\d{4}", "DATETOKEN", review)


  # 3. Remove all non-alphanumerical characters
  review = re.sub(r'[^a-zA-Z0-9,!.\';:? ]', '', review)

  # 4. Collapse multiple spaces into one space
  review = re.sub(r'\s\s+', ' ', review)

  return review

sample_text = "Hello!! My name is Stefano, I have been a tutor for COMP#9414 since 01/04/2023. My personal website is http://stefano.com . (Nice to meet you ^__^)"
print(cleanup_review(sample_text))

Hello!! My name is Stefano, I have been a tutor for COMP9414 since DATETOKEN. My personal website is URLTOKEN . Nice to meet you 


In [24]:
#2.2

import numpy as np
np.random.seed(0)

nltk_data = []
for file_id in movie_reviews.fileids():
  nltk_data.append((movie_reviews.raw(file_id), movie_reviews.categories(file_id)[0]))

print(len(nltk_data))

cleanup_data = [(cleanup_review(review), label) for review, label in nltk_data]
np.random.shuffle(cleanup_data)

print(cleanup_data[0])

2000
("arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the oneliners are getting worse . it's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? once again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . in this so called dark thriller , the devil gabriel byrne has come upon earth , to impregnate a woman robin tunney which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane arnold himself . with the help of a trusty sidekick kevin pollack , they will stop at nothing to let the devil take over the world ! parts of this are actually so absurd , that they would fit right in with dogma . yes , the film is that weak , but it's bet

In [25]:
train_nltk_data = cleanup_data[0:int(len(cleanup_data)*0.85)]
test_nltk_data = cleanup_data[int(len(cleanup_data)*0.85):int(len(cleanup_data)*0.95)]
valid_nltk_data = cleanup_data[int(len(cleanup_data)*0.95):]

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
  ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
  ('clf', LinearSVC(C=1000, max_iter=10000)),
])

pipeline.fit([d[0] for d in train_nltk_data], [d[1] for d in train_nltk_data])

In [34]:
from sklearn import metrics

y_predicted = pipeline.predict([t[0] for t in test_nltk_data])

# Print the classification report
print(metrics.classification_report([t[1] for t in test_nltk_data], y_predicted,
                                    target_names=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.83      0.86      0.84        98
    negative       0.86      0.83      0.85       102

    accuracy                           0.84       200
   macro avg       0.85      0.85      0.84       200
weighted avg       0.85      0.84      0.85       200



Experiment with different classifiers – for example, you may want
to try a simple GaussianNB classifier, or try some classifiers that
usually perform well on this task such as AdaBoostClassifier or
RandomForestClassifier.

• Try changing the parameters of your classifiers – for example, try
reducing the C regularization parameter in your SVC, or increasing
it further.

In [38]:
# 3.f (ii) classifiers experiments

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

classifiers = [
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    # GaussianNB(),
]

# use each classifier to train the model
# vectorizer is used to convert text into numerical values
# classifier is used to classify the text into positive or negative

for c in classifiers:
  print(f"Training classifier model: {c.__str__()}")

  # create a pipeline to train the model
  # pipeline is a sequence of transformations followed by a classifier
  pipeline=Pipeline([
      ('vect', TfidfVectorizer(min_df=3, max_df=0.95)), # vectorizer
      ('clf', c), # classifier changed here
      ])
  
  # Train the model
  pipeline.fit(np.array([d[0] for d in train_nltk_data]), np.array([d[1] for d in train_nltk_data]))
  # Predict the test data
  y_predicted = pipeline.predict([t[0] for t in valid_nltk_data])

  # Print the classification report
  print(metrics.classification_report([t[1] for t in valid_nltk_data], y_predicted,
                                    target_names=['positive', 'negative']))

Training classifier model: KNeighborsClassifier(n_neighbors=3)
              precision    recall  f1-score   support

    positive       0.82      0.67      0.73        60
    negative       0.61      0.78      0.68        40

    accuracy                           0.71       100
   macro avg       0.71      0.72      0.71       100
weighted avg       0.73      0.71      0.71       100

Training classifier model: SVC(C=1, gamma=2)
              precision    recall  f1-score   support

    positive       0.90      0.78      0.84        60
    negative       0.73      0.88      0.80        40

    accuracy                           0.82       100
   macro avg       0.82      0.83      0.82       100
weighted avg       0.83      0.82      0.82       100

Training classifier model: DecisionTreeClassifier(max_depth=5)
              precision    recall  f1-score   support

    positive       0.84      0.53      0.65        60
    negative       0.55      0.85      0.67        40

    accurac



              precision    recall  f1-score   support

    positive       0.87      0.78      0.82        60
    negative       0.72      0.82      0.77        40

    accuracy                           0.80       100
   macro avg       0.79      0.80      0.80       100
weighted avg       0.81      0.80      0.80       100



In [36]:
# if change vectorizer to CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

for c in classifiers:
  print(f"Training classifier model: {c.__str__()}")

  # create a pipeline to train the model
  # pipeline is a sequence of transformations followed by a classifier
  pipeline=Pipeline([
      ('vect', CountVectorizer(min_df=3, max_df=0.95)), # vectorizer changed here
      ('clf', c), # classifier
      ])
  
  # Train the model
  pipeline.fit(np.array([d[0] for d in train_nltk_data]), np.array([d[1] for d in train_nltk_data]))
  # Predict the test data
  y_predicted = pipeline.predict([t[0] for t in valid_nltk_data])

  # Print the classification report
  print(metrics.classification_report([t[1] for t in valid_nltk_data], y_predicted,
                                    target_names=['positive', 'negative']))

Training classifier model: KNeighborsClassifier(n_neighbors=3)
              precision    recall  f1-score   support

    positive       0.64      0.73      0.68        60
    negative       0.48      0.38      0.42        40

    accuracy                           0.59       100
   macro avg       0.56      0.55      0.55       100
weighted avg       0.58      0.59      0.58       100

Training classifier model: SVC(C=1, gamma=2)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    positive       0.00      0.00      0.00        60
    negative       0.40      1.00      0.57        40

    accuracy                           0.40       100
   macro avg       0.20      0.50      0.29       100
weighted avg       0.16      0.40      0.23       100

Training classifier model: DecisionTreeClassifier(max_depth=5)
              precision    recall  f1-score   support

    positive       0.78      0.48      0.60        60
    negative       0.51      0.80      0.62        40

    accuracy                           0.61       100
   macro avg       0.65      0.64      0.61       100
weighted avg       0.67      0.61      0.61       100

Training classifier model: RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)
              precision    recall  f1-score   support

    positive       0.69      0.18      0.29        60
    negative       0.42      0.88      0.56        40

    accuracy           



              precision    recall  f1-score   support

    positive       0.88      0.75      0.81        60
    negative       0.69      0.85      0.76        40

    accuracy                           0.79       100
   macro avg       0.79      0.80      0.79       100
weighted avg       0.81      0.79      0.79       100

