In [1]:
from datasets import load_dataset_builder

# Create the dataset builder for "imdb"; none of the cells visible below reads `ds_builder` again.
ds_builder = load_dataset_builder("imdb")

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 4.31k/4.31k [00:00<00:00, 1.33MB/s]
Downloading metadata: 100%|█████████████████| 2.17k/2.17k [00:00<00:00, 538kB/s]


# Lab 02 NLP Non Deep

# Part 1: The dataset

## Question 1: How many splits does the dataset have?
The dataset has 3 splits (`train`, `test` and `unsupervised`), as we can see below.

In [None]:
from datasets import get_dataset_split_names
# Bare last expression, so the notebook displays the list of split names as the cell output.
get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

## Question 2: How big are these splits?
- train: 25000 records
- test: 25000 records
- unsupervised: 50000 records

In [3]:
from datasets import load_dataset

# Load every split of the IMDB dataset (downloaded on first use, then served from the local cache).
dataset = load_dataset("imdb")
# Bare expression: the notebook renders the DatasetDict summary (features and row counts per split).
dataset

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /Users/longvo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data: 100%|████████████████████| 84.1M/84.1M [00:39<00:00, 2.13MB/s]
                                                                                

Dataset imdb downloaded and prepared to /Users/longvo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 81.04it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# We can see that the train and test splits each have 25000 rows, while the unsupervised split has 50000 rows.

## Question 3: What is the proportion of each class on the supervised splits?

Here we can see the proportion of value of each split is:
- train: 50% of value 0, 50% of value 1 (12500 each)
- test: 50% of value 0, 50% of value 1 (12500 each)

In [4]:
# Load only the two supervised splits; the order of the list fixes the unpacking order.
train_data, test_data = load_dataset("imdb", split=["train", "test"])

Found cached dataset imdb (/Users/longvo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 76.83it/s]


In [5]:
# Convert both supervised splits to pandas DataFrames in one pass.
df_pandas_train, df_pandas_test = (
    split.to_pandas() for split in (train_data, test_data)
)

In [7]:
# Number of training rows whose label is 0 (explicit label-based lookup).
df_pandas_train['label'].value_counts().loc[0]


12500

In [8]:
# Number of training rows whose label is 1 (explicit label-based lookup).
df_pandas_train['label'].value_counts().loc[1]

12500

In [9]:
# Number of test rows whose label is 0 (explicit label-based lookup).
df_pandas_test['label'].value_counts().loc[0]

12500

In [10]:
# Number of test rows whose label is 1 (explicit label-based lookup).
df_pandas_test['label'].value_counts().loc[1]

12500

# Part 2: Naive Bayes classifier

## Question 1: Text pretreatment
- First, all alphabetical characters are converted to lowercase
- Punctuation is removed so that characters such as '.' or ',' are not tokenized and do not influence the output

In [11]:
import string
import pandas as pd

def lower_case_rm_punc(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Lower-case the ``text`` column and strip punctuation from it.

    The frame is modified in place and the very same object is returned,
    so the DataFrame passed in by the caller is changed as well.

    Args:
        dataframe: Frame containing a string column named ``text``.

    Returns:
        The same ``dataframe`` object, with ``text`` lower-cased and every
        run of characters that is neither a word character (``\\w``, which
        includes ``_``) nor whitespace removed.

    Raises:
        KeyError: If ``dataframe`` has no ``text`` column.
    """
    lowered = dataframe['text'].str.lower()
    # regex=True must be explicit: the original call triggered a pandas
    # FutureWarning, and from pandas 2.0 on the default is a literal
    # (non-regex) replacement, which would leave all punctuation in place.
    # NOTE(review): only the punctuation of HTML tags is removed, so a
    # "<br />" line break survives as the token "br".
    dataframe['text'] = lowered.str.replace(r'[^\w\s]+', '', regex=True)
    return dataframe

# lower_case_rm_punc edits its argument in place and returns it, so
# df_train / df_test are the same objects as df_pandas_train / df_pandas_test.
df_train, df_test = (
    lower_case_rm_punc(frame) for frame in (df_pandas_train, df_pandas_test)
)

  dataframe['text'] = dataframe['text'].str.replace(r'[^\w\s]+', '')


## Question 2: Create a pipeline that has a `CountVectorizer()` and a `MultinomialNB`

In [12]:
# Question 2
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
# The step names 'vect' and 'clf' identify the two stages of the pipeline.
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

Fit the pipeline on the training data to train the model

In [13]:
# Train the vectorizer and the classifier on the lower-cased, punctuation-free training text.
pipeline.fit(X=df_train['text'], y=df_train['label'])

## Question 3: Report on the accuracy of the model

- The accuracy of the model on `train` split is: $91.28\%$
- The accuracy of the model on `test` split is: $81.74\%$

In [17]:
from sklearn.metrics import accuracy_score
# Predict on both splits, then print train accuracy followed by test accuracy (one per line).
train_pred = pipeline.predict(df_train['text'])
test_pred = pipeline.predict(df_test['text'])
print(
    accuracy_score(df_train['label'], train_pred),
    accuracy_score(df_test['label'], test_pred),
    sep="\n",
)

0.91288
0.8174


## Question 4: Why accuracy is sufficient as a method of evaluation ?
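- Because the supervised splits are perfectly balanced (50% of each label in both `train` and `test`), accuracy is not distorted by a majority class. For evaluation we also don't need properties such as continuity or differentiability (those matter for a training loss, not for an evaluation metric). For this classification problem, the metric we really need to see is how well, on average, the model separates positive from negative reviews.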

## Question 5: What are the top 10 most important words (features) for each class?
- Before stopword removal, we can clearly see that the top 10 most important words are almost all stopwords such as `the`, `and` or `of` (right column of the output below; the left column lists the least probable words). These words usually add no value to the semantics or the morphology of the sentence. Note that the function below ranks words by their log-probability under class 0 only.

In [21]:
def show_most_informative_features(vectorizer, clf, n=20, class_index=0):
    """Print the ``n`` least and ``n`` most probable features of one class.

    Features are ranked by ``clf.feature_log_prob_[class_index]``. Each
    printed row holds one entry from the bottom of the ranking (left) and
    one from the top (right), each as a log-probability and a feature name.

    Args:
        vectorizer: Fitted object exposing ``get_feature_names_out()``.
        clf: Fitted classifier exposing ``feature_log_prob_``, indexable by
            class position.
        n: Number of rows to print.
        class_index: Position of the class in ``clf.feature_log_prob_``.
            Defaults to 0, which was the only class the previous version
            could report.

    Returns:
        None. The table is written to standard output.
    """
    feature_names = vectorizer.get_feature_names_out()
    # Ascending sort: lowest log-probability first, ties broken by feature name.
    ranked = sorted(zip(clf.feature_log_prob_[class_index], feature_names))
    least_probable = ranked[:n]
    # Reverse slice yields the n highest entries, most probable first.
    most_probable = ranked[:-(n + 1):-1]
    for (coef_1, fn_1), (coef_2, fn_2) in zip(least_probable, most_probable):
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

# Look the two stages up by the names given when the pipeline was built.
show_most_informative_features(
    pipeline.named_steps['vect'], pipeline.named_steps['clf'], n=10
)

	-14.8692	0001           		-2.8712	the            
	-14.8692	00383042       		-3.6649	and            
	-14.8692	006            		-3.7311	of             
	-14.8692	0079           		-3.7324	to             
	-14.8692	0080           		-4.0529	is             
	-14.8692	0083           		-4.1928	in             
	-14.8692	012310         		-4.2613	this           
	-14.8692	013007         		-4.3065	it             
	-14.8692	03             		-4.3975	that           
	-14.8692	048            		-4.5734	br             


We use the `nltk` package to remove stopwords from the text. `CountVectorizer()` has a `stop_words` parameter that allows the pipeline to do the same thing. However, there are numerous known problems related to this parameter, so we opt for the recommended way, using `nltk`.

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the stop-word corpus read just
# below, and 'punkt' (presumably required by word_tokenize — TODO confirm).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/longvo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/longvo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
def remove_stop_words(stop_words: set, 
                      dataframe: pd.DataFrame, column: str):
    """Drop stop words from every string in ``dataframe[column]``, in place.

    Each value is split on whitespace, tokens found in ``stop_words`` are
    discarded, and the remaining tokens are re-joined with single spaces.

    Args:
        stop_words: Tokens to remove (compared exactly, case-sensitive).
        dataframe: Frame whose ``column`` is overwritten with the result.
        column: Name of the text column to filter.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    dataframe[column] = [
        " ".join(token for token in document.split() if token not in stop_words)
        for document in dataframe[column]
    ]

In [24]:
# Filter stop words out of both splits; each frame is updated in place.
remove_stop_words(stop_words=stop_words, dataframe=df_train, column='text')
remove_stop_words(stop_words=stop_words, dataframe=df_test, column='text')

- We see a shift in the top 10 most important words. Words such as `good`, `bad`, `like` and `really` convey the emotion and the connotation of the review.

In [25]:
# Refit on the stop-word-free text, list the top features again, then report accuracy.
pipeline.fit(df_train['text'], df_train['label'])
show_most_informative_features(
    pipeline.named_steps['vect'], pipeline.named_steps['clf'], n=10
)
from sklearn.metrics import accuracy_score
train_pred = pipeline.predict(df_train['text'])
test_pred = pipeline.predict(df_test['text'])
# Train accuracy first, test accuracy second, one value per line.
print(
    accuracy_score(df_train['label'], train_pred),
    accuracy_score(df_test['label'], test_pred),
    sep="\n",
)


	-14.3119	0001           		-4.0161	br             
	-14.3119	00383042       		-4.2400	movie          
	-14.3119	006            		-4.5215	film           
	-14.3119	0079           		-4.8832	one            
	-14.3119	0080           		-5.0141	like           
	-14.3119	0083           		-5.3751	even           
	-14.3119	012310         		-5.4356	good           
	-14.3119	013007         		-5.4464	bad            
	-14.3119	03             		-5.4803	would          
	-14.3119	048            		-5.5773	really         
0.92556
0.82708


# Part 3: Stemming treatment

- By using `SnowballStemmer` we first:
    - Tokenize the sentence/ paragraph
    - Remove stopwords
    - Stem each word in the word token
    - Added back to the list
    - Assign the list as a column in a given dataframe

In [26]:
import re
from nltk.stem.snowball import SnowballStemmer

# Pattern for tokens made of word characters (\w+) from start to end; `stemming`
# uses it to decide which tokens get stemmed.
re_word = re.compile(r"^\w+$")
# English Snowball stemmer shared by the stemming step below.
stemmer = SnowballStemmer(language="english")

In [27]:
def stemming(stemmer: nltk.SnowballStemmer, stop_words: set,
             dataframe: pd.DataFrame, column: str):
    """Tokenize, filter and stem every string in ``dataframe[column]``, in place.

    Each value is tokenized with ``word_tokenize``; tokens found in
    ``stop_words`` are dropped; tokens matched by the module-level
    ``re_word`` pattern are replaced by ``stemmer.stem(token)`` and all
    other tokens are kept unchanged. The tokens are re-joined with single
    spaces.

    Args:
        stemmer: Object providing a ``stem(str) -> str`` method.
        stop_words: Tokens to discard before stemming.
        dataframe: Frame whose ``column`` is overwritten with the result.
        column: Name of the text column to process.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    stemmed_documents = []
    for document in dataframe[column]:
        tokens = [
            stemmer.stem(token) if re_word.match(token) else token
            for token in word_tokenize(document)
            if token not in stop_words
        ]
        stemmed_documents.append(" ".join(tokens).strip())
    dataframe[column] = stemmed_documents

# Stem both splits; each frame's 'text' column is overwritten in place.
stemming(stemmer=stemmer, stop_words=stop_words, dataframe=df_train, column='text')
stemming(stemmer=stemmer, stop_words=stop_words, dataframe=df_test, column='text')

In [28]:
# Retrain the same pipeline on the stemmed training text.
pipeline.fit(X=df_train['text'], y=df_train['label'])

- Accuracy of the model after stemming:
    - $91.35\%$ on train
    - $81.94\%$ on test

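- Compared with the first model (0.91288 on train, 0.8174 on test), the accuracy increased slightly. Compared with the model trained after stop-word removal alone (0.92556 on train, 0.82708 on test), however, it is slightly lower, so adding stemming did not improve the score here. Stemming maps inflected forms such as `watching` and `watched` back to the same root (`watch`), while stop-word removal drops filler words that don't contribute to the meaning of the sentence.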


In [29]:
# Score the pipeline trained on stemmed text: train accuracy, then test accuracy.
train_pred = pipeline.predict(df_train['text'])
test_pred = pipeline.predict(df_test['text'])
print(
    accuracy_score(df_train['label'], train_pred),
    accuracy_score(df_test['label'], test_pred),
    sep="\n",
)

0.91356
0.81944
