In [47]:
import sklearn
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

In [2]:
raw_datasets = load_dataset("imdb")

Reusing dataset imdb (C:\Users\laram\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

The method used to classify the sentiment of a review in [this notebook](https://colab.research.google.com/drive/17LTkRgS5FaLsl2vKLiTB7R93MbQMLUuu?usp=sharing) is by creating vectors based on word counts, and training a logistic regression model on these vectors and their corresponding labels. The CountVectoriser Function from python's sklearn packages is the one used in this explanation. 

I will try to increase the accuracy by assessing the functionality of the CountVectoriser and assessing whether stop words or stemming can be applied.

## Exploring the Functionality of CountVectorizer

The CountVectoriser function removes punctuation from the review, changes it to lowercase, takes the 200 most common words that appear in the reviews, and transform each review into a 200-length vector containing the counts for each word in a review (as explained here by [GeeksForGeeks](https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/#:~:text=CountVectorizer%20creates%20a%20matrix%20in,in%20that%20particular%20text%20sample.) and from an article on [Medium](https://medium.com/@vasista/preparing-the-text-data-with-scikit-learn-b31a3df567e#:~:text=We%20can%20use%20CountVectorizer%20of,punctuation%20and%20lower%20the%20documents.&text=It%20turns%20each%20vector%20into,the%20word%20in%20the%20vocabulary.).

To ensure I understand the way this function works, I will try to replicate this manually.

In [28]:
train_dataset = raw_datasets['train']
train_data = []
train_data_labels = []

# Limiting the size of the dataset to 3, to allow for easier comparison
i = 0
for item in train_dataset:
    train_data.append(item['text'])
    train_data_labels.append(item['label'])
    i += 1
    if i > 2:
        break

In [30]:
vectorizer = CountVectorizer(analyzer='word',max_features=20,lowercase=True)
features = vectorizer.fit_transform(train_data)
features_nd = features.toarray()

In [31]:
features_nd

array([[ 8,  2,  6,  1,  3,  4, 10,  4,  6,  1,  2,  6,  2,  0,  5,  4,
        10,  4,  9,  1],
       [ 5,  3,  0,  3,  2,  3,  7,  3,  1,  1,  3,  5,  3,  2,  0,  2,
         9,  2,  3,  3],
       [ 0,  0,  6,  2,  3,  1,  1,  1,  4,  2,  0,  1,  1,  6,  0,  0,
         2,  3,  1,  2]], dtype=int64)

### Investigating Counts of Words in all Reviews

In [37]:
print(vectorizer.get_feature_names())

['and', 'are', 'br', 'but', 'film', 'for', 'in', 'is', 'it', 'no', 'nudity', 'of', 'on', 'one', 'sex', 'that', 'the', 'this', 'to', 'with']


In [40]:
word_dict = {}

for review in small_train_data:
    review = re.sub(r'[^\w\s]', '', review)
    review = review.lower().split(" ")
    for word in review:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1
            
print(sorted(word_dict.items(), key=lambda x: x[1], reverse=True)[:20])

[('the', 21), ('in', 18), ('to', 13), ('and', 13), ('of', 12), ('a', 12), ('it', 10), ('this', 9), ('i', 8), ('for', 8), ('is', 8), ('film', 7), ('that', 6), ('br', 6), ('on', 6), ('with', 6), ('but', 6), ('sex', 5), ('nudity', 5), ('are', 5)]


The lists seem to contain the mostly the same vocabulary.

The CountVectoriser does not contain the words 'a' and 'i', it's possible that as part of the CountVectoriser function, it removes any words with a length less than 0. 

Let's try implement this into our manual analysis and our most common words may not contain 'a' and 'i', but rather 'no' and 'one', as seen in the CountVectorizer vocabulary.

In [42]:
word_dict = {}

for review in small_train_data:
    review = re.sub(r'[^\w\s]', '', review)
    review = review.lower().split(" ")
    for word in review:
        if len(word) > 1:
            if word in word_dict:
                word_dict[word] += 1
            else:
                word_dict[word] = 1
            
print(sorted(word_dict.items(), key=lambda x: x[1], reverse=True)[:20])

[('the', 21), ('in', 18), ('to', 13), ('and', 13), ('of', 12), ('it', 10), ('this', 9), ('for', 8), ('is', 8), ('film', 7), ('that', 6), ('br', 6), ('on', 6), ('with', 6), ('but', 6), ('sex', 5), ('nudity', 5), ('are', 5), ('one', 5), ('am', 4)]


Both lists contain 19/20 of the words in common. The manual version contains the word 'am', and the CountVectorizer contains the word 'no'. This is due to both words having a frequency of 4. It is safe to assume that this is roughly how the CountVectoriser function works in the background.

### Exploring the Impact of Removing Stop Words

In [43]:
vectorizer = CountVectorizer(analyzer='word',max_features=20,lowercase=True, stop_words='english')
features = vectorizer.fit_transform(train_data)
features_nd = features.toarray()

In [44]:
print(vectorizer.get_feature_names())

['br', 'comes', 'considered', 'curious', 'doesn', 'double', 'drama', 'film', 'films', 'issues', 'like', 'making', 'men', 'nudity', 'really', 'sex', 'shown', 'standard', 'swedish', 'yellow']


It's clear that removing stop words has improved the amount of meaningful words in the WordVectoriser feature count. 

Let's try this at a large scale, on all 50,000 rows of data, to test if there is an improvement to accuracy.

# Removing Stop Words from Dataset

We will use an 80/20 split for training and testing data.

### Assigning all Data to List

In [48]:
all_data = []
all_data_labels = []
for item in raw_datasets['train']:
    all_data.append(item['text'])
    all_data_labels.append(item['label'])

for item in raw_datasets['train']:
    all_data.append(item['text'])
    all_data_labels.append(item['label'])

In [49]:
training_ids = random.sample(list(np.arange(0,50000)), 40000)

### Assigning Training and Testing Data

In [50]:
train_data = []
train_data_labels = []
test_data = []
test_data_labels = []

for review_id in range(0,50000):
    if review_id in training_ids:
        train_data.append(all_data[review_id])
        train_data_labels.append(all_data_labels[review_id])
    else:
        test_data.append(all_data[review_id])
        test_data_labels.append(all_data_labels[review_id])

We need to ensure that the reviews are evenly distributed between the training set and the test set to ensure there is no bias that can affect the accuracy.

In [51]:
positive_training = sum(train_data_labels) / len(train_data_labels) * 100
positive_testing = sum(test_data_labels) / len(test_data_labels) * 100
    
print("The percentage of positive reviews in the training set is " + str(positive_training) + "%")
print("The percentage of positive reviews in the testing set is " + str(positive_testing) + "%")

The percentage of positive reviews in the training set is 50.002500000000005%
The percentage of positive reviews in the testing set is 49.99%


The percentages are close enough to 50% to allow the distribution to be considered fair.

In [52]:
vectorizer = CountVectorizer(analyzer='word', max_features=200, lowercase=True, stop_words='english')
features = vectorizer.fit_transform(train_data)
features_nd = features.toarray()

### Training the Model

In [53]:
log_model = LogisticRegression()
log_model = log_model.fit(X=features_nd,y=train_data_labels)



### Assigning the Test Data and Testing the Model

In [54]:
test_pred=log_model.predict(vectorizer.transform(test_data).toarray())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


In [55]:
print(accuracy_score(test_pred,test_data_labels))

0.7813


An accuracy of 78.13% is a slight improvement from the accuracy obtained using the 80/20 split in [this notebook](https://github.com/laramurphyyx/CA4023_Assignment2/blob/main/First%20Attempt%20at%20Improving%20Accuracy.ipynb), which was 76.77%.

Stop words are unlikely to influence the classification of a review in a meaningful way. As the most common 200 words in reviews will likely contain many stop words, it is reducing the number of influential words being chosen as a vector feature. This would imply that there would be a significant improvement in accuracy. It is difficult to explain why this did not occur. It could be that the stop words had no affect on the classification of positive/negative reviews, and so removing them and replacing them with less common 'meaningful' words.

It's also possible that the most common words between all reviews are unevenly distributed between positive and negative words. The next step is to create a 200-length vector of the 100 most common words in positive reviews and the 100 most common words in negative reviews.

# Re-Distribution of the Word Vector 

In [58]:
positive_train_data = []
negative_train_data = []

for i in range(0,40000):
    if train_data_labels[i]:
        positive_train_data.append(train_data[i])
    else:
        negative_train_data.append(train_data[i])

In [120]:
# Positive Vector
vectorizer_positive = CountVectorizer(analyzer='word', max_features=150, lowercase=True, stop_words='english')
features_positive = vectorizer_positive.fit_transform(positive_train_data)
positive_vocabulary = vectorizer_positive.vocabulary_

# Negative Vector
vectorizer_negative = CountVectorizer(analyzer='word', max_features=150, lowercase=True, stop_words='english')
features_negative = vectorizer_negative.fit_transform(negative_train_data)
negative_vocabulary = vectorizer_negative.vocabulary_

# Combining both vectors and re-indexing the vector
vocabulary = {**positive_vocabulary, **negative_vocabulary}
temp = []
unique_vocabulary = dict()
i = 0
for key, val in vocabulary.items():
    vocabulary[key] = i
    i += 1

# Using CountVectorizer with this new vocabulary
vectorizer_combined = CountVectorizer(vocabulary=vocabulary, lowercase=True, stop_words='english')
features = vectorizer_combined.fit_transform(train_data)
features_nd = features.toarray()

In [122]:
log_model = LogisticRegression()
log_model = log_model.fit(X=features_nd,y=train_data_labels)



In [123]:
log_model = LogisticRegression()
log_model = log_model.fit(X=features_nd,y=train_data_labels)



In [125]:
test_pred=log_model.predict(vectorizer_combined.transform(test_data).toarray())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


In [126]:
print(accuracy_score(test_pred,test_data_labels))

0.8008


80.08% is the highest accuracy that we have achieved as of yet. This is expected as there is an evenly distributed number of words in the vector that are positive and negative. 

Let's see if applying stemming will increase the accuracy further.