In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy
import re,string
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [45]:
nlp = spacy.load('en_core_web_sm')

# 1. Loading the Data

In [46]:
directory_pos = 'AclImdb/test/pos'
arr = []
for filename in tqdm(os.listdir(directory_pos)):
    path= os.path.join(directory_pos,filename)
    with open(path) as f:
        review = f.readlines()
        arr.append(review)
        
posArr = np.array(arr)
posArr = np.insert(posArr, 1, 1, axis=1)

100%|██████████| 12500/12500 [00:07<00:00, 1754.21it/s]


In [47]:
directory_neg = 'AclImdb/test/neg'
arr = []
for filename in tqdm(os.listdir(directory_neg)):
    path= os.path.join(directory_neg,filename)
    with open(path) as f:
        review = f.readlines()
        arr.append(review)

negArr = np.array(arr)
negArr = np.insert(negArr, 1, 0, axis=1)

100%|██████████| 12500/12500 [00:06<00:00, 1860.13it/s]


In [48]:
directory_pos_1 = 'AclImdb/train/pos'
arr = []
for filename in tqdm(os.listdir(directory_pos_1)):
    path= os.path.join(directory_pos_1,filename)
    with open(path) as f:
        review = f.readlines()
        arr.append(review)
        
posArr1 = np.array(arr)
posArr1 = np.insert(posArr1, 1, 1, axis=1)

100%|██████████| 12500/12500 [00:03<00:00, 3232.90it/s]


In [49]:
directory_neg_1 = 'AclImdb/train/neg'
arr = []
for filename in tqdm(os.listdir(directory_neg_1)):
    path= os.path.join(directory_neg_1,filename)
    with open(path) as f:
        review = f.readlines()
        arr.append(review)

negArr1 = np.array(arr)
negArr1 = np.insert(negArr1, 1, 0, axis=1)

100%|██████████| 12500/12500 [00:06<00:00, 1845.35it/s]


In [50]:
print(posArr.shape)
print(negArr.shape)
print(posArr1.shape)
print(negArr1.shape)

(12500, 2)
(12500, 2)
(12500, 2)
(12500, 2)


# 2. Data Preparation (4/10)

## 2.1 Preparing the data for ML model (1/10)

In [51]:
df_pos = pd.DataFrame(data = posArr, columns = ['review','label'])
df_neg = pd.DataFrame(data = negArr, columns = ['review','label'])
df_pos1 = pd.DataFrame(data = posArr1, columns = ['review','label'])
df_neg1 = pd.DataFrame(data = negArr1, columns = ['review','label'])
df = df_pos.append(df_neg)
df_test = df_pos1.append(df_neg1)

In [52]:
print(df.shape)
print(df_test.shape)

(25000, 2)
(25000, 2)


In [53]:
df.describe()

Unnamed: 0,review,label
count,25000,25000
unique,24801,2
top,Loved today's show!!! It was a variety and not...,1
freq,5,12500


In [54]:
df_test.describe()

Unnamed: 0,review,label
count,25000,25000
unique,24904,2
top,How has this piece of crap stayed on TV this l...,1
freq,3,12500


## 2.2 Cleaning and preprocessing of the text (2/10)

### Text Nomalisation

In [55]:
#Convert all text to lowercase
df['review'] = df['review'].str.lower()
df_test['review'] = df_test['review'].str.lower()

In [56]:
#Remove digits from text
df['review'] = df['review'].str.replace('\d+', '')
df_test['review'] = df_test['review'].str.replace('\d+', '')

In [57]:
#Remove Special Characters
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',text)
    return text

df['review'] = df['review'].apply(remove_special_characters)
df_test['review'] = df_test['review'].apply(remove_special_characters)

In [59]:
#encoding labels
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(df.label)
y_test = lbl_enc.fit_transform(df_test.label)

### Removing Stop Words & Stemming

In [60]:
import nltk
nltk.download('stopwords')

stopword_list=nltk.corpus.stopwords.words('english')
tokenizer=ToktokTokenizer()
#set stopwords to english
stop=set(stopwords.words('english'))

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

df['review']=df['review'].apply(remove_stopwords)
df_test['review']=df_test['review'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /Users/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
#Stemming
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

df['review']=df['review'].apply(simple_stemmer)
df_test['review']=df_test['review'].apply(simple_stemmer)

## 2.3 Represent the text in numerical format (1/10)

### Bag Of Words

In [62]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

#transformed train reviews
train_bow =cv.fit_transform(df['review'])

#transformed test reviews
test_bow =cv.transform(df_test['review'])

print('BOW_cv_train:',train_bow.shape)
print('BOW_cv_test:',test_bow.shape)

BOW_cv_train: (25000, 3898050)
BOW_cv_test: (25000, 3898050)


### TF-IDF

In [63]:
tf_idf = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets
tf_idf.fit(list(df['review']) + list(df_test['review']))
train_tf_idf =  tf_idf.transform(df['review']) 
test_tf_idf = tf_idf.transform(df_test['review'])


print('tf_idf_train:',train_tf_idf.shape)
print('tf_idf_test:',test_tf_idf.shape)

tf_idf_train: (25000, 385818)
tf_idf_test: (25000, 385818)


# 3. Training ML classifier to predict the sentiment (1/10)

### Bag Of Words

In [64]:
from sklearn.linear_model import LogisticRegression
# Fitting a simple Logistic Regression on TFIDF
lr_bow = LogisticRegression(C=1.0)
lr_bow.fit(train_bow, y)
y_pred_bow = lr_bow.predict_proba(test_bow)

lr_bow.score(test_bow,y_test)

0.55108

### TF-IDF

In [65]:
from sklearn.linear_model import LogisticRegression
# Fitting a simple Logistic Regression on TFIDF
lr_tf_idf = LogisticRegression(C=1.0)
lr_tf_idf.fit(train_tf_idf, y)
y_pred_tf_idf = lr_tf_idf.predict_proba(test_tf_idf)

lr_tf_idf.score(test_tf_idf,y_test)

0.87652

# 4. Evaluate the model extracting: precision, recall and f1 (1/10)

### Bag Of Words

In [66]:
lr_bow_predict = lr_bow.predict(test_bow)
print(lr_bow_predict)

[0 0 0 ... 0 0 0]


In [67]:
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.55108


In [68]:
lr_bow_report=classification_report(y_test,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

    Positive       0.53      0.98      0.69     12500
    Negative       0.86      0.12      0.21     12500

    accuracy                           0.55     25000
   macro avg       0.69      0.55      0.45     25000
weighted avg       0.69      0.55      0.45     25000



In [70]:
cm_bow=confusion_matrix(y_test,lr_bow_predict,labels=[1,0])
print(cm_bow)

[[ 1533 10967]
 [  256 12244]]


### TF-IDF

In [71]:
lr_tf_idf_predict = lr_tf_idf.predict(test_tf_idf)
print(lr_tf_idf_predict)

[1 1 1 ... 0 0 0]


In [72]:
lr_tf_idf_score=accuracy_score(y_test,lr_tf_idf_predict)
print("lr_tf_idf_score :",lr_tf_idf_score)

lr_tf_idf_score : 0.87652


In [73]:
lr_tf_idf_report=classification_report(y_test,lr_tf_idf_predict,target_names=['Positive','Negative'])
print(lr_tf_idf_report)


              precision    recall  f1-score   support

    Positive       0.89      0.86      0.88     12500
    Negative       0.87      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [75]:
cm_tf_idf=confusion_matrix(y_test,lr_tf_idf_predict,labels=[1,0])
print(cm_tf_idf)

[[11101  1399]
 [ 1688 10812]]


# 5. Prediction of model for one review (1/10)

In [None]:
"Very well made! The acting was great & the story was interesting throughout!"

# 6. Can you extract general topics from this corpus of texts? (Optional)

No

# 7. Questions

## 7.1 Have you observed any limitations in this dataset? (1/10)

The dataset does not contain the ID of the individual who posted it. Some people might just post more bitter and negative reviews than other skewing the overall data. 

## 7.2 What other techniques or methods that you haven’t used could you apply in any of the stages of the NLP pipeline to improve the performance of the model? (1/10)

I could potentially use different models from the Logistic regression.  \
I tried using lemmatization as seen in class but my laptop kept struggling and the results didn't improve.  \
A different stopword list can also be used. \
Noise could be reduced or features that barely apear can be removed.  \
N-grams and Bi grams could be used.  \
Furthermore, I could also use a larger training dataset by making the test one smaller.

## 7.3 In case you haven’t answered question 6, what technique could you use if you wanted to extract topics from the corpus of texts? (1/10)

LDA could be used as it classifies text in the document to a particular topic.  

Each review is modeled as a multinomial distribution of topics and each topic is modeled as a multinomial distribution of words.