In [2]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

In [3]:
import sys
sys.path.append('../src')
from main import load_data, Extractor, remove_stop_words, negation_handling, clean_text, lemmatizing
from preprocessing import emoji_tagging
from scipy.sparse import save_npz, load_npz
import numpy as np
import nltk
from nltk import word_tokenize
from util import curry
from pprint import pprint

from wordcloud import WordCloud
import matplotlib.pyplot as plt

time: 1.65 s


# Load data and do some preprocessing:

In [4]:
Xtr_text, Ytr, Xte_text, Yte = load_data('../aclImdb/train/', '../aclImdb/test/')

Load Data...
Done loading data!

time: 18.7 s


In [5]:
extractor1 = Extractor(Xtr_text, Xte_text)\
        .bind(curry(emoji_tagging))\
        .bind(curry(remove_stop_words))\
        .bind(curry(lemmatizing))\
        .bind(curry(negation_handling))\
        .bind(curry(clean_text))

time: 6min 5s


In [6]:
extractor2 = Extractor(Xtr_text, Xte_text)\
        .bind(curry(remove_stop_words))\
        .bind(curry(lemmatizing))\
        .bind(curry(clean_text))

time: 3min 38s


In [7]:
extractor3 = Extractor(Xtr_text, Xte_text)\
        .bind(curry(lemmatizing))\
        .bind(curry(clean_text))

time: 1min 27s


In [8]:
extractor4 = Extractor(Xtr_text, Xte_text)\
        .bind(curry(clean_text))

time: 39.5 s


In [9]:
Xtr1, Xte1 = extractor1.get_features()
Xtr2, Xte2 = extractor2.get_features()
Xtr3, Xte3 = extractor3.get_features()
Xtr4, Xte4 = extractor4.get_features()

time: 1.12 ms


In [10]:
print(Xtr1.shape)
print(Xte1.shape)

AttributeError: 'list' object has no attribute 'shape'

time: 149 ms


# Split the Training Dataset up in positive and negative sets

Lets analyze the datasets

In [None]:
Xtr_np = np.array(Xtr1)
Xte_np = np.array(Xte1)

Xtr_pos = Xtr_np[Ytr>0]
Xtr_neg = Xtr_np[Ytr<1]

In [None]:
super_review_pos = " ".join(Xtr_pos)
super_review_neg = " ".join(Xtr_neg)

In [None]:
tokens_pos = word_tokenize(super_review_pos)
tokens_neg = word_tokenize(super_review_neg)

In [None]:
frequency_dist_pos = nltk.FreqDist(tokens_pos)
frequency_dist_neg = nltk.FreqDist(tokens_neg)

In [None]:
pos = sorted(frequency_dist_pos.items(), key=lambda x:-x[1])[:50]
neg = sorted(frequency_dist_neg.items(), key=lambda x:-x[1])[:50]
print(pos)
print(neg)

## Create a wordcloud based on the top 50 words in each set

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(50,50))

wordcloud_pos = WordCloud().generate_from_frequencies(frequency_dist_pos)
wordcloud_neg = WordCloud().generate_from_frequencies(frequency_dist_pos)

plt.imshow(wordcloud_pos)
plt.title("Positive")
plt.axis("off")
plt.savefig("positive_wordcloud.png", bbox_inches='tight')
plt.show()

plt.imshow(wordcloud_neg)
plt.title("Negative")
plt.axis("off")
plt.savefig("negative_wordcloud.png", bbox_inches='tight')
plt.show()

# Feature selection: Kbest based on Chi test and ngram range performance

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
def vectorize(Xtr, Ytr, Xte, Yte, ngram_r=(1,1)):
    vectorizer = TfidfVectorizer(ngram_range=ngram_r)

    vectorizer.fit(Xtr, Ytr)
    XtrS = vectorizer.transform(Xtr)
    XteS = vectorizer.transform(Xte)
    print("Number of Features: ", len(vectorizer.get_feature_names()))
    
    return XtrS, XteS

def k_Best(Xtr, Ytr, Xte, Yte, k_best):
    Select = SelectKBest(chi2, k=k_best).fit(Xtr, Ytr)
    XtrS = Select.transform(Xtr)
    XteS = Select.transform(Xte)

    print("Number of features after select KBest", k_best)
    
    return XtrS, XteS

def plot_scores(k_list, f1_scores, p_scores, r_scores, labels, save_to_file=False):
    for f1, k in zip(f1_scores, k_list):
        plt.semilogx(k, f1)
        plt.title("F-1 score for best k features")
    plt.legend(labels)
    plt.xlabel("k")
    if save_to_file:
        plt.savefig('f1-score.png', bbox_inches='tight')
    plt.show()
    
    
    for p, k in zip(p_scores, k_list):
        plt.semilogx(k, p)
    plt.title("Precision for best k features")
    plt.legend(labels)
    plt.xlabel("k")
    if save_to_file:
        plt.savefig('precision-score.png', bbox_inches='tight')
    plt.show()
        
    
    for r, k in zip(r_scores, k_list):
        plt.semilogx(k, r)
    plt.title("Recall for best k features")
    plt.legend(labels)
    plt.xlabel("k")
    if save_to_file:
        plt.savefig('recall-score.png', bbox_inches='tight')
    plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import time 

In [None]:
Xtr_11_1, Xte_11_1 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(1,1))
Xtr_11_2, Xte_11_2 = vectorize(Xtr2, Ytr, Xte2, Yte, ngram_r=(1,1))
Xtr_11_3, Xte_11_3 = vectorize(Xtr3, Ytr, Xte4, Yte, ngram_r=(1,1))
Xtr_11_4, Xte_11_4 = vectorize(Xtr4, Ytr, Xte4, Yte, ngram_r=(1,1))

Xtr_set = [Xtr_11_1, Xtr_11_2, Xtr_11_3, Xtr_11_4]
Xte_set = [Xte_11_1, Xte_11_2, Xte_11_3, Xte_11_4]

k_list = [
    [10, 100, 1000, 10000, 50000, 88391],
    [10, 100, 1000, 10000, 50000, 69950],
    [10, 100, 1000, 10000, 50000, 72198],
    [10, 100, 1000, 10000, 50000, 73986]
]

labels=["All preprocessing", "wo negation/emoji", "wo negation/emoji/stopword_removing", "Only clean text"]

In [None]:
# Xtr_11, Xte_11 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(1,1))
# Xtr_22, Xte_22 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(2,2))
# Xtr_33, Xte_33 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(3,3))
# Xtr_12, Xte_12 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(1,2))
# Xtr_13, Xte_13 = vectorize(Xtr1, Ytr, Xte1, Yte, ngram_r=(1,3))

# Xtr_set = [Xtr_11, Xtr_22, Xtr_33, Xtr_12, Xtr_13]
# Xte_set = [Xte_11, Xte_22, Xte_33, Xte_12, Xte_13]

# k_list = [
#     [10, 100, 1000, 10000, 50000, 88391],
#     [10, 100, 1000, 10000, 100000, 1000000, 1790783],
#     [10, 100, 1000, 10000, 100000, 1000000, 2842984],
#     [10, 100, 1000, 10000, 100000, 1000000, 1879174],
#     [10, 100, 1000, 10000, 100000, 1000000, 4722158]
# ]

# labels=["ngram=(1,1)", "ngram=(2,2)", "ngram=(3,3)", "ngram=(1,2)", "ngram=(1,3)"]

In [None]:

f1_scores_all = []
p_scores_all = []
r_scores_all  = []

for (Xtrain, Xtest), k_best in zip(zip(Xtr_set, Xte_set), k_list):
    f1_scores = []
    p_scores = []
    r_scores  = []
    
    for k in k_best:
        Xtr_vec, Xte_vec = k_Best(Xtrain, Ytr, Xtest, Yte, k)
        classifier = LogisticRegression(solver='lbfgs')
        classifier.fit(Xtr_vec, Ytr)
        Yte_hat = classifier.predict(Xte_vec)

        f1_scores.append(f1_score(Yte, Yte_hat))
        p_scores.append(precision_score(Yte, Yte_hat))
        r_scores.append(recall_score(Yte, Yte_hat))

    f1_scores_all.append(f1_scores)
    p_scores_all.append(p_scores)
    r_scores_all.append(r_scores)
    


In [None]:
plot_scores(k_list, f1_scores_all, p_scores_all, r_scores_all, labels, save_to_file=True)
