In [None]:
from __future__ import print_function, division
import sys

# Python 2 only: force UTF-8 as the interpreter-wide default encoding so that
# implicit str/unicode conversions of the review texts do not fail.
# Python 3 already defaults to UTF-8 and has no builtin `reload`, so the
# hack is skipped there instead of raising a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2; restores sys.setdefaultencoding
    sys.setdefaultencoding('utf-8')

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def data_load(data_dir, seed=None):
    """
    Load the review files below ``data_dir`` into shuffled train/test frames.

    The directory is expected to contain ``<split>/<sentiment>/`` folders for
    the splits "train" and "test" and the sentiments "neg" and "pos", with
    one review per file.

    data_dir: root directory of the dataset.
    seed:     optional integer; when given, the shuffling is reproducible.
              When None (default) the global NumPy random state is used.

    Returns a ``(train, test)`` tuple of DataFrames with the columns
    'text' (file content) and 'sentiment' (1 for "pos", 0 for "neg").
    """
    # io.open accepts `encoding` on both Python 2 and Python 3.
    import io

    rng = np.random.RandomState(seed) if seed is not None else np.random

    frames = {}
    for split in ["train", "test"]:
        rows = []
        for sentiment in ["neg", "pos"]:
            score = 1 if sentiment == "pos" else 0
            path = os.path.join(data_dir, split, sentiment)
            # os.listdir order is arbitrary; sort it so that a fixed seed
            # always produces the same row order.
            for f_name in sorted(os.listdir(path)):
                # Decode explicitly instead of relying on the platform locale.
                with io.open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    rows.append([f.read(), score])

        # Mix the two classes, which were read one after the other.
        rng.shuffle(rows)
        frames[split] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return frames["train"], frames["test"]

In [None]:
# The path may vary depending on the environment used
# to work with the notebook (Google Colab, Kaggle Kernel, Jupyter).
# data_load expects <path>/{train,test}/{neg,pos}/ sub-directories.

train, test = data_load('aclImdb_v1/aclImdb/')

The text should be treated as a string: by default, a Python char array (' ') adds extra quoting symbols, which might break preprocessing in later steps.

In [None]:
# Stack the train and test frames row-wise into a single corpus;
# it is split again with train_test_split further down.
# NOTE(review): the original row labels are kept, so index labels repeat.
corpus = pd.concat([train,test], axis=0)

# Make sure every review is stored as a plain string
corpus['text'] = corpus['text'].astype(str)

# Drop the now-redundant frames to free memory
del train
del test

In [None]:
# Preview the first rows of the combined corpus
corpus.head()

## Simple MultinomialNB with no preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation. The seed is fixed so that the
# split, and therefore the baseline score below, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(corpus['text'], corpus['sentiment'],
                                                   test_size=0.25, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words baseline using the default CountVectorizer settings.
vec = CountVectorizer()
# Learn the vocabulary from the training texts only, then reuse it for the test texts.
# NOTE(review): X_train / X_test are rebound from raw text to count matrices here,
# so re-running this cell alone would feed matrices back into the vectorizer --
# re-run the split cell first.
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# alpha=1.0 corresponds to add-one (Laplace) smoothing
model = MultinomialNB(alpha=1.0)

model.fit(X_train, y_train)

# Evaluate on the held-out split; the value printed is the F1 score as a percentage
pred = model.predict(X_test)
print("The F1 accuracy score: {}%".format(f1_score(y_test, pred) * 100))

# EDA
The result above is good enough for raw, uncleaned data. Let's explore the data to see which preprocessing steps will be crucial.
The plan for EDA is:
*  Check the word frequency to build a better stop_words list
* Check if data is dirty with HTML
* Check for some extra features such as emoticons

### The most frequent words

In [None]:
def word_freq(bag_of_words, n, vocabulary=None):
    """
    Return the ``n`` most frequent words of a document-term matrix.

    bag_of_words: document-term matrix (dense array, ``np.matrix`` or sparse
                  matrix) with one column per vocabulary entry.
    n:            number of top entries to return.
    vocabulary:   optional mapping ``word -> column index``. Defaults to the
                  vocabulary of the notebook-level vectorizer ``vec``.

    Returns an object array of shape ``(k, 2)`` with ``k <= n`` rows of
    ``[word, count]``, sorted by descending count. The object dtype keeps the
    counts numeric instead of coercing them to strings.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_

    # Flatten the column sums to 1-D so that dense arrays, np.matrix and
    # sparse matrices are all indexed the same way.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()

    ranked = sorted(
        ((word, totals[idx].item()) for word, idx in vocabulary.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return np.array(ranked[:n], dtype=object)

In [None]:
# Show the 30 most frequent vocabulary entries of the training matrix
pd.DataFrame(word_freq(X_train, 30), columns=['Word', 'Frequency'])

### HTML Tags and Numericals

In [None]:
# Check whether the dataset contains HTML tags: display the second review
# (iloc[1]) among those whose text contains "<br".
corpus[corpus['text'].str.contains("<br")]['text'].iloc[1]

### Emoticons

In [None]:
#Check if there are relevant emoticons in the dataset

# Raw strings avoid invalid-escape warnings; the non-capturing group avoids
# the "pattern has match groups" warning raised by str.contains.
sad_emoticon =   r'[:;](?:.?)[\(\[\{]|[\(\[\{]{2,}'
happy_emoticon = r'[:;](?:.?)[\)\]\}]|[\)\]\}]{2,}'

# Evaluate each (slow) regex scan once and reuse the boolean masks.
has_sad = corpus['text'].str.contains(sad_emoticon, regex=True)
has_happy = corpus['text'].str.contains(happy_emoticon, regex=True)
is_negative = corpus['sentiment'] == 0
is_positive = corpus['sentiment'] == 1

# Number of reviews per class that contain at least one such emoticon
sad_in_negative = (has_sad & is_negative).sum()
sad_in_positive = (has_sad & is_positive).sum()

happy_in_negative = (has_happy & is_negative).sum()
happy_in_positive = (has_happy & is_positive).sum()

In [None]:
# Report how many reviews of each class contain sad / happy emoticons
print("Sad emoticon in Negative class:   {}".format(sad_in_negative))
print("Sad emoticon in Positive class:   {}".format(sad_in_positive))
print("Happy emoticon in Negative class: {}".format(happy_in_negative))
print("Happy emoticon in Positive class: {}".format(happy_in_positive))

In [None]:
import matplotlib.pyplot as plt

# Share (in %) of each emoticon type that falls into the negative / positive class.
# 100.0 forces float division regardless of the interpreter's division rules.
sad_total = sad_in_negative + sad_in_positive
happy_total = happy_in_negative + happy_in_positive
sad_pct = (100.0 * sad_in_negative / sad_total,
           100.0 * sad_in_positive / sad_total)
happy_pct = (100.0 * happy_in_negative / happy_total,
             100.0 * happy_in_positive / happy_total)

fig, ax = plt.subplots()
index = np.arange(2)
bar_width = 0.4

# Two bars per class, side by side: sad (blue) and happy (red)
rects1 = ax.bar(index, sad_pct, bar_width,
                alpha=0.4, color='b', label='Sad')
rects2 = ax.bar(index + bar_width, happy_pct, bar_width,
                alpha=0.5, color='r', label='Happy')

ax.set_xlabel('Sentiment')
ax.set_ylabel('Percentages')
ax.set_title('Scores by sentiment and \'happy\' and \'sad\' emoticons')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(('Negative', 'Positive'))
# Without a legend the 'Sad' / 'Happy' labels given above are never shown.
ax.legend()

plt.show()

## Preprocessing
### Goal for this stage
* Remove HTML tags and numericals
* Turn emoticons into corresponding codes (':)))))' -> HAPPY_EMOT)
* Preprocess negation (Example: "don't like this movie. awful" -> "don't NOT_like awful")
* Delete all the symbols that were not recognized as HTML, numbers, and apostrophes as part of word shortenings

Between preprocessing stages, it is also important to review how each step modified the text data.

### Clean HTML and Numericals

In [None]:
import re
def clean_html(raw_text):
    """
    Strip HTML tags and digits from a review.

    Tags are replaced by a single space rather than removed outright, so that
    words separated only by markup (e.g. "great.<br /><br />The") do not get
    glued together. Digits are deleted.

    raw_text: the review text.
    Returns the cleaned text.
    """
    without_tags = re.sub(r'<.*?>', ' ', raw_text)
    return re.sub(r'[0-9]', '', without_tags)

In [None]:
# Strip HTML tags and digits from every review.
# The step is applied to the merged corpus; train/test were deleted above.
corpus['text'] = corpus['text'].apply(clean_html)

In [None]:
# Inspect one review to check the effect of the HTML / digit cleaning
corpus['text'].iloc[1]

### Emoticons
Preprocess emoticons and convert to corresponding code representations

In [None]:
import re
def emoticons(raw_text):
    """
    Replace emoticons with the placeholder tokens SAD_EMOT / HAPPY_EMOT.

    An emoticon is ':' or ';', at most one arbitrary character (e.g. a nose)
    and one or more brackets, or a run of two or more brackets on its own.
    Opening brackets count as sad, closing brackets as happy. A single
    bracket without eyes is left untouched.

    Both placeholders are surrounded by spaces so they never fuse with a
    neighbouring word, and a whole emoticon such as ':)))))' becomes exactly
    one token.

    raw_text: the review text.
    Returns the text with emoticons replaced.
    """
    sad_emoticon = re.compile(r'[:;].?[\(\[\{]+|[\(\[\{]{2,}')
    happy_emoticon = re.compile(r'[:;].?[\)\]\}]+|[\)\]\}]{2,}')

    cleantext = sad_emoticon.sub(' SAD_EMOT ', raw_text)
    cleantext = happy_emoticon.sub(' HAPPY_EMOT ', cleantext)

    return cleantext

In [None]:

# Replace emoticons by their placeholder tokens in every review
# (applied to the merged corpus).

corpus['text'] = corpus['text'].apply(emoticons)

### Negations
Preprocess word negations and convert to corresponding representations

In [None]:
# Stop words: an edited copy of the NLTK English stop-word list.
# negate_sequence drops every token found in this set.

stopWords = {'its', 'is', "you'd", 'was','movie',
             'above', 'further', 'y', "should've",'again', 'then', 'am', 'are',
             'their', 'being', 'does', 'no', 'over', 'them',  'her', 'for', 'after',
             'yourselves', 'both', 'before', 'now', 'should', 'too',
             'yourself', 'here', 'same', 'do', 'our', 'has', 'all', "you'll",
             'only', 'as', 'my', 'any',"that'll", 'i', 'when', 'by', 'than', 'had',
             'your', "you're", 'can', 'be', 'herself','myself', 'at',  'in', 'during',
             'did', 'me', 'who', 'own', 'ours', 'won', 'up',"it's", 'that', 'but', 'those', 
             'so', 'an', 'whom', 'shan', 'himself','very','about', 'from', 'which', 'once', 
             'where', 'his', 'few', 'these', 'each', 'other','most',  "she's", 're', 'a',
             'him', 'the', 'under', 'd', 'there', 'we', 'having', 'into', 'you',  
             'between', 's', 'o', 'itself', 'below', 'll', 'were', 'they', 'how', 'through',
             "you've", 'ourselves', 'until', 'to', 'theirs', 'against', 'themselves', 'and',
             'ma', 'or', 'off', 'because', 'on',  'such',  'what',  'out', 'with', 'just',
             'doing', 'he', 'it', 'why', 't', 'yours',  'some', 've', 'if', 'will', 'hers',
             'while', 'she','been', 'of', 'more', 'nor', 'this', 'm','have', 'down'}

# Negation words and contraction fragments, kept separate from the stop words.
# negate_sequence removes tokens found in this set from its output.
negateWords = {'ain', 'don', "don't",
               'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",'doesn', "doesn't",
               'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
               "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'not', "shan't", 'shouldn',
               "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn',"wouldn't"}

In [None]:
def negate_sequence(text, stop_words=None, negate_words=None):
    """
    Detects negations and transforms negated words into "NOT_" form
    for all words until the next punctuation mark.

    The text is lower-cased and split on whitespace. Stop words and the
    negation words themselves are dropped from the output.

    A word starts (or, when already inside a negation, ends) a negated span
    when it is "not" or "cannot", ends in "n't", or is the apostrophe-less
    spelling of an "n't" entry of ``negate_words`` (e.g. "dont"). Matching
    whole words avoids false triggers on words that merely contain "nt" or
    "not", such as "want" or "another".

    Clause punctuation (? . , ! : ; ") anywhere in a word, or an apostrophe at
    its edge (a quotation mark), ends the span. An apostrophe inside a word
    does not, so "don't" is no longer cancelled by its own apostrophe.

    text:         the review text.
    stop_words:   optional set of words to drop; defaults to the
                  notebook-level ``stopWords``.
    negate_words: optional set of negation words to drop; defaults to the
                  notebook-level ``negateWords``.

    Returns the processed text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = stopWords
    if negate_words is None:
        negate_words = negateWords

    clause_delims = "?.,!:;\""
    strip_chars = clause_delims + "'"
    # "don't" -> "dont", "isn't" -> "isnt", ... for informally written reviews
    bare_contractions = {w.replace("'", "") for w in negate_words
                         if w.endswith("n't")}

    negation = False
    result = []
    for word in text.lower().split():
        stripped = word.strip(strip_chars)

        # Punctuation-only tokens strip down to "" and carry no content.
        if stripped and stripped not in stop_words:
            negated = "NOT_" + stripped if negation else stripped
            if negated not in negate_words:
                result.append(negated)

        is_negator = (stripped in ("not", "cannot")
                      or stripped.endswith("n't")
                      or stripped in bare_contractions)
        if is_negator:
            negation = not negation

        # Checked after the toggle so that "not." opens and closes at once.
        ends_clause = (any(c in clause_delims for c in word)
                       or word != word.strip("'"))
        if ends_clause:
            negation = False

    return ' '.join(result)

In [None]:
# Mark negated words and drop stop words in every review
# (applied to the merged corpus).

corpus['text'] = corpus['text'].apply(negate_sequence)

In [None]:
# Inspect one review to check the effect of the negation handling
corpus['text'].iloc[1]

### Symbols
Clean data from extra symbols that were not cleaned on earlier stages

In [None]:
import re
def preproc(raw_text):
    """
    Delete the leftover symbols - + ! @ # $ % ^ & * ( ) < > ? | / from a review.

    The symbols are removed without inserting a space, so "well-made"
    becomes "wellmade".

    raw_text: the review text.
    Returns the text without those symbols.
    """
    # Raw string: the same pattern as before, without invalid-escape warnings.
    return re.sub(r'[\-\+!@#$%^&*()<>?\|\/]', '', raw_text)

In [None]:
# Remove the leftover symbols from every review
# (applied to the merged corpus).

corpus['text'] = corpus['text'].apply(preproc)

### WordNetLemmatizer
Lemmatize the whole corpus

In [None]:

from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
def lemmatize_sentences(sentence):
    """
    Lemmatize every whitespace-separated token of ``sentence`` with the
    shared WordNetLemmatizer and return the tokens joined by single spaces.
    """
    return ' '.join(map(lmtzr.lemmatize, sentence.split()))

In [None]:
# Lemmatize every review
# (applied to the merged corpus).

corpus['text'] = corpus['text'].apply(lemmatize_sentences)

In [None]:
# Inspect one review to check the effect of the lemmatization
corpus['text'].iloc[11]

### Porter Stemmer (SKIP)
For this particular dataset the PorterStemmer does not bring better performance, so it is better to skip this step.

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_sentences(sentence):
    """
    Stem every whitespace-separated token of ``sentence`` with the shared
    PorterStemmer and return the stems joined by single spaces.
    """
    return ' '.join(map(stemmer.stem, sentence.split()))

In [None]:
# Stemming is intentionally left disabled (see the note in the section header);
# uncomment the line below to apply it.
#corpus['text'] = corpus['text'].apply(stem_sentences)

In [None]:
# Inspect one fully preprocessed review before training
corpus['text'].iloc[1]

# Model Training

In [None]:
# ngram_range (1, 3) brings the highest accuracy score,
# yet it is very slow to train
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 25% of the preprocessed reviews. The seed is fixed so that the
# split, and therefore the scores reported below, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(corpus['text'], corpus['sentiment'],
                                                   test_size=0.25, random_state=42)

# binary=True records the presence of each n-gram instead of its count
vec = CountVectorizer(ngram_range=(1,3),
                      binary=True)
# Learn the vocabulary from the training texts only, then reuse it for the test texts
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

### MultinomialNB training and accuracy

In [None]:
# Same classifier as the baseline (alpha=1.0, i.e. add-one smoothing),
# now trained on the preprocessed n-gram features.
model = MultinomialNB(alpha=1.0)

model.fit(X_train, y_train)

# Evaluate on the held-out split; the value printed is the F1 score as a percentage
pred = model.predict(X_test)
print("The F1 accuracy score: {}%".format(f1_score(y_test, pred) * 100))

## Linear SVC Training and accuracy

In [None]:
from sklearn.svm import LinearSVC
# NOTE(review): GridSearchCV is imported but not used in this cell.
from sklearn.model_selection import GridSearchCV

# Linear SVM trained on the same n-gram features as the Naive Bayes model above
model = LinearSVC(C=4,loss='squared_hinge')
model.fit(X_train, y_train)
# Evaluate on the held-out split; the value printed is the F1 score as a percentage
pred = model.predict(X_test)
print("The F1 accuracy score: {}%".format(f1_score(y_test, pred) * 100))