### https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a

In [1]:
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
reviews_test = []
reviews_train = []

for line in open('data/aclImdb/movie_data/full_train.txt', encoding="utf8"):
    reviews_train.append(line.strip())
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [3]:
for line in open('data/aclImdb/movie_data/full_test.txt', encoding="utf8"):
    reviews_test.append(line.strip())

In [4]:
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", review.lower())) for review in reviews]
    return reviews
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)
reviews_train_clean[0]

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt'

In [5]:
# Check if the below code can be replaced with CountVectorizer using parameters

english_words = set(stopwords.words('english'))
def remove_stop_words(corpus):
    filtered_stop_words = []
    for line in corpus:
        filtered_stop_words.append(' '.join([word for word in line.split() if word not in english_words]))
    return filtered_stop_words

reviews_filtered = remove_stop_words(reviews_train_clean)
reviews_filtered[0]

'bromwell high cartoon comedy ran time programs school life teachers 35 years teaching profession lead believe bromwell highs satire much closer reality teachers scramble survive financially insightful students see right pathetic teachers pomp pettiness whole situation remind schools knew students saw episode student repeatedly tried burn school immediately recalled high classic line inspector im sack one teachers student welcome bromwell high expect many adults age think bromwell high far fetched pity isnt'

In [6]:
def run_lr_model(X, X_test):
    target = [1 if x < 12500 else 0 for x in range(25000)]
    X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)

    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        lr = LogisticRegression(C=c, max_iter=250)
        lr.fit(X_train, y_train)
        print ("Accuracy for C=%s: %s" 
               % (c, accuracy_score(y_val, lr.predict(X_val))))

    final_ngram = LogisticRegression(C=0.5, max_iter=250)
    final_ngram.fit(X, target)
    print ("Final Accuracy: %s" % accuracy_score(target, final_ngram.predict(X_test)))

In [7]:
def run_svm_model(X, X_test):
    target = [1 if x < 12500 else 0 for x in range(25000)]
    X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)

    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        svm = LinearSVC(C=c, max_iter=250)
        svm.fit(X_train, y_train)
        print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, svm.predict(X_val))))

    final_ngram = LinearSVC(C=0.5, max_iter=250)
    final_ngram.fit(X, target)
    print ("Final Accuracy: %s" % accuracy_score(target, final_ngram.predict(X_test)))

## Normalization

Normalization in NLP is the process of converting a word to its canonical form.

### Stemming

In [8]:
def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)
stemmed_reviews[0]

'bromwel high is a cartoon comedi it ran at the same time as some other program about school life such as teacher my 35 year in the teach profess lead me to believ that bromwel high satir is much closer to realiti than is teacher the scrambl to surviv financi the insight student who can see right through their pathet teacher pomp the petti of the whole situat all remind me of the school i knew and their student when i saw the episod in which a student repeatedli tri to burn down the school i immedi recal at high a classic line inspector im here to sack one of your teacher student welcom to bromwel high i expect that mani adult of my age think that bromwel high is far fetch what a piti that it isnt'

### Lemmatization

In [9]:
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews[0]

'bromwell high is a cartoon comedy it ran at the same time a some other program about school life such a teacher my 35 year in the teaching profession lead me to believe that bromwell high satire is much closer to reality than is teacher the scramble to survive financially the insightful student who can see right through their pathetic teacher pomp the pettiness of the whole situation all remind me of the school i knew and their student when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector im here to sack one of your teacher student welcome to bromwell high i expect that many adult of my age think that bromwell high is far fetched what a pity that it isnt'

## n-grams

In [10]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

run_lr_model(X, X_test)

Accuracy for C=0.01: 0.88992
Accuracy for C=0.05: 0.89568
Accuracy for C=0.25: 0.89824
Accuracy for C=0.5: 0.89696
Accuracy for C=1: 0.89712
Final Accuracy: 0.8978


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
ngram_vectorizer.fit(lemmatized_reviews)
X = ngram_vectorizer.transform(lemmatized_reviews)
X_test = ngram_vectorizer.transform(lemmatized_reviews)

run_lr_model(X, X_test)

Accuracy for C=0.01: 0.88128
Accuracy for C=0.05: 0.8896
Accuracy for C=0.25: 0.89168
Accuracy for C=0.5: 0.89072
Accuracy for C=1: 0.89104
Final Accuracy: 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Word Counts

In [12]:
wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

run_lr_model(X, X_test)

Accuracy for C=0.01: 0.87776


KeyboardInterrupt: 

In [None]:
wc_vectorizer.fit(lemmatized_reviews)
X = wc_vectorizer.transform(lemmatized_reviews)
X_test = wc_vectorizer.transform(lemmatized_reviews)

run_lr_model(X, X_test)

## TF-IDF

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

run_lr_model(X, X_test)

### Removing a small set of stop words along with an n-gram range from 1 to 3 and a linear svm