# Stage 1: Loading the data

In [65]:
# Load the raw reviews: each line of a file becomes one stripped string.
# The `with` blocks close the file handles as soon as reading finishes;
# the previous bare `open()` calls left both files open.
with open('./movie_data/full_train.txt', 'r', encoding="utf-8") as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('./movie_data/full_test.txt', 'r', encoding="utf-8") as test_file:
    reviews_test = [line.strip() for line in test_file]

# Stage 2: Cleaning the data

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

'Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as Teachers My 35 years in the teaching profession lead me to believe that Bromwell High s satire is much closer to reality than is Teachers The scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled at High A classic line INSPECTOR I m here to sack one of your teachers STUDENT Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn t '

In [66]:
import re

# Matches every run of non-word characters (punctuation, whitespace, ...).
# Written as a raw string: '\W' inside a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# Compiled once because the same pattern is applied to every review.
NON_WORD_RUN = re.compile(r'\W+')

# Collapse each run of non-word characters into a single space.
reviews_train_clean = [NON_WORD_RUN.sub(' ', line) for line in reviews_train]
reviews_test_clean = [NON_WORD_RUN.sub(' ', line) for line in reviews_test]

# Stage 3: Preprocess: Vectorization

> Convert a collection of text documents to a matrix of token counts

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In order for this data to make sense to our machine learning algorithm, we’ll need to convert each review to a numeric representation, a process we call vectorization.

In [67]:
## Example
# Toy corpus showing what CountVectorizer produces: one row per document,
# one column per vocabulary term, each cell holding that term's count.
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# is its replacement. It returns an array, so wrap it in `list(...)` to print
# the vocabulary as a plain Python list.
print(list(vectorizer.get_feature_names_out()))
print(X.toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [68]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a term occurs in a review, not how often.
vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary from the training reviews and encode them in one call.
X = vectorizer.fit_transform(reviews_train_clean)

# Encode the test reviews with the vocabulary learned from the training set.
X_test = vectorizer.transform(reviews_test_clean)


# Stage 4: Build Classifier

Now that we’ve transformed our dataset into a format suitable for modeling, we can start building a classifier.

### Logistic Regression
**Logistic Regression** is a good baseline model for us to use for several reasons: 
1. Linear models are easy to interpret.
2. Linear models tend to perform well on sparse datasets like this one.
3. Linear models learn very quickly compared to other algorithms.


In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Labels are positional: the first 12,500 rows are labelled 1 and the
# remaining 12,500 are labelled 0.
# NOTE(review): this assumes the training file lists all 12,500 reviews of one
# class first, followed by the other class - confirm against the data file.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Fixed seed so the train/validation split, and therefore the accuracies
# printed below, are the same on every run.
SPLIT_SEED = 42

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=SPLIT_SEED
)

# Sweep the inverse regularization strength C and report validation accuracy.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.8704
Accuracy for C=0.05: 0.87792
Accuracy for C=0.25: 0.87616
Accuracy for C=0.5: 0.8728
Accuracy for C=1: 0.87216


In [106]:
# Refit on the full training matrix with C=0.25, then score on the test matrix.
final_model = LogisticRegression(C=0.25)
final_model.fit(X, target)

# NOTE(review): `target` was built for the training rows; using it to score the
# test predictions assumes the test file has the same size and class ordering
# as the training file - confirm.
test_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(target, test_predictions)
print("Final Accuracy: %s" % final_accuracy)

Final Accuracy: 0.87756


In [109]:
# Map every vocabulary term to the weight the final model learned for it.
# The terms come from `vectorizer`, the CountVectorizer that produced the
# matrix `final_model` was trained on, so term i lines up with coefficient i.
# (The previous code referenced `cv`, which is never defined in this notebook,
# and called `get_feature_names()`, which was removed from scikit-learn.)
feature_to_coef = {
    word: float(coef)  # plain float keeps the printed tuples readable
    for word, coef in zip(
        vectorizer.get_feature_names_out(), final_model.coef_[0]
    )
}

# The five terms with the largest weights.
print("-"*10 + "Positive" + "-"*10)
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:5]:
    print(best_positive)

# The five terms with the smallest (most negative) weights.
print("-"*10 + "Negative" + "-"*10)

for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:5]:
    print(best_negative)

----------Positive----------
('excellent', 1.229279210024921)
('perfect', 1.0576354995558486)
('refreshing', 1.0263537523627557)
('superb', 0.9573770637489298)
('wonderfully', 0.9226987711022778)
----------Nagetive----------
('worst', -1.808807015721503)
('waste', -1.6837499074071884)
('disappointment', -1.4148073505989953)
('poorly', -1.411221205215189)
('awful', -1.3633340916152703)
