## Loading the DataSet

In [0]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

!tar -zxvf aclImdb_v1.tar.gz

In [0]:
import os
import numpy as np
import pandas as pd

In [0]:
def data_load(data_dir):
  
    data = {}
    for split in ["train", "test"]:
        data[split] = []
        for sentiment in ["neg", "pos"]:
            score = 1 if sentiment == "pos" else 0

            path = os.path.join(data_dir, split, sentiment)
            file_names = os.listdir(path)
            for f_name in file_names:
                with open(os.path.join(path, f_name), "r") as f:
                    review = f.read()
                    data[split].append([review, score])

    np.random.shuffle(data["train"])        
    data["train"] = pd.DataFrame(data["train"],
                                 columns=['text', 'sentiment'])

    np.random.shuffle(data["test"])
    data["test"] = pd.DataFrame(data["test"],
                                columns=['text', 'sentiment'])

    return data["train"], data["test"]

In [0]:
train, test = data_load("aclImdb")

In [4]:
train.head()

Unnamed: 0,text,sentiment
0,This movie has not aged well. Maybe it's just ...,0
1,The Brain That Wouldn't Die is one awful piece...,0
2,Ed Wood is eclipsed and becomes Orson Welles. ...,0
3,"Billed as a kind of sequel to The Full Monty, ...",0
4,"First things first, Edison Chen did a fantasti...",1


## Preprocessing, Lemmatization and Stemming

In [0]:
y_train = train['sentiment']
X_train = train.drop(['sentiment'], axis=1)

In [0]:
y_test = test['sentiment']
X_test = test.drop(['sentiment'], axis=1)

In [0]:
import re
def preproc(s):
    s = re.sub(r"\\", "", s)    
    s = re.sub(r"\'", "", s)    
    s = re.sub(r"\"", "", s)    
    
    s = s.strip().lower()
    
    filters='!@#$%^&*()_-+={}/\[],.<>\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    s = s.translate(translate_map)

    return s

## Model Training

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(preprocessor=preproc,
                         analyzer='word',
                         stop_words='english',
                         ngram_range=(1,2))

In [0]:
train_feat = tf_idf.fit_transform(X_train['text'])
test_feat = tf_idf.transform(X_test['text'])

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [0]:
model = LinearSVC(C=.4, 
                  loss='squared_hinge',
                  random_state=13)

## Model Evaluation

In [13]:
model.fit(train_feat, y_train)

LinearSVC(C=0.4, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=13, tol=0.0001,
     verbose=0)

In [0]:
pred = model.predict(test_feat)

In [15]:
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, pred)*100))

Accuracy: 88.47%
