In [1]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

--2020-06-14 17:41:02--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2020-06-14 17:41:07 (15.7 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    data = {}
    labels = {}
    
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}
        
        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []
            
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)
            
            for f in files:
                with open(f) as review:
                    data[data_type][sentiment].append(review.read())
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)
                    
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)
                
    return data, labels
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [0]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels):
   
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    data_train, labels_train = shuffle(data_train, labels_train)
    data_test, labels_test = shuffle(data_test, labels_test)
 
    return data_train, data_test, labels_train, labels_test

In [4]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


In [5]:
print(train_X[1])
print(train_y[1])

After witnessing his wife (Linda Hoffman) engaging in sexual acts with the pool boy, the already somewhat unstable dentist Dr. Feinstone (Corbin Bernsen) completely snaps which means deep trouble for his patients.<br /><br />This delightful semi-original and entertaining horror flick from director Brian Yuzna was a welcome change of pace from the usual horror twaddle that was passed out in the late Nineties. Although The Dentist' is intended to be a cheesy, fun little film, Yuzna ensures that the movie delivers the shocks and thrills that many more serious movies attempt to dispense. Despite suffering somewhat from the lack of background on the central characters, and thus allowing events that should have been built up to take place over a couple of days, the movie is intriguing, generally well scripted and well paced which allows the viewer to maintain interest, even during the more ludicrous of moments. The Dentist' suffers, on occasion, from dragging but unlike the much inferior 1

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def review_to_words(review):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words
review_to_words(train_X[1])

['wit',
 'wife',
 'linda',
 'hoffman',
 'engag',
 'sexual',
 'act',
 'pool',
 'boy',
 'alreadi',
 'somewhat',
 'unstabl',
 'dentist',
 'dr',
 'feinston',
 'corbin',
 'bernsen',
 'complet',
 'snap',
 'mean',
 'deep',
 'troubl',
 'patient',
 'delight',
 'semi',
 'origin',
 'entertain',
 'horror',
 'flick',
 'director',
 'brian',
 'yuzna',
 'welcom',
 'chang',
 'pace',
 'usual',
 'horror',
 'twaddl',
 'pass',
 'late',
 'nineti',
 'although',
 'dentist',
 'intend',
 'cheesi',
 'fun',
 'littl',
 'film',
 'yuzna',
 'ensur',
 'movi',
 'deliv',
 'shock',
 'thrill',
 'mani',
 'seriou',
 'movi',
 'attempt',
 'dispens',
 'despit',
 'suffer',
 'somewhat',
 'lack',
 'background',
 'central',
 'charact',
 'thu',
 'allow',
 'event',
 'built',
 'take',
 'place',
 'coupl',
 'day',
 'movi',
 'intrigu',
 'gener',
 'well',
 'script',
 'well',
 'pace',
 'allow',
 'viewer',
 'maintain',
 'interest',
 'even',
 'ludicr',
 'moment',
 'dentist',
 'suffer',
 'occas',
 'drag',
 'unlik',
 'much',
 'inferior',
 '19

In [0]:
import pickle

cache_dir = os.path.join("../cache", "sentiment_analysis") 
os.makedirs(cache_dir, exist_ok=True)  

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
 
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  
    if cache_data is None:
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [9]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Wrote preprocessed data to cache file: preprocessed_data.pkl


In [10]:
print(train_X[0])
train_y[0]

['bell', 'book', 'candl', 'releas', 'decemb', '1958', 'featur', 'jame', 'stewart', 'kim', 'novak', 'jack', 'lemmon', 'erni', 'kovak', 'film', 'jame', 'stewart', 'kim', 'novak', 'second', 'screen', 'pair', 'alfr', 'hitchcock', 'classic', 'vertigo', 'releas', 'earlier', 'year', 'stewart', 'last', 'film', 'romant', 'lead', 'deem', 'old', 'age', '50', 'play', 'sort', 'part', 'anymor', 'movi', 'witch', 'play', 'kim', 'novak', 'attract', 'mortal', 'play', 'jame', 'stewart', 'put', 'spell', 'fall', 'head', 'heel', 'love', 'enjoy', 'movi', 'cast', 'movi', 'time', 'moder', 'success', 'nomin', 'golden', 'globe', 'best', 'movi', 'comedi', 'gimmeclass']


1

In [0]:
try1=[]
for i in train_X:
  try1.append(" ".join(i))

try2=[]
for i in test_X:
  try2.append(" ".join(i))

In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib


def extract_BoW_features(words_train, words_test, vocabulary_size=10000,
                         cache_dir=cache_dir, cache_file="bow_features.pkl"):
       
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass  
    if cache_data is None:
        vectorizer = CountVectorizer(max_features=vocabulary_size)
        features_train = vectorizer.fit_transform(words_train).toarray()
        features_test = vectorizer.transform(words_test).toarray()
        if cache_file is not None:
            vocabulary = vectorizer.vocabulary_
            cache_data = dict(features_train=features_train, features_test=features_test,
                             vocabulary=vocabulary)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                joblib.dump(cache_data, f)
            print("Wrote features to cache file:", cache_file)
    else:
        features_train, features_test, vocabulary = (cache_data['features_train'],
                cache_data['features_test'], cache_data['vocabulary'])
    
    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary



In [13]:
train_X, test_X, vocabulary = extract_BoW_features(try1, try2)

Wrote features to cache file: bow_features.pkl


In [0]:
import pandas as pd

val_X = pd.DataFrame(train_X[:10000])
train_X = pd.DataFrame(train_X[10000:])
val_y = pd.DataFrame(train_y[:10000])
train_y = pd.DataFrame(train_y[10000:])

In [15]:
test_X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
data_dir = '../data/sentiment_web_app'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [0]:
pd.DataFrame(test_X).to_csv( ('test.csv'), header=False, index=False)

pd.concat([val_y, val_X], axis=1).to_csv(('validation.csv'), header=False, index=False)
pd.concat([train_y, train_X], axis=1).to_csv(('train.csv'), header=False, index=False)

In [0]:
test_location = pd.read_csv( 'test.csv')
val_location = pd.read_csv('validation.csv')
train_location = pd.read_csv('train.csv')

In [19]:
train_y

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
14995,1
14996,1
14997,1
14998,1


In [0]:
import pandas as pd
import numpy as np
import glob, os, string, re, spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [0]:
LR = LogisticRegression(solver = 'lbfgs', n_jobs = -1)
LR.fit(train_X, train_y)
LR_clf = LR.predict(test_X)

In [24]:
LR.score(train_X, train_y)

0.9948

In [25]:
accuracy_score(test_y, LR_clf)

0.84656

In [0]:
LSVM = LinearSVC()
LSVM.fit(train_X, train_y)
LSVM_clf = LSVM.predict(test_X)

In [27]:
LSVM.score(train_X, train_y)

1.0

In [28]:
accuracy_score(test_y, LSVM_clf)

0.8182

In [0]:
MNB = MultinomialNB()
MNB.fit(train_X, train_y)
MNB_clf = MNB.predict(test_X)

In [30]:
MNB.score(train_X, train_y)

0.8769333333333333

In [31]:
accuracy_score(test_y, MNB_clf)

0.82228

**Trying some other parts also**

In [38]:
!wget -O aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf aclImdb_v1.tar.gz 

--2020-06-14 18:35:52--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-06-14 18:35:56 (17.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
import pandas as pd
import numpy as np
import glob, os, string, re, spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [0]:
train_pos_files = glob.glob("aclImdb/train/pos/*.txt")
train_neg_files = glob.glob("aclImdb/train/neg/*.txt")
train_pos_ls = []

for i in train_pos_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_pos_ls.append(str)
    
train_neg_ls = []
for i in train_neg_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    train_neg_ls.append(str)

In [46]:
labels = ['reveiw', 'label']
df_train_pos = pd.DataFrame()
df_train_pos['review'] = train_pos_ls
df_train_pos['label'] = 1
df_train_neg = pd.DataFrame()
df_train_neg['review'] = train_neg_ls
df_train_neg['label'] = 0
df_train = pd.concat([df_train_pos , df_train_neg])
df_train

Unnamed: 0,review,label
0,I first saw this film around ten years ago and...,1
1,Cosimo (Luis Guzman) ends up in prison for car...,1
2,This movie is a journey through the mind of a ...,1
3,Standard rise to fame tale that has a few high...,1
4,I originally saw this movie as a boy at the ol...,1
...,...,...
12495,A response to previous comments made by reside...,0
12496,A scientist (John Carradine--sadly) finds out ...,0
12497,"Revolution is a terrible movie, I don't care i...",0
12498,It's boggles the mind how this movie was nomin...,0


In [43]:
test_pos_files = glob.glob("aclImdb/test/pos/*.txt")
test_neg_files = glob.glob("aclImdb/test/neg/*.txt")
test_pos_ls = []
for i in test_pos_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    test_pos_ls.append(str)
    
test_neg_ls = []
for i in test_neg_files:
    file = open(i, "r")
    str = file.readline()
    clean = re.compile('<.*?>')
    str = re.sub(clean, ' ', str)
    test_neg_ls.append(str)

labels = ['reveiw', 'label']
df_test_pos = pd.DataFrame()
df_test_pos['review'] = test_pos_ls
df_test_pos['label'] = 1
df_test_neg = pd.DataFrame()
df_test_neg['review'] = test_neg_ls
df_test_neg['label'] = 0
df_test = pd.concat([df_test_pos , df_test_neg])
df_test

Unnamed: 0,review,label
0,My giving this film a score of 8 is relative t...,1
1,Probably because this is Columbia's first film...,1
2,Genius or utter madness? That depends on your ...,1
3,This movie was one of the rolling on the floor...,1
4,1 let's suspend belief for a moment and let's ...,1
...,...,...
12495,I gave this movie 2 instead of 1 just just bec...,0
12496,I want the 99 minutes of my life back that was...,0
12497,"Ineffectual, molly-coddled, self-pitying, lous...",0
12498,This film was sourced from my friends mum who ...,0


In [0]:
df_train.to_csv('train1.csv', index = False)

In [0]:
df_test.to_csv('test2.csv', index=False)

In [61]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:

# Define text pre-processing functions
lemma = WordNetLemmatizer()
stops = set(stopwords.words('english'))

nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
            
def text_prep(text):
    no_punct = [char for char in text if char not in string.punctuation]
    text = "".join(no_punct)
    text = [lemma.lemmatize(text, pos='v') for text in text.lower().split() if text not in stops] 
    text = " ".join(text)
    return (text)

In [66]:
df_train['rev'] = df_train['review'].apply(lambda x:text_prep(x))
df_train[['rev', 'label']].head()

Unnamed: 0,rev,label
0,first saw film around ten years ago think funn...,1
1,cosimo luis guzman end prison car burglary hes...,1
2,movie journey mind screenwriter catch paradoxi...,1
3,standard rise fame tale high point number one ...,1
4,originally saw movie boy old rialto theatre pa...,1


In [67]:
df_test['rev'] = df_test['review'].apply(lambda x:text_prep(x))
df_test[['rev', 'label']].head()

Unnamed: 0,rev,label
0,give film score 8 relative featurelength film ...,1
1,probably columbias first film color color look...,1
2,genius utter madness depend interpretation fil...,1
3,movie one roll floor laugh movies ever see dan...,1
4,1 let suspend belief moment let stop pretend c...,1


In [68]:
tfidf = TfidfVectorizer(max_features = 1000)
x_train = tfidf.fit_transform(df_train['rev'])
y_train = df_train['label']
x_test = tfidf.transform(df_test['rev'])
y_test = df_test['label']
x_train.shape

(25000, 1000)

In [69]:
from keras.models import Sequential
from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense

model = Sequential()
model.add(Dense(256, input_shape=(1000,) , activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(160, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(120, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(80, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               256256    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               51400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 160)               32160     
_________________________________________________________________
dropout_3 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 120)              

In [71]:
model.fit(x_train, y_train, batch_size=128, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fd41f027a58>

In [72]:
loss, accuracy = model.evaluate(x_train, y_train)
print (loss, accuracy)

0.005503535429150798 0.9986000061035156


In [74]:
predictions = model.predict(x_test)
rounded = [round(x[0]) for x in predictions]
predictions = rounded
score = accuracy_score(y_test ,predictions)
print(score)

0.84656
