In [14]:
## importing all the libraries needed

from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd
import re
from io import StringIO
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer

In [2]:
vocabulary_size = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [3]:
print('---review---')
print(X_train[6])
print('---label---')
print(y_train[6])

---review---
[1, 2, 365, 1234, 5, 1156, 354, 11, 14, 2, 2, 7, 1016, 2, 2, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 2, 1117, 1831, 2, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 2, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 2, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 2, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
---label---
1


In [4]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
---review with words---
['the', 'and', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'and', 'and', 'br', 'villain', 'and', 'and', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'and', 'concept', 'issue', 'and', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'and', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', 'and', 'and', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'and', 'things', 'is', 'far', 'this', 'make', 'mistakes', 'and', 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'and', 'movies', 'get', 'are', 'and', 'br', 'yes', 'female', 'just', '

In [5]:
print('Maximum review length: {}'.format(
len(max((X_train + X_test), key=len))))

Maximum review length: 2697


In [6]:
print('Minimum review length: {}'.format(
len(min((X_test + X_test), key=len))))

Minimum review length: 14


In [7]:
from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

In [8]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

In [10]:
batch_size = 64
num_epochs = 3

X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Train on 24936 samples, validate on 64 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f78df4ed4a8>

In [23]:
class classifier:
    
    def __init__(self):
        """
        Initializes the class with the right classifier attribute depending on the type of classifier
        """
        self.vec = Tokenizer()
        embedding_size=32
        model=Sequential()
        model.add(Embedding(vocabulary_size, embedding_size))
        model.add(LSTM(100))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])
        self.clf = model

        
    def _read(self, documents):
        """
        Reads and combines all the documents in one big pandas data frame
        """
        data = []
        X,Y = [], []
        for document in documents:
            d_ata = pd.read_csv(document, sep='\t', names=['review','label'])
            data.append(d_ata)
        data = pd.concat(data)
        self.data = data
        Y = data.label
        self.vec.fit_on_texts(data.review)
        X = self.preprocess(data)
        
        return train_test_split(X,Y)
    
    def preprocess(self, data_t):
        """
        Preprocesses the text data by turning it into frequency tables
        Does a few normalization steps (lowercasing, removing stopwords ...) if self.normalize = true
        """
        
        return self.vec.texts_to_sequences(data_t.review)
    
    def train(self, documents):
        """
        Calls the train function
        Trains the classifier object
        """
        X_train, X_test, Y_train, Y_test =  self._read(documents)  
        
        batch_size = 64
        num_epochs = 3
                
        self.clf.fit(np.asarray(X_train), Y_train, validation_data=(np.asarray(X_test), Y_test), batch_size=batch_size, epochs=num_epochs)
        
        
    def predict(self, sentence):
        """
        Predicts for a sentence
        """
        data = pd.read_csv(StringIO(sentence), names=['review'])
        X = self.preprocess(data)
        Y = self.clf.predict_proba(X)
        
        return np.argmax(Y)
    
    def test_file(self, file_name):
        """
        Tests with a file and outputs a file of labels
        """
        labels = []
        with open(file_name) as f:
            for line in f.readlines():
                print(line,self.predict(line))
                labels.append(self.predict(line))
        
        with open('test_results.txt', 'w') as f:
            for label in labels:
                f.write(str(label)+"\n")
                
        print ("Results from ",file_name," printed to: output.txt")
                

In [24]:
my_clf = classifier()
my_clf.train(["../project1/sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "../project1/sentiment_labelled_sentences/imdb_labelled.txt",
                  "../project1/sentiment_labelled_sentences/yelp_labelled.txt"])

Train on 2061 samples, validate on 687 samples
Epoch 1/3


ValueError: setting an array element with a sequence.