In [1]:
# Import libraries
import os
import sys
import logging
import pandas as pd
import pandas.io.sql as psql

# psycopg2 for interacting with postgres
try:
    import psycopg2 as pg
    import psycopg2.extras
except:
    print( "Install psycopg2")
    exit(123)

In [2]:
PG_CONN_STRING = "dbname='postgres' port='5432' user='postgres' password='phludphlud'"
dbconn = pg.connect(PG_CONN_STRING)
cursor = dbconn.cursor()

row_count = int(pd.read_sql('SELECT COUNT(*) from review_view', con=dbconn).values)

print(row_count)
chunksize = 50000
review_df = pd.DataFrame(columns = ['text', 'stars'])

# Load review into Pandas DataFrame
for i in range(int(row_count/chunksize) +1):
    query = 'SELECT text, stars from review_view LIMIT {chunksize} OFFSET {offset}'.format(offset = i*chunksize, chunksize=chunksize)
    review_df = review_df.append(pd.read_sql_query(query, con=dbconn))
    print("{} rows have been loaded to dataframe.".format(i*chunksize))

5996996
0 rows have been loaded to dataframe.
50000 rows have been loaded to dataframe.
100000 rows have been loaded to dataframe.
150000 rows have been loaded to dataframe.
200000 rows have been loaded to dataframe.
250000 rows have been loaded to dataframe.
300000 rows have been loaded to dataframe.
350000 rows have been loaded to dataframe.
400000 rows have been loaded to dataframe.
450000 rows have been loaded to dataframe.
500000 rows have been loaded to dataframe.
550000 rows have been loaded to dataframe.
600000 rows have been loaded to dataframe.
650000 rows have been loaded to dataframe.
700000 rows have been loaded to dataframe.
750000 rows have been loaded to dataframe.
800000 rows have been loaded to dataframe.
850000 rows have been loaded to dataframe.
900000 rows have been loaded to dataframe.
950000 rows have been loaded to dataframe.
1000000 rows have been loaded to dataframe.
1050000 rows have been loaded to dataframe.
1100000 rows have been loaded to dataframe.
115000

In [3]:
# Quick preview of the data
print(review_df.head()) 
print(review_df.shape)

                                                text stars
0  Its a shame the staff Dr Agarwal chooses to su...     1
1  I purchased my phone October 2009 and have bee...     1
2  I am originally from NY and grew up eating Spa...     5
3  Don't waste your money getting an interior and...     2
4  On Saturday November 24th, 2012, we visited ou...     1
(5996996, 2)


# A multi-label classifier might be difficult in this case. In future iterations we may test or even train for this, but let's only use most positive and negative reviews for now.

In [4]:
review_df['stars'] = review_df['stars'].astype(int)
print(review_df.dtypes)
filtered_df = review_df[review_df['stars'].isin([1,5])]

print(filtered_df.shape)

text     object
stars     int32
dtype: object
(3494625, 2)


Let's convert stars to a 0-1 scale, which will make it allow us to compare to sigmoid outputs later.

In [5]:
# Convert labels to 0-1 scale (1 star becomes 0, 5 star becomes 1)
def convert_scale(x):    
    if x == 1: x = 0
    if x == 5: x = 1
    return x

filtered_df['stars'] = filtered_df['stars'].apply(convert_scale)
print(filtered_df.head())

                                                text  stars
0  Its a shame the staff Dr Agarwal chooses to su...      0
1  I purchased my phone October 2009 and have bee...      0
2  I am originally from NY and grew up eating Spa...      1
4  On Saturday November 24th, 2012, we visited ou...      0
6  I've been to this Burger King between 5-10 tim...      0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Using all the reviews took too long since I'm training on a CPU, so let's take only 10% of the total filtered data.

In [32]:
print("All 1 and 5 star reviews:", filtered_df.shape)
x = int(filtered_df.shape[0]/10)

print(filtered_df.iloc[0:x].shape)


All 1 and 5 star reviews: (3494625, 2)
(349462, 2)
                                                    text  stars
25676  This place is beautifully decorated and very w...      1
25678  Sherrill came to my rescue after a previous ha...      1
25679  I have been going here for a little over four ...      1
25680  So we walked I today at 2:20 and my appointmen...      0
25681  I stayed here on 2/10/17.   I couldn't find th...      0


Convert to .csv format which is compatible with TorchText.

In [7]:
filtered_df.iloc[0:x].to_csv("filtered2.csv", index=False)

# PyTorch will be used for building our model since it has some nice libraries for NLP 

In [8]:
import torch
from torchtext import data, datasets

SEED = 1337

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Based on bi-grams concept from "FastText" model (Joulin et al., 2016)
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

TEXT = data.Field(tokenize='spacy', preprocessing=generate_bigrams)
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [9]:
datafields = [("text", TEXT), ("stars", LABEL)]
train = data.TabularDataset(
        path = 'filtered2.csv', format='csv', skip_header=True, fields=datafields) 

Split the data into training, validation and test sets, and then generate vocabs.

In [10]:
train, valid, test = train.split(split_ratio=[0.98, 0.01, 0.01])

print('Train length:', len(train))
print('Valid length:', len(valid))
print('Test length:', len(test))

TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

print("Length of TEXT vocab:", len(TEXT.vocab))
print("Length of LABEL vocab:", len(LABEL.vocab))

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

print("Iterators generated.")

Train length: 342473
Valid length: 3494
Test length: 3495
Length of TEXT vocab: 25002
Length of LABEL vocab: 2
Iterators generated.


In [15]:
# Build the neural network based on FastText paper
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    def __init__(self, vocab, embedding_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self,x):
        embedded = self.embedding(x)
        
        embedded = embedded.permute(1,0,2)
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        
        return self.fc(pooled)

print("Input dimensions:", len(TEXT.vocab))
INPUT_DIM = len(TEXT.vocab)

EMBEDDING_DIM = 100
OUTPUT_DIM = 1

Input dimensions: 25002


In [22]:
model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [23]:
# Now we are ready to train the model!
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

device = torch.device('cpu')

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    
    # print(y)
    correct = (rounded_preds == y).float()
    
    accuracy = correct.sum()/len(correct)
    
    return accuracy

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.stars)
        
        acc = binary_accuracy(predictions, batch.stars)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.stars)
            
            acc = binary_accuracy(predictions, batch.stars)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
# Train for a number of epochs
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print('Epoch: {}, Train Loss: {:.3f}, Train Acc: {:.2f}%, Val. Loss: {:.3f}, Val. Acc: {:.2f}%'.format(epoch + 1, train_loss, train_acc * 100, valid_loss, valid_acc * 100))

  return Variable(arr, volatile=not train)


Epoch: 1, Train Loss: 0.225, Train Acc: 90.97%, Val. Loss: 0.189, Val. Acc: 97.09%
Epoch: 2, Train Loss: 0.074, Train Acc: 97.80%, Val. Loss: 0.195, Val. Acc: 97.75%
Epoch: 3, Train Loss: 0.055, Train Acc: 98.33%, Val. Loss: 0.193, Val. Acc: 98.06%
Epoch: 4, Train Loss: 0.046, Train Acc: 98.61%, Val. Loss: 0.198, Val. Acc: 98.28%
Epoch: 5, Train Loss: 0.041, Train Acc: 98.75%, Val. Loss: 0.213, Val. Acc: 98.33%


This accuracy seems quite good, let's test it on the test set.

In [25]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print('Test Loss: {:.3f}, Test Acc: {:.2f}%'.format(test_loss, test_acc*100))

  return Variable(arr, volatile=not train)


Test Loss: 0.257, Test Acc: 98.11%


Great, it seems like test set accuracy is quite close to train and validation accuracy, indicating that we are probably not overfitting much. Just for fun, let's predict sentiment of some custom reviews.

In [50]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = 1.0 - torch.sigmoid(model(tensor))
    return prediction.item()

sentence1 = "I love this restaurant!"
sentence2 = "This place is the worst"
sentence3 = "I'll come here again"
sentence4 = "I'll never come here again"
sentence5 = "This place was great but the service was terrible"

for i in [sentence1, sentence2, sentence3, sentence4, sentence5]:
    print("Sentiment: {:.4f}, Text = {}".format(predict_sentiment(i), i))



Sentiment: 1.0000, Text = I love this restaurant!
Sentiment: 0.0000, Text = This place is the worst
Sentiment: 1.0000, Text = I'll come here again
Sentiment: 0.0000, Text = I'll never come here again
Sentiment: 0.0000, Text = This place was great but the service was terrible
