In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords

from bs4 import BeautifulSoup 

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

nltk.download('stopwords')
from time import time
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv( "data/testData.tsv", header=0, delimiter="\t", quoting=3)

print("Train: %d, Test: %d\n"
      % (train["review"].size, test["review"].size))

Train: 25000, Test: 25000



In [4]:
def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    text = BeautifulSoup(text).get_text()
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    text = re.sub(r"  ", " ", text)
    
    return(text)

def extract_sentiment(s):
    s = re.sub(r"[^0-9]", " ", s)
    n = int(s.split()[1])
    if n <= 5:
        return 0
    else:
        return 1
train['review'] = train.apply(lambda x: clean_text(x['review']), axis=1)
test['review'] = test.apply(lambda x: clean_text(x['review']), axis=1)
test['sentiment'] = test.apply(lambda x: extract_sentiment(x['id']), axis=1)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                 ngram_range=ngram_range)
    
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    
    return vectorizer, feature_matrix


vectorizer, train_features = build_feature_matrix(documents=train['review'],
                                                  ngram_range=(1,2), 
                                                  min_df=0.0, max_df=1.0)   

test_features = vectorizer.transform(test['review'])
train_rating = np.asarray(train['sentiment'])
test_rating = np.asarray(test['sentiment'])

In [13]:
# best model selection and hyperparameters optimization
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
    
nb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
scores = cross_val_score(nb, train_features, train_rating, cv=5)
print("Cross-validation Accuracy: %0.2f" % (scores.mean()))

lg = LogisticRegression(solver='newton-cg',
                        multi_class='multinomial',
                        C=1,
                        penalty='l2',
                        max_iter=100,
                        random_state=42)
scores = cross_val_score(lg, train_features, train_rating, cv=5)
print("Cross-validation Accuracy: %0.2f" % (scores.mean()))

Cross-validation Accuracy: 0.87
Cross-validation Accuracy: 0.89


In [16]:
# model evaluation and metrics
from sklearn import metrics

def display_evaluation_metrics(true_labels, predicted_labels):
    
    print('Accuracy:', np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        2))
    print('ROC-AUC:', np.round(
                    metrics.roc_auc_score(true_labels, 
                                           predicted_labels),
                        2))
    
def display_model_quality(model, test_features, test_rating):
    predicted_sentiments = model.predict(test_features)
    
    print(model)
    display_evaluation_metrics(true_labels=test_rating,
                               predicted_labels=predicted_sentiments)
nb.fit(train_features, train_rating)
display_model_quality(nb, test_features, test_rating)
print()
lg.fit(train_features, train_rating)
display_model_quality(lg, test_features, test_rating)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy: 0.83
ROC-AUC: 0.83

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.86
ROC-AUC: 0.86
