In [1]:
# add scripts/ folder to path
import os, sys

# DSX_PROJECT_DIR must be set in the environment (KeyError otherwise); its
# scripts/ sub-folder is placed at the front of the module search path.
SCRIPTS_PATH = os.environ['DSX_PROJECT_DIR'] + '/scripts'
sys.path[:0] = [SCRIPTS_PATH]

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import visualization # custom script

## Load Dataset

In [3]:
# Dataset root directory; the train and test splits are the sub-directories
# directly beneath it (both paths keep their trailing slash).
DATASET_PATH = "/user-home/libraries/text-analytics/datasets/aclImdb-small"
TRAIN_PATH, TEST_PATH = DATASET_PATH + "/train/", DATASET_PATH + "/test/"

In [4]:
from sklearn.datasets import load_files

In [5]:
# Read the training split from disk with scikit-learn's load_files.
reviews_train = load_files(container_path=TRAIN_PATH)

In [6]:
# Unpack the Bunch returned by load_files: .data holds the training texts,
# .target the training labels. Then show the container type, the number of
# documents and one sample document.
text_train = reviews_train.data
y_train = reviews_train.target
print(
    "\n".join([
        "type of text_train: {}".format(type(text_train)),
        "length of text_train: {}".format(len(text_train)),
        "text_train[6]:\n{}".format(text_train[6]),
    ])
)

type of text_train: <class 'list'>
length of text_train: 2000
text_train[6]:
b"When I first saw this movie I was with my dad. He encouraged me to watch this movie because it was one of his favorites. After watching the movie it instantly became one of my favorites. <br /><br />A River Runs Through It is about two brothers who each take a different path in life. Norman Maclean (Craig Sheffer) is the older of the two brothers and he is set on the path of education. Paul Maclean (Brad Pitt) is the rebellious younger brother who travels on a path full of obstacles. The movie follows these characters as the each follow their own path.<br /><br />There is no downside to this movie. You will be entertained the whole way through. The acting, directing, and script is all perfect. The two things that are exceptional are the cinematography and the score. Both of which entrap you in the world Robert Redford creates for you. <br /><br />This is an all around great movie that is destined to be a cla

In [7]:
# Read the test split from disk with scikit-learn's load_files.
reviews_test = load_files(container_path=TEST_PATH)

In [8]:
# Pull the documents and labels out of the test Bunch, then report how many
# documents there are and how many fall under each label value.
text_test = reviews_test.data
y_test = reviews_test.target
print(
    "\n".join([
        "Number of documents in test data: {}".format(len(text_test)),
        "Samples per class (test): {}".format(np.bincount(y_test)),
    ])
)

Number of documents in test data: 2000
Samples per class (test): [1000 1000]


In [9]:
# Replace the HTML line-break markup with a single space and decode the raw
# bytes to str (UTF-8).
# The comprehensions read from the untouched load_files results
# (reviews_train.data / reviews_test.data) instead of from text_train /
# text_test themselves: those names are rebound here (to lists of str) and
# again further down (to DataFrames), so deriving them from themselves made
# this cell fail or misbehave when re-run. On a fresh top-to-bottom run the
# result is unchanged.
text_train = [doc.replace(b"<br />", b" ").decode('utf-8') for doc in reviews_train.data]
text_test = [doc.replace(b"<br />", b" ").decode('utf-8') for doc in reviews_test.data]

In [10]:
# Wrap each list of cleaned documents in a one-column DataFrame whose single
# column is named "review".
text_train = pd.DataFrame(text_train, columns=["review"])
text_test = pd.DataFrame(text_test, columns=["review"])

In [11]:
# Persist the training reviews as CSV (path is relative to the current working
# directory). With pandas' default index=True the row index is written as the
# first column; the labels in y_train are not part of this frame.
text_train.to_csv(path_or_buf='../datasets/train.csv')

In [12]:
# Count how many training documents carry each label value and report it.
train_class_counts = np.bincount(y_train)
print("Samples per class (training): {}".format(train_class_counts))

Samples per class (training): [1000 1000]


## Load Pre-Trained Models

In [17]:
import sys, os, pickle
import pandas as pd
from sklearn.externals import joblib

# Both model files are located at <DSX_PROJECT_DIR>/models/<name>/<version>/model.
# os.environ[...] is used for DSX_PROJECT_DIR (as for SCRIPTS_PATH at the top of
# the notebook) so that an unset variable fails with a clear KeyError instead of
# a "NoneType + str" TypeError.
# NOTE(review): DSX_MODEL_NAME and DSX_MODEL_VERSION are consulted for *both*
# paths; if either variable is set, the topic and BOW paths share that
# name/version and may resolve to the same file - confirm this is intended.
topic_model_path = os.environ["DSX_PROJECT_DIR"] + os.path.join(
    "/models",
    os.getenv("DSX_MODEL_NAME", "simple-topic-modeling"),
    os.getenv("DSX_MODEL_VERSION", "3"),
    "model",
)
bow_model_path = os.environ["DSX_PROJECT_DIR"] + os.path.join(
    "/models",
    os.getenv("DSX_MODEL_NAME", "simple-bow"),
    os.getenv("DSX_MODEL_VERSION", "2"),
    "model",
)

# Open the files in `with` blocks so the handles are closed once loading is
# done (the previous bare open() calls were never closed).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
with open(topic_model_path, 'rb') as topic_model_file:
    topic_pipe = joblib.load(topic_model_file)
with open(bow_model_path, 'rb') as bow_model_file:
    bow_classifier = joblib.load(bow_model_file)

## Train Topic Model Based Classifier

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline

# Chain the loaded topic pipeline with a default LogisticRegression, score the
# combination with 5-fold cross-validation on the training data, and print the
# mean of the fold scores.
topic_classifier = make_pipeline(topic_pipe, LogisticRegression())
scores = cross_val_score(estimator=topic_classifier, X=text_train, y=y_train, cv=5)
print("Mean cross-validation accuracy: {:.5f}".format(scores.mean()))

Mean cross-validation accuracy: 0.69100


## Merge Topic and BOW Classifiers  

In [19]:
# Combine the BOW and topic classifiers in a soft-voting ensemble (soft voting
# works on the members' predicted class probabilities - see the scikit-learn
# VotingClassifier docs) and score it with 5-fold cross-validation.
ensemble_members = [('bow', bow_classifier), ('topic', topic_classifier)]
ensemble_classifier = VotingClassifier(estimators=ensemble_members, voting='soft')
scores_ensemble = cross_val_score(estimator=ensemble_classifier, X=text_train, y=y_train, cv=5)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [20]:
# Score the BOW classifier on its own with the same 5-fold setup, then print
# its mean fold score next to that of the ensemble computed in the previous cell.
scores_bow = cross_val_score(estimator=bow_classifier, X=text_train, y=y_train, cv=5)

print("Mean cross-validation accuracy BOW: {:.5f}".format(scores_bow.mean()))
print("Mean cross-validation accuracy ensemble: {:.5f}".format(scores_ensemble.mean()))

Mean cross-validation accuracy BOW: 0.86650
Mean cross-validation accuracy ensemble: 0.86850
