In [None]:
import pandas as pd
import numpy as np
import os
import gensim
import spacy
import nltk
import pickle
from gensim import corpora
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from spacy.lang.en import English
from tpot import TPOTClassifier
from datetime import datetime
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import SCORERS, accuracy_score, f1_score
import h2o
from h2o.automl import H2OAutoML

# Date constants (not referenced by the cells visible in this notebook section).
DATE_FORMAT = "%Y-%m-%d"
MIN_DATE = datetime.strptime('2009-07-07', DATE_FORMAT)

# spaCy English language object; `tokenize` calls it to split raw text into tokens.
parser = English()
# NLTK English stop words held in a set so the membership test in
# `prepare_text_for_lda` is a hash lookup.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
def tokenize(text):
    """Split ``text`` into normalised tokens with the module-level ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``'@'`` are replaced by
    ``'SCREEN_NAME'`` and every other token is lower-cased.

    Args:
        text: Raw document string.

    Returns:
        list of str: The normalised tokens in document order.
    """
    def _normalise(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

def get_lemma(word):
    """Return ``wn.morphy(word)``, falling back to ``word`` when it yields ``None``.

    Args:
        word: Token to look up.

    Returns:
        The value returned by ``wn.morphy`` if it is not ``None``; otherwise
        the unchanged ``word``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def prepare_text_for_lda(text):
    """Turn a raw document into the token list fed to the LDA pipeline.

    The text is tokenised with ``tokenize``; tokens of four characters or
    fewer and tokens present in ``stop_words`` are discarded, and each
    surviving token is passed through ``get_lemma``.

    Args:
        text: Raw document string.

    Returns:
        list of str: Filtered, lemmatised tokens in document order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in stop_words
    ]

def preprocess_dataset(dataset_dir, load_vocab_and_lda=True,
                       vocab_path='lda/imdb_reviews/vocab.pkl',
                       lda_path='lda/imdb_reviews/model5.gensim',
                       num_topics=30):
    """Convert a CSV of labelled texts into LDA topic-probability features.

    The CSV must contain a ``'text'`` column (documents) and a ``'sentiment'``
    column (labels). Each document is tokenised with ``prepare_text_for_lda``,
    converted to a bag-of-words vector and scored by an LDA model; the
    per-topic probabilities become the feature row for that document.

    Args:
        dataset_dir: Path of the CSV file to read.
        load_vocab_and_lda: When true, load the dictionary and LDA model from
            ``vocab_path`` / ``lda_path``. When false, fit both on this
            dataset and save them to those paths.
        vocab_path: Pickle file holding the gensim ``Dictionary``.
        lda_path: File holding the gensim ``LdaModel``.
        num_topics: Number of topics used when a new model is trained. It is
            ignored when a model is loaded; the loaded model's own topic
            count determines the feature width.

    Returns:
        tuple: ``(topic_features, labels)`` where ``topic_features`` is a
        ``pandas.DataFrame`` with one row per document and one column per
        topic (topics the model does not report for a document stay at
        ``0.0``), and ``labels`` is the ``'sentiment'`` column.
    """
    dataset = pd.read_csv(dataset_dir)
    features = dataset['text']
    labels = dataset['sentiment']

    tokenized_samples = [prepare_text_for_lda(row) for row in features]

    if load_vocab_and_lda:
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only point vocab_path at artefacts produced by this notebook.
        with open(vocab_path, 'rb') as vocab_file:
            vocabulary = pickle.load(vocab_file)
    else:
        vocabulary = corpora.Dictionary(tokenized_samples)
        # Create the target directory up front so the save cannot fail on a
        # missing folder.
        os.makedirs(os.path.dirname(vocab_path) or '.', exist_ok=True)
        with open(vocab_path, 'wb') as vocab_file:
            pickle.dump(vocabulary, vocab_file)

    corpus = [vocabulary.doc2bow(doc) for doc in tokenized_samples]

    if load_vocab_and_lda:
        ldamodel = gensim.models.ldamodel.LdaModel.load(lda_path)
    else:
        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus, id2word=vocabulary, num_topics=num_topics, passes=15)
        os.makedirs(os.path.dirname(lda_path) or '.', exist_ok=True)
        ldamodel.save(lda_path)

    # Size the rows from the model itself: a loaded model may have been
    # trained with a topic count different from `num_topics`.
    n_topics = ldamodel.num_topics

    topic_features = []
    for doc in corpus:
        row = [0.0] * n_topics
        for topic, prob in ldamodel[doc]:
            row[topic] = prob
        topic_features.append(row)

    # Explicit columns keep the frame n_topics wide even for an empty dataset.
    topic_features = pd.DataFrame(topic_features, columns=range(n_topics))

    return topic_features, labels

In [None]:
# CSV file that holds the training texts and their sentiment labels.
dataset_dir = 'datasets/imdb_reviews/train.csv'

# Topic-probability features and labels (default settings: the saved
# vocabulary and LDA model are loaded rather than re-trained).
features, labels = preprocess_dataset(dataset_dir=dataset_dir)

In [None]:
# H2O Testing
# Initialise H2O before the following cells build H2OFrames and run H2OAutoML.
h2o.init()


In [None]:
# Prefix every label with "c" so the response column holds strings rather
# than the raw label values.
str_labels = ["c" + str(label) for label in labels]
class_column = pd.DataFrame(str_labels, columns=["class"]).astype(str)

# Topic features followed by the string response as the last column.
combined = pd.concat([features, class_column], axis=1)
h2o_dataset = h2o.H2OFrame(combined)
h2o_dataset

In [None]:
# Response: the last column of the frame ("class" was concatenated last).
y = h2o_dataset.columns[-1]
# Predictors: every column before it.
x = h2o_dataset.columns[:-1]

In [None]:
# 5-fold cross-validation of H2O AutoML. `kf` is reused by the model cells below.
kf = KFold(5, shuffle=True, random_state=42)

acc = []
f1 = []

# Split on the pandas frame: the fold indices are applied to `combined`, so
# the splitter should see the same object instead of inspecting the H2OFrame.
for counter, (train_ind, val_ind) in enumerate(kf.split(combined)):
    print("> Iteration", counter)
    train = h2o.H2OFrame(combined.iloc[train_ind])
    val = h2o.H2OFrame(combined.iloc[val_ind])
    val_labels = combined.iloc[val_ind]['class']

    # Force a categorical response so AutoML treats the task as classification
    # regardless of how the column type was inferred on upload.
    train[y] = train[y].asfactor()

    aml = H2OAutoML(max_runtime_secs=3600, seed=1)
    aml.train(x=x, y=y, training_frame=train)
    pred = aml.leader.predict(val)
    # Column 0 of the prediction frame is the predicted class label.
    pred = h2o.as_list(pred[:, 0])

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc.append(accuracy_score(val_labels, pred))
    f1.append(f1_score(val_labels, pred, average='macro'))
    

In [None]:
# TPOT Testing
# Cross-validate TPOT on the topic features, reusing the `kf` splitter.
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    # Fold indices are positions, so index both objects positionally.
    X_train, y_train = features.iloc[train_ind], labels.iloc[train_ind]
    X_val, y_val = features.iloc[val_ind], labels.iloc[val_ind]

    # Seeded for repeatability; a time-limited search can still vary with
    # machine speed.
    tpot = TPOTClassifier(max_time_mins=60, verbosity=2, random_state=42)
    tpot.fit(X_train, y_train)
    clf = tpot.fitted_pipeline_

    # Predict once and score with the metric functions directly instead of
    # the deprecated `SCORERS` registry.
    val_pred = clf.predict(X_val)
    acc.append(accuracy_score(y_val, val_pred))
    f1.append(f1_score(y_val, val_pred, average='macro'))

In [None]:
# SVM Testing
# Cross-validate a linear-kernel SVM on the topic features, reusing `kf`.
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    # Fold indices are positions, so index both objects positionally.
    X_train, y_train = features.iloc[train_ind], labels.iloc[train_ind]
    X_val, y_val = features.iloc[val_ind], labels.iloc[val_ind]

    svm = SVC(kernel='linear')
    svm.fit(X_train, y_train)

    # Predict once and score with the metric functions directly instead of
    # the deprecated `SCORERS` registry.
    val_pred = svm.predict(X_val)
    acc.append(accuracy_score(y_val, val_pred))
    f1.append(f1_score(y_val, val_pred, average='macro'))

In [None]:
# Random Forest Testing
# Cross-validate a random forest on the topic features, reusing `kf`.
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    # Fold indices are positions, so index both objects positionally.
    X_train, y_train = features.iloc[train_ind], labels.iloc[train_ind]
    X_val, y_val = features.iloc[val_ind], labels.iloc[val_ind]

    # Fixed seed so the scores are repeatable across runs.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)

    # Predict once and score with the metric functions directly instead of
    # the deprecated `SCORERS` registry.
    val_pred = rf.predict(X_val)
    acc.append(accuracy_score(y_val, val_pred))
    f1.append(f1_score(y_val, val_pred, average='macro'))