In [None]:
import pandas as pd
import numpy as np
import os
import gensim
import nltk
import pickle
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
from tpot import TPOTClassifier
from datetime import datetime
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import SCORERS, accuracy_score, f1_score
import h2o
from h2o.automl import H2OAutoML

# English stop words from NLTK; create_bow_and_features() drops any term found in this set.
stop_words = set(nltk.corpus.stopwords.words('english'))
# strptime format used to parse the date field of each article line.
DATE_FORMAT = "%Y-%m-%d"
# Reference date: article dates are turned into "days since MIN_DATE".
# NOTE(review): presumably the earliest publication date in the corpus -- confirm.
MIN_DATE = datetime.strptime('2009-07-07', DATE_FORMAT)

In [None]:
def get_terms(terms_file):
    """Build a term-id -> term lookup from an iterable of ``"<id>,<term>"`` lines.

    Args:
        terms_file: Iterable of text lines (e.g. an open file). Every non-blank
            line holds an integer id, a comma, and the term text.

    Returns:
        dict mapping the ``int`` id to the term string (the text between the
        first comma and the first newline).

    Raises:
        ValueError: If a non-blank line has no comma or a non-integer id.
    """
    term_dict = {}

    for line in terms_file:
        # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
        if not line.strip():
            continue

        # Split on the first comma only, so a term that itself contains a comma
        # is kept whole instead of breaking the two-value unpacking.
        key, word = line.split(",", 1)
        term_dict[int(key)] = word.split("\n")[0]

    return term_dict

def get_labels(labels_file):
    """Turn per-annotator votes into one majority label per article.

    Each non-blank line is comma-separated; the first field is the integer
    article id and the third field carries the vote text. A vote containing
    ``'negative'`` counts as class 0, one containing ``'irrelevant'`` as
    class 1, and anything else as class 2 (positive).

    Votes are tallied per article id, so the lines of one article do not need
    to be adjacent in the file. Articles keep the order in which they are
    first seen (relies on insertion-ordered dicts).

    Args:
        labels_file: Iterable of annotation lines (e.g. an open file).

    Returns:
        Tuple ``(labels, article_dict)``: ``labels`` is a list with the winning
        class index per article (ties resolve to the lowest index, as
        ``np.argmax`` returns the first maximum); ``article_dict`` maps every
        annotated article id to ``True``.
    """
    votes_by_article = {}

    for line in labels_file:
        # Skip blank lines instead of failing on int('').
        if not line.strip():
            continue

        split_line = line.split(",")
        article_id = int(split_line[0])

        # index 0 = negative, 1 = irrelevant, 2 = positive
        votes = votes_by_article.setdefault(article_id, [0, 0, 0])

        if 'negative' in split_line[2]:
            votes[0] += 1
        elif 'irrelevant' in split_line[2]:
            votes[1] += 1
        else:
            votes[2] += 1

    labels = [np.argmax(vote_counts) for vote_counts in votes_by_article.values()]
    article_dict = {article_id: True for article_id in votes_by_article}

    return labels, article_dict

def create_bow_and_features(word_list, term_dict):
    """Parse one comma-separated article line into a bag-of-words plus metadata.

    The first three fields are read as article id, date (in ``DATE_FORMAT``)
    and provider; every remaining field is a ``term_id:frequency`` pair.

    Args:
        word_list: The raw article line, as a single string.
        term_dict: Mapping of integer term id to term text, used to discard
            terms that appear in ``stop_words``.

    Returns:
        Tuple ``(doc, article_id, days, provider)``: ``doc`` is a list of
        ``(term_id, frequency)`` int pairs without stop words, ``article_id``
        and ``provider`` are the raw string fields, and ``days`` is the number
        of days between the article date and ``MIN_DATE``.
    """
    fields = word_list.split(",")

    doc = []
    # Fields 0-2 are metadata; the term:frequency pairs start at index 3.
    for pair in fields[3:]:
        term_id, count = pair.split(":")
        term_id = int(term_id)

        if term_dict[term_id] in stop_words:
            continue

        doc.append((term_id, int(count)))

    days_since_min = (datetime.strptime(fields[1], DATE_FORMAT) - MIN_DATE).days
    return doc, fields[0], days_since_min, fields[2]

def preprocess_datasets(articles_dir, terms_dir, annotations_dir, preload_lda=True):
    """Load the corpus files and build the feature matrix and label vector.

    Steps: read the term dictionary, parse every article line into a
    bag-of-words plus metadata, compute per-document LDA topic probabilities,
    keep only the articles that have annotations, and one-hot encode the
    publisher.

    Args:
        articles_dir: Path to the articles file (one article per line).
        terms_dir: Path to the ``id,term`` dictionary file.
        annotations_dir: Path to the annotations (votes) file.
        preload_lda: When exactly ``False`` an LDA model is trained on the
            corpus and saved to ``lda/irish_sentiment/model5.gensim``; for any
            other value the model is loaded from that path.

    Returns:
        Tuple ``(features, labels)``: ``features`` is a DataFrame indexed from
        0 holding ``date_in_days``, the topic-probability columns (named
        ``'0'`` .. ``'29'``) and the publisher dummy columns; ``labels`` is a
        NumPy array of the labels returned by ``get_labels``.
    """
    num_topics = 30
    lda_path = 'lda/irish_sentiment/model5.gensim'

    # Context managers close every input file, even if parsing raises.
    with open(terms_dir) as terms_file:
        term_dict = get_terms(terms_file)

    dataset = []
    features = []

    with open(articles_dir) as articles_file:
        for line in articles_file:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            article, article_id, date_in_days, provider = create_bow_and_features(line, term_dict)
            dataset.append(article)
            features.append([article_id, provider, date_in_days])

    features = pd.DataFrame(features)

    if preload_lda is False:
        ldamodel = gensim.models.ldamodel.LdaModel(dataset, num_topics=num_topics, passes=15)
        # Create the target directory first so a finished training run is not
        # lost to a missing folder at save time.
        os.makedirs(os.path.dirname(lda_path), exist_ok=True)
        ldamodel.save(lda_path)
    else:
        ldamodel = gensim.models.ldamodel.LdaModel.load(lda_path)

    # One dense row of topic probabilities per document; topics the model does
    # not report for a document stay at 0.0.
    topic_features = []

    for doc in dataset:
        row = [0.0] * num_topics

        for topic, prob in ldamodel[doc]:
            row[topic] = prob

        topic_features.append(row)

    topic_features = pd.DataFrame(topic_features)

    features = pd.concat([features, topic_features], axis=1)

    # Topic columns are named with strings on purpose: this matches what the
    # former np.concatenate of str and int names produced, without relying on
    # NumPy's deprecated number-to-string promotion.
    features.columns = ['id', 'publisher', 'date_in_days'] + [str(i) for i in range(num_topics)]

    with open(annotations_dir) as labels_file:
        labels, article_dict = get_labels(labels_file)

    # Keep only the articles that received at least one annotation.
    # NOTE(review): labels follow the annotation file's article order while the
    # rows follow the article file's order -- confirm both files list the
    # articles in the same order.
    annotated = features['id'].map(int).isin(list(article_dict))
    features = features[annotated].drop('id', axis=1)
    features = pd.get_dummies(features)
    features = features.reset_index(drop=True)

    return features, np.array(labels)

In [None]:
# Locations of the Irish sentiment corpus files, relative to the notebook's
# working directory. os.path.join keeps the paths valid on every OS; the
# former backslash-separated literals only resolved on Windows.
data_dir = os.path.join('datasets', 'irish_sentiment')
terms_dir = os.path.join(data_dir, 'sentiment_all_terms.csv')
articles_dir = os.path.join(data_dir, 'sentiment_all_articles.csv')
annotations_dir = os.path.join(data_dir, 'sentiment_all_annotations.csv')

# preload_lda=True loads the saved LDA model instead of retraining it.
features, labels = preprocess_datasets(articles_dir, terms_dir, annotations_dir, preload_lda=True)


In [None]:
# TPOT Testing
# 5-fold cross-validation of a TPOT-searched pipeline. `kf` is reused by the
# H2O, SVM and Random Forest cells so every model is scored on the same folds.
kf = KFold(5, shuffle=True, random_state=42)

acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, y_train = features.iloc[train_ind], labels[train_ind]
    X_val, y_val = features.iloc[val_ind], labels[val_ind]

    # Seed the pipeline search so reruns are comparable (within the limits of
    # the wall-clock budget, which can still cut the search at different points).
    tpot = TPOTClassifier(max_time_mins=60, verbosity=2, random_state=42)
    tpot.fit(X_train, y_train)
    clf = tpot.fitted_pipeline_

    # Predict once and score with the metric functions directly; this matches
    # the 'accuracy' and 'f1_macro' scorers without the SCORERS registry.
    y_pred = clf.predict(X_val)
    acc.append(accuracy_score(y_val, y_pred))
    f1.append(f1_score(y_val, y_pred, average='macro'))

In [None]:
# Per-fold accuracy and macro-F1 currently held in `acc` and `f1`.
# The former `print(neg_loss)` was removed: `neg_loss` is never assigned in this
# notebook, so the cell raised NameError on a fresh kernel.
print(acc)
print(f1)

In [None]:
# H2O Testing
h2o.init()

# Prefix each label with "c" so the target column holds strings.
# NOTE(review): presumably so H2O parses the target as categorical rather
# than numeric -- confirm.
str_labels = ["c{}".format(x) for x in labels]
class_frame = pd.DataFrame({"class": str_labels}).astype(str)

# Features plus the string target as the last column, in pandas and in H2O form.
combined = pd.concat([features, class_frame], axis=1)
h2o_dataset = h2o.H2OFrame(combined)
h2o_dataset

In [None]:
# Predictors are every column except the last; the last column is the target.
h2o_columns = h2o_dataset.columns
x, y = h2o_columns[:-1], h2o_columns[-1]

In [None]:
# 5-fold cross-validation of the H2O AutoML leader model.
acc = []
f1 = []

# Split on the pandas frame that is actually indexed below; it has the same
# number of rows as the H2OFrame, so the folds are unchanged.
# Every fold is evaluated: the earlier `if counter != 0` guard skipped the
# first fold, so only four of the five folds were scored.
# NOTE(review): the guard looked like a leftover from resuming an interrupted
# run -- confirm.
for counter, (train_ind, val_ind) in enumerate(kf.split(combined)):
    print("> Iteration", counter)

    train = h2o.H2OFrame(combined.iloc[train_ind])
    val = h2o.H2OFrame(combined.iloc[val_ind])
    val_labels = combined.iloc[val_ind]['class']

    aml = H2OAutoML(max_runtime_secs=3600, seed=1)
    aml.train(x=x, y=y, training_frame=train)

    # Column 0 of the prediction frame is the predicted class; flatten it to a
    # 1-D array so the metrics receive a plain label vector.
    pred = aml.leader.predict(val)
    pred = np.ravel(h2o.as_list(pred[:, 0]))

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc.append(accuracy_score(val_labels, pred))
    f1.append(f1_score(val_labels, pred, average='macro'))
    

In [None]:
# SVM Testing
# Linear-kernel SVM scored on the folds produced by `kf`.
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, X_val = features.iloc[train_ind], features.iloc[val_ind]
    y_train, y_val = labels[train_ind], labels[val_ind]

    svm = SVC(kernel='linear')
    svm.fit(X_train, y_train)

    # Same values as the 'accuracy' and 'f1_macro' scorers, from one predict call.
    y_pred = svm.predict(X_val)
    acc.append(accuracy_score(y_val, y_pred))
    f1.append(f1_score(y_val, y_pred, average='macro'))

In [None]:
# Random Forest Testing
# Default random forest scored on the folds produced by `kf`.
acc = []
f1 = []

for train_ind, val_ind in kf.split(features, labels):
    X_train, y_train = features.iloc[train_ind], labels[train_ind]
    X_val, y_val = features.iloc[val_ind], labels[val_ind]

    # Fixed seed so the reported scores are reproducible across reruns.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)

    # Predict once and score with the metric functions directly; this matches
    # the 'accuracy' and 'f1_macro' scorers.
    y_pred = rf.predict(X_val)
    acc.append(accuracy_score(y_val, y_pred))
    f1.append(f1_score(y_val, y_pred, average='macro'))