In [11]:
import pandas as pd
import numpy as np
import os
from wordEmbedders import Word2Vec, WESCScore, AverageClassifier, AverageTFIDFClassifier, MinMaxClassifier, MinMaxTFIDFClassifier
from tqdm.notebook import tqdm
from prettytable import PrettyTable, ALL
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): nothing in this notebook calls them directly; presumably the
# classifiers' predict() relies on this registration - confirm before removing.
tqdm.pandas()

In [2]:
# Names of the datasets to evaluate. Each name is used to build the
# ./data/<name>/ and ./models/<name>/ paths in the evaluation cell.
# To evaluate Sentiment140 instead, replace the entry with 'Sentiment140'.
datasets = [
    'AirlineTweets',
]

In [3]:
# Word-embedding classes to evaluate; each must expose `.name` and `.load(path)`,
# which the evaluation cell uses to locate and load the trained model.
wordEmbedders = [
    Word2Vec,
]

In [4]:
# Classifier classes to run on top of every embedding model.
classifiers = [
    AverageClassifier,
    AverageTFIDFClassifier,
    MinMaxClassifier,
    MinMaxTFIDFClassifier,
]

In [5]:
# Seed vocabularies handed to every classifier as its positive / negative word lists.
positiveWords = [
    "good",
    "nice",
    "cool",
    "lovely",
    "wonderful",
    "great",
    "awesome",
    "fantastic",
    "amazing",
    "fun",
    "excellent",
]
negativeWords = [
    "bad",
    "horrible",
    "terrible",
    "awful",
    "worst",
    "shitty",
    "crappy",
    "sucks",
    "hate",
]

In [6]:
# Run every (embedder, classifier) combination against each dataset.
#
# Result layout, consumed by the reporting cell:
#   out = [(dataset, [(embedderName, classifierName, result), ...]), ...]
# where `result` is whatever `classifier.predict(df)` returns.
#
# Raises ValueError when the cleaned data file or a trained embedding model
# is missing for a dataset.
out = []
for dataset in tqdm(datasets, desc="Datasets"):
    dataFile  = f'./data/{dataset}/Data-Cleaned.csv'
    tfidfFile = f'./models/{dataset}/TF-IDF.model'
    dictFile  = f'./models/{dataset}/Dictionary.model'

    # Fail fast: make sure the cleaned data exists before loading any model from disk.
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')

    tfidf = TfidfModel.load(tfidfFile)
    dct   = Dictionary.load(dictFile)
    df    = pd.read_csv(dataFile)

    embedderOut = []
    for embedder in wordEmbedders:
        modelFile = f'./models/{dataset}/{embedder.name}.model'
        if not os.path.exists(modelFile):
            # Use the embedder's short name; interpolating the class itself
            # would print its repr instead of a readable model name.
            raise ValueError(f'Dataset {dataset} has no {embedder.name} trained')
        model = embedder.load(modelFile)

        for classifierClass in classifiers:
            # Keep the class and its instance under separate names so the loop
            # variable is never rebound while iterating.
            classifier = classifierClass(model, positiveWords, negativeWords, tfidf, dct)

            # Predict on the whole dataset and record the outcome for reporting.
            result = classifier.predict(df)
            embedderOut.append((embedder.name, classifier.name, result))

    out.append((dataset, embedderOut))

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

In [12]:
# Render one table per dataset, with one row per (embedder, classifier) pair.
print("Baseline dataset evaluation")

# Column header -> alignment ('l' = left, 'r' = right); insertion order is the column order.
columnAlignment = {
    'Embeddings': 'l',
    'Averaging': 'l',
    'Balanced Accuracy': 'r',
    'F1 Score': 'r',
    'Confusion Matrix': 'r',
}

for dataset, results in out:
    table = PrettyTable(list(columnAlignment))
    for column, side in columnAlignment.items():
        table.align[column] = side
    table.hrules = ALL  # horizontal rule after every row, as in the rendered output

    for name, classifier, result in results:
        table.add_row([
            name,
            classifier,
            '%.5f' % result.balancedAccuracy,
            '%.5f' % result.f1Score,
            result.confusionMatrix,
        ])

    print(f"\n{dataset}:")
    print(table)

Baseline dataset evaluation

AirlineTweets:
+------------+--------------+-------------------+----------+------------------+
| Embeddings | Averaging    | Balanced Accuracy | F1 Score | Confusion Matrix |
+------------+--------------+-------------------+----------+------------------+
| Word2Vec   | Average      |           0.53841 |  0.16236 |       223 | 161  |
|            |              |                   |          |      -----+----- |
|            |              |                   |          |      2140 | 9017 |
+------------+--------------+-------------------+----------+------------------+
| Word2Vec   | AverageTFIDF |           0.48133 |  0.31813 |      1929 | 7835 |
|            |              |                   |          |      -----+----- |
|            |              |                   |          |       434 | 1343 |
+------------+--------------+-------------------+----------+------------------+
| Word2Vec   | MinMax       |           0.48115 |  0.32461 |      2121 | 858