In [1]:
import pandas as pd
import numpy as np
import os
from wordEmbedders import Word2Vec, WESCScore, WESClassifier, AverageTFIDFClassifier, AverageClassifier
from tqdm.notebook import tqdm
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from prettytable import PrettyTable
tqdm.pandas()

In [2]:
# Names of the datasets to evaluate. The evaluation loop uses each name to build
# paths under ./data/<name>/ and ./models/<name>/.
# Alternative selections (uncomment one to switch):
#datasets = os.listdir('./data')
#datasets = ['Sentiment140']
datasets = ['AirlineTweets']

In [3]:
# Embedding classes to evaluate. The evaluation loop reads `.name` from each entry
# and calls `.load(path)` on it to obtain the trained model.
wordEmbedders = [Word2Vec]
# Classifier class built once per embedder; the evaluation loop calls it as
# classifier(model, positiveWords, negativeWords, tfidf, dct).
#classifier = AverageTFIDFClassifier
classifier = AverageClassifier

In [4]:
# Seed vocabularies handed to the classifier as its positive / negative word lists.
positiveWords = [
    "good", "nice", "cool", "lovely", "wonderful", "great",
    "awesome", "fantastic", "amazing", "fun", "excellent",
]
negativeWords = [
    "bad", "horrible", "terrible", "awful", "worst",
    "shitty", "crappy", "sucks", "hate",
]

In [5]:
# Evaluate every configured embedder on every dataset.
# `out` collects one (dataset name, [(embedder name, result), ...]) tuple per
# dataset; the reporting cell reads it.
out = []
for dataset in tqdm(datasets, desc="Datasets"):
    dataFile   = f'./data/{dataset}/Data-Cleaned.csv'
    outputFile = f'./data/{dataset}/Data-Predicted.csv'
    tfidfFile  = f'./models/{dataset}/TF-IDF.model'
    dictFile   = f'./models/{dataset}/Dictionary.model'

    # Validate the input first so a missing dataset is reported with the
    # intended message instead of whatever error the model loads would raise.
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')

    tfidf = TfidfModel.load(tfidfFile)
    dct   = Dictionary.load(dictFile)

    # Always start from the cleaned data; predictions previously written to
    # outputFile are not reused, so they are recomputed on every run unless the
    # cleaned file itself already carries a prediction column.
    df = pd.read_csv(dataFile)

    embedderOut = []
    for embedder in wordEmbedders:
        columnPredict = f"{embedder.name}_predict"
        columnPredictCorrect = f"{embedder.name}_predict_correct"

        if columnPredict in df:
            # Load previous results: the frame already holds predictions for
            # this embedder, so only re-score them against the ground truth.
            data = pd.DataFrame({
                'truth': df['sentiment'],
                'predicted': df[columnPredict],
            })
            result = WESCScore(data)
        else:
            # Predict results with the trained embedding model.
            modelFile = f'./models/{dataset}/{embedder.name}.model'
            if not os.path.exists(modelFile):
                # Report the embedder's name, not the repr of its class object.
                raise ValueError(f'Dataset {dataset} has no {embedder.name} trained')

            model = embedder.load(modelFile)
            clas = classifier(model, positiveWords, negativeWords, tfidf, dct)
            result = clas.predict(df)
            df[columnPredict] = result.data['predicted']
            df[columnPredictCorrect] = result.correctPredictions

        embedderOut.append((embedder.name, result))

    # Persist the frame (including any newly added prediction columns).
    df.to_csv(outputFile, index=False)
    out.append((dataset, embedderOut))

Datasets:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/11541 [00:00<?, ?it/s]

In [7]:
# Print one table per dataset, with one row per evaluated embedder.
print("Baseline dataset evaluation")
for dataset, results in out:
    table = PrettyTable(['Embeddings', 'Balanced Accuracy', 'F1'])
    # Each entry pairs the embedder's name with its score object.
    for name, result in results:
        table.add_row([name, result.balancedAccuracy, result.f1Score])

    print(f"\n{dataset}:")
    print(table)

Baseline dataset evaluation

AirlineTweets:
+------------+--------------------+---------------------+
| Embeddings | Balanced Accuracy  |          F1         |
+------------+--------------------+---------------------+
|  Word2Vec  | 0.5384148067187106 | 0.16235893702220605 |
+------------+--------------------+---------------------+
