In [19]:
import pandas as pd
import numpy as np
import os
from wordEmbedders import Word2Vec, WESCScore, WESClassifier
from tqdm.notebook import tqdm
from prettytable import PrettyTable
# Enable tqdm's pandas integration (registers the progress_* variants such as
# progress_apply on pandas objects).
tqdm.pandas()

In [2]:
# Each sub-directory of ./data is one dataset. os.listdir returns entries in
# arbitrary order and also lists plain files and hidden folders
# (e.g. .DS_Store, .ipynb_checkpoints), so keep only visible directories and
# sort them for a reproducible processing order.
datasets = sorted(
    entry for entry in os.listdir('./data')
    if not entry.startswith('.') and os.path.isdir(os.path.join('./data', entry))
)

In [3]:
# Embedders to evaluate. The prediction loop reads `.name` and calls `.load()`
# on every entry, so anything added here must provide both.
wordEmbedders = [
    Word2Vec,
]

In [4]:
# Word lists handed to WESClassifier together with the embedding model.
positiveWords = [
    "good", "nice", "cool", "lovely", "wonderful", "great",
    "awesome", "fantastic", "amazing", "fun", "excellent",
]
negativeWords = [
    "bad", "horrible", "terrible", "awful", "worst",
    "shitty", "crappy", "sucks", "hate",
]

In [22]:
# Score every embedder on every dataset, reusing predictions already stored in
# Data-Predicted.csv where they exist. `out` ends up holding one
# (dataset, [(embedder name, result), ...]) tuple per dataset.
out = []
for dataset in tqdm(datasets, desc="Datasets"):
    dataFile   = f'./data/{dataset}/Data-Cleaned.csv'
    outputFile = f'./data/{dataset}/Data-Predicted.csv'
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')

    # Prefer the file that already carries earlier predictions so finished
    # embedders are not predicted again.
    df = pd.read_csv(outputFile if os.path.exists(outputFile) else dataFile)

    embedderOut = []
    for embedder in wordEmbedders:
        columnPredict = f"{embedder.name}_predict"

        if columnPredict in df:
            # Load previous results: rebuild the score from the stored column.
            data = pd.DataFrame({
                'truth': df['sentiment'],
                'predicted': df[columnPredict],
            })
            result = WESCScore(data)
        else:
            # Predict results with the model trained for this dataset.
            modelFile = f'./models/{dataset}/{embedder.name}.model'
            if not os.path.exists(modelFile):
                # Name the embedder explicitly; interpolating the embedder
                # object itself would print its repr instead of its name.
                raise ValueError(f'Dataset {dataset} has no {embedder.name} trained')

            model = embedder.load(modelFile)
            classifier = WESClassifier(model, positiveWords, negativeWords)
            result = classifier.predict(df)
            # Store the predictions next to the data so later runs take the
            # cached branch above.
            df[columnPredict] = result.data['predicted']
            df[f"{embedder.name}_predict_correct"] = result.correctPredictions

        embedderOut.append((embedder.name, result))

    df.to_csv(outputFile, index=False)
    out.append((dataset, embedderOut))

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
# Print one table per dataset with a row for each evaluated embedder.
print("Baseline dataset evaluation")
for dataset, results in out:
    table = PrettyTable(['Embeddings', 'Accuracy', 'F1'])
    for name, score in results:
        # The 'Accuracy' column is filled from the result's balancedAccuracy.
        row = [name, score.balancedAccuracy, score.f1Score]
        table.add_row(row)

    print(f"\n{dataset}:")
    print(table)

Baseline dataset evaluation

AirlineTweets:
+------------+--------------------+---------------------+
| Embeddings |      Accuracy      |          F1         |
+------------+--------------------+---------------------+
|  Word2Vec  | 0.5384148067187106 | 0.16235893702220605 |
+------------+--------------------+---------------------+

IMDB:
+------------+--------------------+--------------------+
| Embeddings |      Accuracy      |         F1         |
+------------+--------------------+--------------------+
|  Word2Vec  | 0.5616399999999999 | 0.2339577799524675 |
+------------+--------------------+--------------------+

Sentiment140:
+------------+-----------+--------------------+
| Embeddings |  Accuracy |         F1         |
+------------+-----------+--------------------+
|  Word2Vec  | 0.6862125 | 0.6727609289350358 |
+------------+-----------+--------------------+
