In [1]:
import os
import prettytable
import numpy as np
import pandas as pd
from tqdm.notebook  import tqdm
from functions      import readSet, dirs
from gensim.models  import TfidfModel
from gensim.corpora import Dictionary
from prettytable    import PrettyTable
from wordEmbedders  import Word2Vec, WESCScore, AverageClassifier
#from gensim.models import Word2Vec
tqdm.pandas()

In [6]:
# --- Experiment configuration ------------------------------------------------
# Datasets to evaluate: whatever `dirs` reports for ./data
# (presumably the names of its sub-directories -- confirm in functions.py).
# To debug against a single dataset, assign e.g. ['AirlineTweets'] instead.
datasets = dirs('./data')

# When True, an existing prediction file is ignored and predictions are
# recomputed (and saved again) for every dataset.
ignoreCache = True

# Embedding wrapper and classifier used by the prediction loop. The embedder
# must expose `.name` and `.load(path)`; the classifier is constructed as
# classifier(model, positiveWords, negativeWords, tfidf, dct).
embedder, classifier = Word2Vec, AverageClassifier

# Seed word lists, read in the same order as before (positive, then negative).
positiveWords, negativeWords = map(
    readSet,
    ('./wordlists/positiveWords.txt', './wordlists/negativeWords.txt'),
)

In [7]:
# Predict (or load cached predictions) for every dataset and collect the
# results as (dataset name, result) pairs for the evaluation cell.
out = []
for dataset in tqdm(datasets, desc="Datasets"):
    dataFile   = f'./data/{dataset}/Data-Cleaned.csv'
    outputFile = f'./data/{dataset}/{embedder.name}-Prediction.csv'
    modelFile  = f'./models/{dataset}/{embedder.name}.model'
    tfidfFile  = f'./models/{dataset}/TF-IDF.model'
    dictFile   = f'./models/{dataset}/Dictionary.model'

    # Fail early, with a readable message, when a prerequisite step was skipped.
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')
    if not os.path.exists(modelFile):
        raise ValueError(f'Dataset {dataset} has no {embedder.name} trained')

    if os.path.exists(outputFile) and not ignoreCache:
        print(f'{dataset}: using cached data')
        result = WESCScore.load(outputFile)
    else:
        # The TF-IDF model and dictionary are only needed when predicting, so
        # they are validated and loaded here rather than at the top of the
        # loop. Previously they were loaded before any existence check (a
        # missing file surfaced as a raw loader error instead of the
        # ValueError used above) and were loaded even for cached results.
        if not os.path.exists(tfidfFile):
            raise ValueError(f'Dataset {dataset} has no TF-IDF model trained')
        if not os.path.exists(dictFile):
            raise ValueError(f'Dataset {dataset} has no dictionary built')

        print(f'{dataset}: predicting')
        df     = pd.read_csv(dataFile)
        model  = embedder.load(modelFile)
        tfidf  = TfidfModel.load(tfidfFile)
        dct    = Dictionary.load(dictFile)
        clas   = classifier(model, positiveWords, negativeWords, tfidf, dct)
        result = clas.predict(df)
        # Persist the predictions so a later run with ignoreCache = False can reuse them.
        result.save(outputFile)

    out.append((dataset, result))

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

AirlineTweets: predicting


  0%|          | 0/11541 [00:00<?, ?it/s]

IMDB: predicting


  0%|          | 0/50000 [00:00<?, ?it/s]

Sentiment140: predicting


  0%|          | 0/1600000 [00:00<?, ?it/s]

In [8]:
# Summarise the predictions of every dataset in a single bordered table.
print("Baseline dataset evaluation")
table = PrettyTable(
    ['Dataset', 'Accuracy', 'Balanced Accuracy', 'F1', 'Confusion Matrix']
)
# Rule lines between all rows keep the multi-line confusion matrices apart;
# the setting is applied when the table is rendered, so it can be set up front.
table.hrules = prettytable.ALL
for dataset, result in out:
    # Plain accuracy: correctly classified rows over all rows in the result.
    acc = (result.truePos + result.trueNeg) / len(result.data)
    table.add_row([
        dataset,
        acc,
        result.balancedAccuracy,
        result.f1Score,
        result.confusionMatrix,
    ])
print(table)

Baseline dataset evaluation
+---------------+--------------------+--------------------+---------------------+------------------+
|    Dataset    |      Accuracy      | Balanced Accuracy  |          F1         | Confusion Matrix |
+---------------+--------------------+--------------------+---------------------+------------------+
| AirlineTweets | 0.8084221471276319 | 0.5453603609876125 | 0.17592247484159523 |    236 | 84      |
|               |                    |                    |                     |   -----+-----    |
|               |                    |                    |                     |   2127 | 9094    |
+---------------+--------------------+--------------------+---------------------+------------------+
|      IMDB     |       0.5755       |       0.5755       |  0.279409268375488  |   4115 | 340     |
|               |                    |                    |                     |  ------+------   |
|               |                    |                    |    

**Reference results: "Single word for good and bad"**

The table below is plain text (apparently pasted from an earlier run), not Python. Keep it in a Markdown cell: executed as a code cell it raises `SyntaxError: invalid syntax`.

```text
Single word for good and bad
+---------------+--------------------+--------------------+--------------------+------------------+
|    Dataset    |      Accuracy      | Balanced Accuracy  |         F1         | Confusion Matrix |
+---------------+--------------------+--------------------+--------------------+------------------+
| AirlineTweets | 0.5347023654795945 | 0.4920447449867007 | 0.2697851509382649 |    992 | 3999    |
|               |                    |                    |                    |   -----+-----    |
|               |                    |                    |                    |   1371 | 5179    |
+---------------+--------------------+--------------------+--------------------+------------------+
|      IMDB     |      0.57672       |      0.57672       | 0.7009720809313892 |  24806 | 20970   |
|               |                    |                    |                    |  ------+------   |
|               |                    |                    |                    |    194 | 4030    |
+---------------+--------------------+--------------------+--------------------+------------------+
|  Sentiment140 |    0.669751875     |    0.669751875     | 0.6459582799263771 | 482037 | 210434  |
|               |                    |                    |                    | -------+-------  |
|               |                    |                    |                    | 317963 | 589566  |
+---------------+--------------------+--------------------+--------------------+------------------+
```