In [9]:
import os
import sys
import inspect
# Put the parent of the working directory at the front of sys.path so the
# project-local imports further down (`wordEmbedders`, `functions`) can be
# resolved from there (presumably that is where they live — confirm).
#
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook that call returns
# the kernel's synthetic filename for the cell, not the notebook's location,
# and on kernels that compile cells to temporary files it points into the
# temp directory. Every relative path in this notebook ('../data',
# '../wordlists', './models') is already resolved against the working
# directory, so this keeps the import path consistent with them.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from wordEmbedders import Word2Vec, AverageClassifier
from functions import dirs, readSet, saveSet
tqdm.pandas()

In [10]:
# Names of the datasets to evaluate: whatever dirs('../data') returns is
# iterated by the evaluation loop and substituted into the per-dataset paths
# (presumably one sub-directory name per dataset — confirm in functions.dirs).
datasets = dirs('../data')
# Uncomment to restrict the run to a single dataset while debugging:
#datasets = ['AirlineTweets2']

In [11]:
# Embedder under evaluation: the evaluation loop uses its `name` attribute to
# build the model file names and its `load` method to read a trained model.
embedder = Word2Vec
# Classifier constructed once per loaded model in the evaluation loop.
classifier = AverageClassifier

In [12]:
# Sample sizes in percent of each dataset; the evaluation loop samples
# frac=size/100 of the rows and looks up the model file with the same size.
sizes = [75, 50, 25, 10]
# Seeds: each one selects a saved model file and is also used as the
# random_state of the row sample that model is evaluated on.
seeds = [1, 2, 3, 4, 5]

In [13]:
# Positive and negative word lists, read once and passed to every classifier
# instance built in the evaluation loop.
positiveWords = readSet('../wordlists/positiveWords.txt')
negativeWords = readSet('../wordlists/negativeWords.txt')

In [14]:
# Evaluate every (dataset, size, seed) combination.
#
# For each dataset the cleaned CSV is read once. For each sample size, the
# model saved for every seed is loaded, a classifier is built around it, and
# it predicts on a `size` percent sample of the data drawn with that seed.
# The balanced accuracy of each seed is collected as a string and the list
# for one size is written to ./data/<dataset>/Prediction-<size>.txt.
#
# Raises ValueError when the cleaned data file or a required model file is
# missing.
for dataset in tqdm(datasets, desc="Datasets"):
    # Input comes from the shared ../data tree; models and predictions use
    # ./models and ./data relative to the working directory.
    dataFile = f'../data/{dataset}/Data-Cleaned.csv'

    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')
    df = pd.read_csv(dataFile)

    for size in sizes:
        outputFile = f'./data/{dataset}/Prediction-{size}.txt'
        out = []
        for seed in seeds:
            modelFile = f'./models/{dataset}/{embedder.name}-{size}-{seed}.model'

            if not os.path.exists(modelFile):
                # Report embedder.name (the value used in the file name above)
                # rather than the repr of the embedder object itself.
                raise ValueError(
                    f'Dataset {dataset} has no {embedder.name} of size {size} with seed {seed} trained'
                )

            model = embedder.load(modelFile)
            # The two trailing None arguments are positional parameters of the
            # classifier whose meaning is not visible here — TODO confirm.
            clas = classifier(model, positiveWords, negativeWords, None, None)
            # Same seed for the model file and the row sample, so the run is
            # reproducible per (size, seed).
            data = df.sample(frac=size / 100, random_state=seed)
            result = clas.predict(data)
            out.append(str(result.balancedAccuracy))

        # Make sure the target directory exists before writing; done only once
        # all seeds succeeded so a failed run leaves nothing behind.
        os.makedirs(os.path.dirname(outputFile), exist_ok=True)
        # NOTE(review): `out` is an ordered list with one score per seed;
        # confirm that saveSet keeps order and duplicate values.
        saveSet(outputFile, out)

Datasets:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6673 [00:00<?, ?it/s]

  0%|          | 0/6673 [00:00<?, ?it/s]

  0%|          | 0/6673 [00:00<?, ?it/s]

  0%|          | 0/6673 [00:00<?, ?it/s]

  0%|          | 0/6673 [00:00<?, ?it/s]

  0%|          | 0/4448 [00:00<?, ?it/s]

  0%|          | 0/4448 [00:00<?, ?it/s]

  0%|          | 0/4448 [00:00<?, ?it/s]

  0%|          | 0/4448 [00:00<?, ?it/s]

  0%|          | 0/4448 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/890 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/37500 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]