In [7]:
import os
import sys
import inspect
# Make the repository's parent directory importable so the project-local
# modules (`wordEmbedders`, `functions`) can be imported by the lines below.
#
# The notebook's location is taken from the kernel's working directory.
# `inspect.getfile(inspect.currentframe())` is not reliable inside a notebook:
# recent ipykernel versions compile each cell under a temporary file name, so
# the derived directory would point at the temp folder instead of the notebook.
# NOTE(review): assumes the kernel was started in the notebook's own directory
# (the Jupyter default) — the relative '../data' and './models' paths used
# further down rely on the same assumption.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insertion so re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from wordEmbedders import Word2Vec
from functions import dirs
# Register tqdm's progress-bar variants of the pandas apply-style methods
# (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no `progress_*` call is visible in this part of the notebook —
# confirm this registration is still needed.
tqdm.pandas()

In [8]:
# --- Experiment grid -------------------------------------------------------
# Embedder class to train; it is instantiated once per (dataset, size, seed).
embedder = Word2Vec
# Share of each dataset to sample, in percent (divided by 100 when sampling).
sizes = [75, 50, 25, 10]
# random_state values used when drawing each sample.
seeds = list(range(1, 6))
# Dataset identifiers; each one is used as a sub-directory name under ../data.
# NOTE(review): presumably `dirs` lists the sub-directories of the given
# path — confirm against functions.dirs.
datasets = dirs('../data')

In [9]:
# Train one embedding model per (dataset, sample size, seed) combination.
# A combination whose model file already exists is skipped, so the cell can be
# re-run after an interruption without retraining finished models.
for dataset in tqdm(datasets, desc="Datasets"):
    dataFile = f'../data/{dataset}/Data-Cleaned.csv'
    modelDir = f'./models/{dataset}'

    # Fail fast when the cleaned CSV for this dataset is missing.
    if not os.path.exists(dataFile):
        raise ValueError(f'Dataset {dataset} has not been cleaned')
    # makedirs(exist_ok=True) also creates a missing './models' parent, which
    # a plain os.mkdir would reject with FileNotFoundError on a fresh checkout.
    os.makedirs(modelDir, exist_ok=True)

    df = pd.read_csv(dataFile)
    for size in sizes:
        for seed in seeds:
            modelFile = f'{modelDir}/{embedder.name}-{size}-{seed}.model'
            if os.path.exists(modelFile):
                # Already trained on a previous run.
                continue

            print(f'Training {embedder.name} size {size} with seed {seed} for {dataset}')
            # `size` is a percentage; the seed makes the row sample repeatable.
            data = df.sample(frac=size / 100, random_state=seed)
            # NOTE(review): the meaning of the second positional argument (1)
            # is defined by the embedder class, which is not shown here.
            # NOTE(review): `seed` is only used for the row sample above; it is
            # not passed to the embedder — confirm whether training itself
            # should be seeded as well.
            model = embedder(data['text'], 1)
            print(f'Vocab time: {round(model.vocabTime / 60, 2)} mins')
            # The tqdm class itself is handed over; presumably the embedder
            # uses it to draw its per-epoch progress bar.
            model.train(tqdm)
            model.save(modelFile)

Datasets:   0%|          | 0/2 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 3 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 4 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 5 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 3 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 4 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 5 for AirlineTweets2
Vocab time: 0.01 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 3 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 4 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 5 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 3 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 4 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 5 for AirlineTweets2
Vocab time: 0.0 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 3 for IMDB
Vocab time: 0.46 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 4 for IMDB
Vocab time: 0.46 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 75 with seed 5 for IMDB
Vocab time: 0.44 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 3 for IMDB
Vocab time: 0.31 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 4 for IMDB
Vocab time: 0.31 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 50 with seed 5 for IMDB
Vocab time: 0.32 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 3 for IMDB
Vocab time: 0.17 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 4 for IMDB
Vocab time: 0.18 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 25 with seed 5 for IMDB
Vocab time: 0.17 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 3 for IMDB
Vocab time: 0.08 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 4 for IMDB
Vocab time: 0.09 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Training Word2Vec size 10 with seed 5 for IMDB
Vocab time: 0.09 mins


Word2Vec epochs:   0%|          | 0/30 [00:00<?, ?it/s]