In [None]:
# from config import config
import argparse
from dataset import IMDbDataset, get_data_loaders, split_data
from collections import Counter
from tqdm import tqdm
tqdm.pandas()
import numpy as np

# -data_preprocess- #

def process_data(data):
    """Build vocabulary lookup tables from the ``processed`` review column.

    Words are split on whitespace and ranked by descending frequency; words
    with equal counts keep the order in which they were first seen. Ranks
    start at 1, and index 0 is reserved for the empty string.

    Args:
        data: Object where ``data['processed'].values`` is an iterable of
            review strings (e.g. a pandas DataFrame).

    Returns:
        A tuple ``(int2word, word2int)``: a dict mapping index -> word and
        its inverse mapping word -> index.
    """
    counter = Counter()
    # Count review by review so the corpus is never materialized as one
    # giant joined string plus one giant token list.
    for review in data['processed'].values:
        counter.update(review.split())

    # most_common() sorts by count, descending, and is stable for ties.
    vocab = [word for word, _ in counter.most_common()]
    int2word = dict(enumerate(vocab, 1))
    int2word[0] = ''  # Index 0 is reserved for the empty string
    word2int = {word: index for index, word in int2word.items()}

    return int2word, word2int

def pad_features(reviews_enc, pad_token, seq_length=128):
    """Right-pad or truncate encoded reviews to a fixed length.

    Args:
        reviews_enc: Sequence of integer token sequences, one per review.
        pad_token: Value used to fill positions past the end of a review.
        seq_length: Number of columns in the output.

    Returns:
        An integer array of shape ``(len(reviews_enc), seq_length)``. Each
        row holds the first ``seq_length`` tokens of the review, followed by
        ``pad_token`` where the review is shorter.
    """
    n_reviews = len(reviews_enc)
    padded = np.full((n_reviews, seq_length), pad_token, dtype=int)

    for idx, tokens in enumerate(reviews_enc):
        # Truncate long reviews; the target slice is clamped to the row
        # width, so both sides always have matching lengths.
        clipped = np.array(tokens)[:seq_length]
        padded[idx, :len(tokens)] = clipped

    return padded

def process_labels(data):
    """Return the ``label`` attribute of ``data`` converted with ``to_numpy()``."""
    return data.label.to_numpy()

def preprocess_data(data, args):
    """Encode reviews as fixed-length integer sequences and extract labels.

    The vocabulary is built from ``data`` itself, so every word in the
    ``processed`` column has an index.

    Args:
        data: Object with a ``processed`` column of review strings and a
            ``label`` column (e.g. a pandas DataFrame).
        args: Object exposing ``seq_length``, the output sequence length.

    Returns:
        A tuple ``(features, labels)``: the padded/truncated encoded reviews
        with shape ``(number of reviews, args.seq_length)``, and the labels
        as returned by ``process_labels``.
    """
    _, word2int = process_data(data)
    seq_length = args.seq_length

    reviews_enc = [
        [word2int[word] for word in review.split()]
        for review in tqdm(data['processed'])
    ]
    features = pad_features(reviews_enc, pad_token=word2int[''], seq_length=seq_length)

    # Check the whole shape at once: indexing features[0] would raise
    # IndexError on an empty dataset instead of returning empty arrays.
    assert features.shape == (len(reviews_enc), seq_length)

    labels = process_labels(data)

    return features, labels

def data_loaders(features, labels, args):
    """Split the data and wrap each split in a loader.

    Args:
        features: Encoded inputs, passed through to ``split_data``.
        labels: Targets, passed through to ``split_data``.
        args: Object exposing ``train_size``, ``val_size`` and ``batch_size``.

    Returns:
        A tuple ``(train_loader, val_loader, test_loader)`` as produced by
        ``get_data_loaders``.
    """
    x_train, y_train, x_val, y_val, x_test, y_test = split_data(
        features, labels, args.train_size, args.val_size
    )
    loaders = get_data_loaders(
        x_train, y_train, x_val, y_val, x_test, y_test, args.batch_size
    )
    train_loader, val_loader, test_loader = loaders

    return train_loader, val_loader, test_loader
