In [9]:
import torch
import emoji
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as ss
sys.path.insert(0, '../')

from tqdm import tqdm
from config import GPT2EmojiConfig
from model import GPT2LMEmojiModel
from transformers import GPT2Tokenizer
from run_language_modeling import load_and_cache_examples, targets_mask
from sst_binary import sst_binary
from sklearn.linear_model import LogisticRegression

MODEL_CLASSES = {
    "gpt2": (GPT2EmojiConfig, GPT2LMEmojiModel, GPT2Tokenizer),
}

MODEL_PATH = '../checkpoint-180000'

args = torch.load(os.path.join(MODEL_PATH, 'training_args.bin'))

config_class, model_class, tokenizer_class = MODEL_CLASSES['gpt2']

config = config_class.from_pretrained(MODEL_PATH)

tokenizer = tokenizer_class.from_pretrained(MODEL_PATH)

model = model_class.from_pretrained(
            MODEL_PATH,
            config=config,
)

map_target_to_token_id = dict(
        zip(range(0, len(emoji.UNICODE_EMOJI.keys())), tokenizer.encode(list(emoji.UNICODE_EMOJI.keys())))
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2811 > 1024). Running this sequence through the model will result in indexing errors


In [2]:
trX, vaX, teX, trY, vaY, teY = sst_binary()

In [18]:
def train_with_reg_cv(trX, trY, vaX, vaY, teX=None, teY=None, penalty='l1',
        C=2**np.arange(-8, 1).astype(np.float), seed=42, solver='liblinear', max_iter=int(1e6)):
    scores = []
    for i, c in enumerate(C):
        model = LogisticRegression(C=c, penalty=penalty, random_state=seed+i, solver=solver, max_iter=max_iter)
        model.fit(trX, trY)
        score = model.score(vaX, vaY)
        scores.append(score)
    c = C[np.argmax(scores)]
    model = LogisticRegression(C=c, penalty=penalty, random_state=seed+len(C), solver=solver, max_iter=max_iter)
    model.fit(trX, trY)
    
    return model, c, scores

In [5]:
def encode(X, tokenizer, block_size=512):
    return tokenizer.batch_encode_plus(X, add_special_tokens=True, max_length=block_size)["input_ids"]

def transform(X, model, tokenizer, args):
    X_ids = encode(X, tokenizer, args.block_size)
    probas = []
    for x in tqdm(X_ids):
        outputs = model(torch.tensor(x).unsqueeze(0))
        logits = outputs[0].squeeze(0)
        
        probas.append(logits[-1].tolist())
    return probas

In [7]:
trXt = transform(trX, model, tokenizer, args)
vaXt = transform(vaX, model, tokenizer, args)
teXt = transform(teX, model, tokenizer, args)

100%|██████████| 6920/6920 [08:00<00:00, 14.40it/s]
100%|██████████| 872/872 [01:00<00:00, 14.42it/s]
100%|██████████| 1821/1821 [02:09<00:00, 14.09it/s]


In [19]:
train_with_reg_cv(trXt, trY, vaXt, vaY, teXt, teY)

(LogisticRegression(C=0.0625, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l1',
                    random_state=51, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 0.0625,
 [0.7993119266055045,
  0.8073394495412844,
  0.8027522935779816,
  0.8188073394495413,
  0.8279816513761468,
  0.8256880733944955,
  0.8188073394495413,
  0.8084862385321101,
  0.7924311926605505])

In [17]:
from sklearn import preprocessing
trXt_scaled = preprocessing.scale(trXt)
vaXt_scaled = preprocessing.scale(vaXt)
teXt_scaled = preprocessing.scale(teXt)

train_with_reg_cv(trXt_scaled, trY, vaXt_scaled, vaY, teXt_scaled, teY, penalty='l2')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(LogisticRegression(C=0.03125, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.03125,
 [0.8325688073394495,
  0.8325688073394495,
  0.8348623853211009,
  0.838302752293578,
  0.8371559633027523,
  0.8325688073394495,
  0.8302752293577982,
  0.8348623853211009,
  0.8337155963302753])