In [1]:
import torch
import emoji
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as ss
import pickle
sys.path.insert(0, '../')

from tqdm import tqdm
from config import GPT2EmojiConfig
from model import GPT2LMEmojiModel
from transformers import GPT2Tokenizer
from run_language_modeling import load_and_cache_examples, targets_mask
from sst_binary import sst_binary
from utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


MODEL_CLASSES = {
    "gpt2": (GPT2EmojiConfig, GPT2LMEmojiModel, GPT2Tokenizer),
}

MODEL_PATH = '../checkpoint-180000'

args = torch.load(os.path.join(MODEL_PATH, 'training_args.bin'))

config_class, model_class, tokenizer_class = MODEL_CLASSES['gpt2']

config = config_class.from_pretrained(MODEL_PATH)

tokenizer = tokenizer_class.from_pretrained(MODEL_PATH)

model = model_class.from_pretrained(
            MODEL_PATH,
            config=config,
)

map_target_to_token_id = dict(
        zip(range(0, len(emoji.UNICODE_EMOJI.keys())), tokenizer.encode(list(emoji.UNICODE_EMOJI.keys())))
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2811 > 1024). Running this sequence through the model will result in indexing errors


In [2]:
DATASETS = [
    'Olympic',
    'PsychExp',
    'SCv1',
    'SCv2-GEN',
    'SE0714',
    'SS-Twitter',
    'SS-Youtube',
]
DIR = '../data'
FILENAME_RAW = 'raw.pickle'
CLASSES = 'negative & high control, positive & high control, negative & low control, positive & high control'.split(', ')
TRAIN_SIZE = 250

In [3]:
def load_dataset(dataset):
    print(dataset)
    DATASET_PATH = f'{DIR}/{dataset}/{FILENAME_RAW}'
    with open(DATASET_PATH, 'rb') as file:
        data = pickle.load(file, fix_imports=True, encoding='bytes')
    
    # Decode data
    texts = [str(x) for x in data[b'texts']]
    # Extract labels
    labels = [x[b'label'] for x in data[b'info']]
    return texts, labels

def decode(y_in):
    y_out = []
    for y in y_in:
        y_out.append(np.argmax(y))
    return y_out

In [4]:
X, y = load_dataset(DATASETS[0])
y = decode(y)

Olympic


In [5]:
Xt = tokenize(X, tokenizer)

In [6]:
trXt, teXt, trY, teY = train_test_split(Xt, y, train_size=TRAIN_SIZE, random_state=42)

In [7]:
mean_preds = predict_mean(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:17<00:00, 14.38it/s]


In [8]:
last_preds = predict_last(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:22<00:00, 10.97it/s]


In [9]:
max_preds = predict_max(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:22<00:00, 11.19it/s]


In [10]:
mean_cm = pd.crosstab(pd.Series(mean_preds), pd.Series(trY))
mean_cm

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
✨,0,3,0,0
🏆,0,1,0,0
👀,1,0,0,0
👇,0,2,0,0
👉,0,2,0,0
👍,0,1,0,0
👏,1,19,0,0
💕,0,1,0,0
💖,0,1,0,0
🔥,0,1,0,0


In [11]:
last_cm = pd.crosstab(pd.Series(last_preds), pd.Series(trY))
last_cm

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
✨,1,9,0,0
❤,0,1,0,0
🌊,0,1,0,0
🎉,0,1,0,0
🎶,0,0,0,1
🏀,1,0,0,0
🏆,1,4,0,0
👀,2,4,0,0
👇,1,4,0,1
👉,0,2,0,0


In [12]:
max_cm = pd.crosstab(pd.Series(max_preds), pd.Series(trY))
max_cm

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
✅,0,1,0,0
✨,1,6,0,0
🍆,0,1,0,0
🏀,1,0,0,0
🏆,0,6,0,0
🐐,14,21,2,2
👀,2,2,0,0
👇,4,9,0,0
👉,1,0,0,0
👌,0,2,0,0


In [13]:
cramers_corrected_stat(mean_cm.to_numpy())

0.11667511224672211

In [14]:
cramers_corrected_stat(last_cm.to_numpy())

0.20167103405462053

In [15]:
cramers_corrected_stat(max_cm.to_numpy())

0.0

In [16]:
theil_u(trY, mean_preds)

0.11969018748688671

In [17]:
theil_u(trY, last_preds)

0.2179991267066016

In [18]:
theil_u(trY, max_preds)

0.17182423129507915

## Logreg

In [19]:
Xtrans_last = transform_last(X, model, tokenizer, args)

100%|██████████| 1012/1012 [01:28<00:00, 11.44it/s]


In [20]:
Xtrans_mean = transform_mean(X, model, tokenizer, args)

100%|██████████| 1012/1012 [01:26<00:00, 11.69it/s]


In [21]:
Xtrans_max = transform_max(X, model, tokenizer, args)

100%|██████████| 1012/1012 [01:26<00:00, 11.64it/s]


In [22]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_last, y, train_size=TRAIN_SIZE, random_state=42)

In [23]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.25,
 [0.4207020087025065,
  0.426018471659919,
  0.4207236432868747,
  0.4142515008461331,
  0.4250781938271483,
  0.42090328649687514,
  0.4301553470864616,
  0.42614023312430543,
  0.42389038190736467])

In [26]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_mean, y, train_size=TRAIN_SIZE, random_state=42)

In [27]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 1.0,
 [0.40789280712329834,
  0.4074218321764669,
  0.4176381528221045,
  0.41807774270089304,
  0.4189967304046919,
  0.4161164392301299,
  0.41272822729714653,
  0.42344224403047936,
  0.42566793157664873])

In [24]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_max, y, train_size=TRAIN_SIZE, random_state=42)

In [25]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=0.0625, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.0625,
 [0.36235632183908045,
  0.37275128718871153,
  0.39830666142303967,
  0.39871422576373233,
  0.40240618841627046,
  0.4019058340245685,
  0.39820402021787893,
  0.3926931567292102,
  0.3901222688779777])

In [32]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_mean, y, train_size=TRAIN_SIZE, random_state=42)
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1', C=2**np.arange(-8, 6).astype(np.float))
model, C, scores

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=56, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 1.0,
 [0.40789280712329834,
  0.4074218321764669,
  0.4176381528221045,
  0.41807774270089304,
  0.4189967304046919,
  0.4161164392301299,
  0.41272822729714653,
  0.42344224403047936,
  0.42566793157664873,
  0.4220241624085244,
  0.4221658508580735,
  0.4198090995691614,
  0.41972112354211266,
  0.4202206838459525])