In [1]:
import torch
import emoji
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as ss
import pickle
sys.path.insert(0, '../')

from tqdm import tqdm
from config import GPT2EmojiConfig
from model import GPT2LMEmojiModel
from transformers import GPT2Tokenizer
from run_language_modeling import load_and_cache_examples, targets_mask
from sst_binary import sst_binary
from utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


MODEL_CLASSES = {
    "gpt2": (GPT2EmojiConfig, GPT2LMEmojiModel, GPT2Tokenizer),
}

MODEL_PATH = '../checkpoint-180000'

args = torch.load(os.path.join(MODEL_PATH, 'training_args.bin'))

config_class, model_class, tokenizer_class = MODEL_CLASSES['gpt2']

config = config_class.from_pretrained(MODEL_PATH)

tokenizer = tokenizer_class.from_pretrained(MODEL_PATH)

model = model_class.from_pretrained(
            MODEL_PATH,
            config=config,
)

map_target_to_token_id = dict(
        zip(range(0, len(emoji.UNICODE_EMOJI.keys())), tokenizer.encode(list(emoji.UNICODE_EMOJI.keys())))
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2811 > 1024). Running this sequence through the model will result in indexing errors


In [2]:
DATASETS = [
    'Olympic',
    'PsychExp',
    'SCv1',
    'SCv2-GEN',
    'SE0714',
    'SS-Twitter',
    'SS-Youtube',
]
DIR = '../data'
FILENAME_RAW = 'raw.pickle'
CLASSES = 'fear, joy, sadness'.split(', ')
TRAIN_SIZE = 250

In [3]:
def load_dataset(dataset):
    print(dataset)
    DATASET_PATH = f'{DIR}/{dataset}/{FILENAME_RAW}'
    with open(DATASET_PATH, 'rb') as file:
        data = pickle.load(file, fix_imports=True, encoding='bytes')
    
    # Decode data
    texts = [str(x) for x in data[b'texts']]
    # Extract labels
    labels = [x[b'label'] for x in data[b'info']]
    return texts, labels

def decode(y_in):
    y_out = []
    for y in y_in:
        y_out.append(np.argmax(y))
    return y_out

In [4]:
X, y = load_dataset(DATASETS[4])
y = decode(y)

SE0714


In [5]:
Xt = tokenize(X, tokenizer)

In [6]:
trXt, teXt, trY, teY = train_test_split(Xt, y, train_size=TRAIN_SIZE, random_state=42)

In [7]:
mean_preds = predict_mean(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:16<00:00, 14.72it/s]


In [8]:
last_preds = predict_last(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:17<00:00, 13.98it/s]


In [9]:
max_preds = predict_max(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 250/250 [00:20<00:00, 12.33it/s]


In [10]:
mean_cm = pd.crosstab(pd.Series(mean_preds), pd.Series(trY))
mean_cm

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
🌊,1,0,0
🎂,0,1,0
👀,1,0,0
👇,2,1,0
👍,0,1,0
👑,0,1,0
🔥,4,3,0
😂,146,30,16
😍,2,0,0
😭,28,5,8


In [11]:
last_cm = pd.crosstab(pd.Series(last_preds), pd.Series(trY))
last_cm

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
✅,2,0,0
➡,1,0,0
🌱,1,0,0
🎂,0,2,0
🎧,1,0,0
🏆,1,0,0
👀,6,2,0
👇,18,6,1
👑,1,1,0
💰,1,1,0


In [12]:
max_cm = pd.crosstab(pd.Series(max_preds), pd.Series(trY))
max_cm

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
✅,2,1,0
✨,1,0,0
❌,1,0,0
🇳 🇬,2,0,0
🌊,1,1,0
🌍,2,0,0
🌱,1,0,0
🎂,0,1,0
🎉,0,1,0
🎧,1,0,0


In [13]:
cramers_corrected_stat(mean_cm.to_numpy())

0.129196105011018

In [14]:
cramers_corrected_stat(last_cm.to_numpy())

0.0854588539045625

In [15]:
cramers_corrected_stat(max_cm.to_numpy())

0.0

In [16]:
theil_u(trY, mean_preds)

0.05952456520407593

In [17]:
theil_u(trY, last_preds)

0.09891635436735878

In [18]:
theil_u(trY, max_preds)

0.14241994842086

## Logreg

In [19]:
Xtrans_last = transform_last(X, model, tokenizer, args)

100%|██████████| 1250/1250 [01:45<00:00, 11.89it/s]


In [20]:
Xtrans_mean = transform_mean(X, model, tokenizer, args)

100%|██████████| 1250/1250 [01:38<00:00, 12.72it/s]


In [21]:
Xtrans_max = transform_max(X, model, tokenizer, args)

100%|██████████| 1250/1250 [01:26<00:00, 14.38it/s]


In [22]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_last, y, train_size=TRAIN_SIZE, random_state=42)

In [23]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=0.125, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.125,
 [0.384587781780695,
  0.39830244024215194,
  0.45681514536908585,
  0.47539682539682543,
  0.4768107247990488,
  0.49088925208778716,
  0.4751599147121535,
  0.47807450374498384,
  0.4739265895404901])

In [24]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_mean, y, train_size=TRAIN_SIZE, random_state=42)

In [25]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=0.125, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.125,
 [0.3311064815898504,
  0.36304772219513315,
  0.3967358092239319,
  0.4107119922287467,
  0.43949941247687946,
  0.45652396617424484,
  0.4557292250758515,
  0.45092100915168926,
  0.4459154300321524])

In [26]:
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_max, y, train_size=TRAIN_SIZE, random_state=42)

In [27]:
model, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs', metrics='f1')
model, C, scores

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 1.0,
 [0.3259073449763298,
  0.3622496210391668,
  0.42044292084806756,
  0.4318571873879824,
  0.4406101283601238,
  0.4470250485340023,
  0.45210740301885327,
  0.4476758819469817,
  0.4529682346814789])