In [1]:
import torch
import emoji
import os
import sys
import pandas as pd
import numpy as np
import scipy.stats as ss
import pickle
sys.path.insert(0, '../')

from tqdm import tqdm
from config import GPT2EmojiConfig
from model import GPT2LMEmojiModel
from transformers import GPT2Tokenizer
from run_language_modeling import load_and_cache_examples, targets_mask
from sst_binary import sst_binary
from utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Registry mapping a model key to its (config, model, tokenizer) classes.
MODEL_CLASSES = {"gpt2": (GPT2EmojiConfig, GPT2LMEmojiModel, GPT2Tokenizer)}

# Checkpoint directory; everything below is restored from it.
MODEL_PATH = '../checkpoint-180000'

# Arguments object saved next to the checkpoint (passed to transform_* later on).
args = torch.load(os.path.join(MODEL_PATH, 'training_args.bin'))

config_class, model_class, tokenizer_class = MODEL_CLASSES['gpt2']
config = config_class.from_pretrained(MODEL_PATH)
tokenizer = tokenizer_class.from_pretrained(MODEL_PATH)
model = model_class.from_pretrained(MODEL_PATH, config=config)

# Lookup table: position of an emoji in emoji.UNICODE_EMOJI -> token id.
# The whole emoji list goes through tokenizer.encode in a single call, which is
# presumably what triggers the "sequence length is longer than the specified
# maximum" warning printed under this cell; in this cell the ids are only
# stored in the dict, not run through the model.
_emoji_keys = list(emoji.UNICODE_EMOJI.keys())
map_target_to_token_id = dict(zip(range(0, len(_emoji_keys)), tokenizer.encode(_emoji_keys)))

Token indices sequence length is longer than the specified maximum sequence length for this model (2811 > 1024). Running this sequence through the model will result in indexing errors


In [2]:
# Dataset names; each one is used as a sub-directory name under DIR.
DATASETS = [
    'Olympic', 'PsychExp', 'SCv1', 'SCv2-GEN',
    'SE0714', 'SS-Twitter', 'SS-Youtube',
]
# Root directory of the datasets and the pickle file expected in each folder.
DIR = '../data'
FILENAME_RAW = 'raw.pickle'
# Label names (not referenced by the cells visible in this notebook).
CLASSES = ['negative', 'positive']
# Number of samples placed in the training split by train_test_split.
TRAIN_SIZE = 1_000

In [3]:
def load_dataset(dataset, data_dir=None, filename=None):
    """Load the texts and labels of one dataset from its pickle file.

    Args:
        dataset: Name of the dataset sub-directory. It is also printed so the
            cell output records which dataset was loaded.
        data_dir: Directory that contains the dataset folders. Defaults to the
            notebook-level ``DIR``.
        filename: Name of the pickle file inside the dataset folder. Defaults
            to the notebook-level ``FILENAME_RAW``.

    Returns:
        A ``(texts, labels)`` tuple of two lists: every entry of
        ``data[b'texts']`` as ``str``, and the ``b'label'`` value of every
        entry of ``data[b'info']``.
    """
    data_dir = DIR if data_dir is None else data_dir
    filename = FILENAME_RAW if filename is None else filename

    print(dataset)
    dataset_path = f'{data_dir}/{dataset}/{filename}'
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    with open(dataset_path, 'rb') as file:
        # encoding='bytes' keeps 8-bit strings (including the dict keys read
        # below) as bytes objects.
        data = pickle.load(file, fix_imports=True, encoding='bytes')

    # str() on a bytes object returns its repr ("b'...'"), not the text, so
    # bytes entries are decoded explicitly; undecodable bytes are replaced
    # rather than aborting the load.
    texts = [
        x.decode('utf-8', errors='replace') if isinstance(x, bytes) else str(x)
        for x in data[b'texts']
    ]
    # Extract labels
    labels = [x[b'label'] for x in data[b'info']]
    return texts, labels

def decode(y_in):
    """Return a list with, for each element of ``y_in``, the ``np.argmax`` of that element.

    Args:
        y_in: Iterable of array-likes (one per sample).

    Returns:
        A list of the same length as ``y_in`` holding the argmax indices.
    """
    return [np.argmax(row) for row in y_in]

In [6]:
# DATASETS[6] is 'SS-Youtube'; load_dataset prints the name as confirmation.
# X receives the texts and y the labels returned by load_dataset.
X, y = load_dataset(DATASETS[6])

SS-Youtube


In [7]:
# tokenize is not defined in this notebook (presumably it comes from the
# `from utils import *` star import); it receives all texts in X.
# NOTE(review): the structure of Xt (ids vs. token strings) is not visible here -- confirm in utils.
Xt = tokenize(X, tokenizer)

In [8]:
# TRAIN_SIZE samples go to the training split, the remainder to the test
# split; random_state=42 fixes the shuffle so the partition is reproducible.
trXt, teXt, trY, teY = train_test_split(Xt, y, train_size=TRAIN_SIZE, random_state=42)

In [9]:
# Emoji predictions for the training split only (trXt).
# NOTE(review): predict_mean is not defined in this notebook (presumably utils);
# how "mean" aggregates the model output should be confirmed there.
mean_preds = predict_mean(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 1000/1000 [02:03<00:00,  8.08it/s]


In [10]:
# Emoji predictions for the training split only (trXt).
# NOTE(review): predict_last is not defined in this notebook (presumably utils);
# "last" presumably refers to the final position of each text -- confirm.
last_preds = predict_last(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 1000/1000 [01:55<00:00,  8.69it/s]


In [11]:
# Emoji predictions for the training split only (trXt).
# NOTE(review): predict_max is not defined in this notebook (presumably utils);
# how "max" aggregates the model output should be confirmed there.
max_preds = predict_max(trXt, model, tokenizer, map_target_to_token_id)

100%|██████████| 1000/1000 [01:55<00:00,  8.69it/s]


In [12]:
# Contingency table: one row per emoji in mean_preds, one column per label in trY.
mean_cm = pd.crosstab(index=pd.Series(mean_preds), columns=pd.Series(trY))
mean_cm

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
✨,1,2
❤,0,8
🎧,1,0
🎶,1,1
👀,2,11
👇,0,10
👉,0,1
👋,0,1
👍,0,41
👏,0,30


In [13]:
# Contingency table: one row per emoji in last_preds, one column per label in trY.
last_cm = pd.crosstab(index=pd.Series(last_preds), columns=pd.Series(trY))
last_cm

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
✨,1,6
❤,0,13
🌞,0,1
🌟,1,1
🍀,0,2
🎧,1,0
🎶,2,4
👀,4,20
👇,0,4
👋,1,0


In [14]:
# Contingency table: one row per emoji in max_preds, one column per label in trY.
max_cm = pd.crosstab(index=pd.Series(max_preds), columns=pd.Series(trY))
max_cm

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
✅,0,1
✨,1,1
❤,0,3
🇺,1,0
🌊,1,0
🌍,1,0
🌞,0,1
🌟,0,2
🌱,0,1
🌸,0,1


In [15]:
# Association between predicted emoji and label, computed from the raw counts of mean_cm.
# NOTE(review): cramers_corrected_stat is presumably a bias-corrected Cramér's V from utils -- confirm.
cramers_corrected_stat(mean_cm.to_numpy())

0.41877703830387697

In [16]:
# Association between predicted emoji and label, computed from the raw counts of last_cm.
# NOTE(review): cramers_corrected_stat is presumably a bias-corrected Cramér's V from utils -- confirm.
cramers_corrected_stat(last_cm.to_numpy())

0.46682674232550947

In [17]:
# Association between predicted emoji and label, computed from the raw counts of max_cm.
# NOTE(review): cramers_corrected_stat is presumably a bias-corrected Cramér's V from utils -- confirm.
cramers_corrected_stat(max_cm.to_numpy())

0.39367314158267297

In [18]:
# NOTE(review): theil_u is presumably Theil's U (uncertainty coefficient) from utils;
# which argument is the conditioning variable should be confirmed there.
theil_u(trY, mean_preds)

0.21227387850455948

In [19]:
# NOTE(review): theil_u is presumably Theil's U (uncertainty coefficient) from utils;
# which argument is the conditioning variable should be confirmed there.
theil_u(trY, last_preds)

0.24422281945316343

In [20]:
# NOTE(review): theil_u is presumably Theil's U (uncertainty coefficient) from utils;
# which argument is the conditioning variable should be confirmed there.
theil_u(trY, max_preds)

0.20098192920466604

## Logistic regression (Logreg) on the transformed features

In [21]:
# Runs on all of X (raw texts, 2142 items per the progress bar), i.e. before any train/test split.
# NOTE(review): transform_last is not defined in this notebook (presumably utils);
# "last" presumably selects the final-position representation -- confirm.
Xtrans_last = transform_last(X, model, tokenizer, args)

100%|██████████| 2142/2142 [04:14<00:00,  8.42it/s]


In [22]:
# Runs on all of X (raw texts, 2142 items per the progress bar), i.e. before any train/test split.
# NOTE(review): transform_mean is not defined in this notebook (presumably utils);
# the exact pooling it applies should be confirmed there.
Xtrans_mean = transform_mean(X, model, tokenizer, args)

100%|██████████| 2142/2142 [08:44<00:00,  4.08it/s]


In [23]:
# Runs on all of X (raw texts, 2142 items per the progress bar), i.e. before any train/test split.
# NOTE(review): transform_max is not defined in this notebook (presumably utils);
# the exact pooling it applies should be confirmed there.
Xtrans_max = transform_max(X, model, tokenizer, args)

100%|██████████| 2142/2142 [04:35<00:00,  7.77it/s]


In [30]:
# Split the Xtrans_last features with the same train_size and random_state as
# the split of Xt; trY and teY are rebound by this call.
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_last, y, train_size=TRAIN_SIZE, random_state=42)

In [31]:
# The fitted classifier gets its own name: binding it to `model` would
# overwrite the GPT-2 model loaded in the first cell, so re-running any
# predict_*/transform_* cell afterwards would pass a LogisticRegression
# where the language model is expected.
# NOTE(review): the held-out split fills the 3rd/4th arguments and the last
# two are None; if train_with_reg_cv selects C on those arguments, the scores
# below are chosen on the test split -- confirm against utils.
clf, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs')
clf, C, scores

(LogisticRegression(C=0.03125, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=51, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 0.03125,
 [0.8957968476357268,
  0.8957968476357268,
  0.8966725043782837,
  0.8975481611208407,
  0.8975481611208407,
  0.8861646234676007,
  0.8835376532399299,
  0.8800350262697023,
  0.8774080560420315])

In [32]:
# Rebinds trXtrans/teXtrans (and trY/teY) to a split of the Xtrans_mean
# features, using the same train_size and random_state as the other splits.
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_mean, y, train_size=TRAIN_SIZE, random_state=42)

In [33]:
# The fitted classifier gets its own name: binding it to `model` would
# overwrite the GPT-2 model loaded in the first cell, so re-running any
# predict_*/transform_* cell afterwards would pass a LogisticRegression
# where the language model is expected.
# NOTE(review): the held-out split fills the 3rd/4th arguments and the last
# two are None; if train_with_reg_cv selects C on those arguments, the scores
# below are chosen on the test split -- confirm against utils.
clf, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs')
clf, C, scores

(LogisticRegression(C=0.00390625, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=1000000, multi_class='auto', n_jobs=None,
                    penalty='l2', random_state=51, solver='lbfgs', tol=0.0001,
                    verbose=0, warm_start=False),
 0.00390625,
 [0.8870402802101576,
  0.8852889667250438,
  0.8835376532399299,
  0.8800350262697023,
  0.8835376532399299,
  0.882661996497373,
  0.8774080560420315,
  0.873029772329247,
  0.87215411558669])

In [34]:
# Rebinds trXtrans/teXtrans (and trY/teY) to a split of the Xtrans_max
# features, using the same train_size and random_state as the other splits.
trXtrans, teXtrans, trY, teY = train_test_split(Xtrans_max, y, train_size=TRAIN_SIZE, random_state=42)

In [35]:
# The fitted classifier gets its own name: binding it to `model` would
# overwrite the GPT-2 model loaded in the first cell, so re-running any
# predict_*/transform_* cell afterwards would pass a LogisticRegression
# where the language model is expected.
# NOTE(review): the held-out split fills the 3rd/4th arguments and the last
# two are None; if train_with_reg_cv selects C on those arguments, the scores
# below are chosen on the test split -- confirm against utils.
clf, C, scores = train_with_reg_cv(trXtrans, trY, teXtrans, teY, None, None, penalty='l2', solver='lbfgs')
clf, C, scores

(LogisticRegression(C=0.0078125, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=1000000, multi_class='auto', n_jobs=None,
                    penalty='l2', random_state=51, solver='lbfgs', tol=0.0001,
                    verbose=0, warm_start=False),
 0.0078125,
 [0.862521891418564,
  0.8669001751313485,
  0.8669001751313485,
  0.8633975481611208,
  0.8642732049036778,
  0.8590192644483362,
  0.8555166374781086,
  0.8528896672504378,
  0.8485113835376532])