# intermediate_IMDB
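This notebook evaluates our intermediate models: it scores the saved `intermediate_SARC` checkpoints on the sampled SARC test set, then loads a saved `intermediate_IMDB` checkpoint and runs it on the iSarcasmEval (SemEval-2022, Task A, English) test set.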

## Imports & Settings

First, change the working directory to the parent directory so that we can import our custom modules (`params`, `utils`, `trainer`).

In [1]:
import os

# Step up one level so the project's own modules can be imported below.
# NOTE: this is a relative move -- re-running the cell without restarting the
# kernel climbs one more directory each time.
os.chdir(os.pardir)

In [2]:
import params
from utils import *
from trainer import *

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm import trange

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Silence transformers' model-loading warnings.
# The transformers logging module is imported under an alias so it does not
# share the name `logging` with the standard-library module imported below;
# previously the same name was bound twice and only the statement order kept
# the two apart.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

# Configure standard-library logging: message-only format at INFO level.
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

# `torch` is used directly below, so import it explicitly instead of relying
# on it leaking in through the `from utils import *` / `from trainer import *`
# star imports.
import torch

# Single seed value shared by every RNG seeded in this notebook.
SEED = 1

# set general seeds (project helper brought in by the star imports above)
set_seeds(SEED)

# set dataloader generator seed
g = torch.Generator()
g.manual_seed(SEED)

<torch._C.Generator at 0x29420e4f0>

In [3]:
# Ensure we're on an ARM environment if necessary.
# `platform_check` is a project helper that arrives via the star imports above.
# NOTE(review): presumably it only reports the detected platform (the recorded
# output shows an arm64 macOS platform string) -- confirm whether it also
# raises or changes any settings when the platform is not ARM.
platform_check()

We're Armed: macOS-13.1-arm64-i386-64bit


## intermediate_SARC


In [None]:
# SARC test sample that every saved checkpoint is scored against
SARC_df = pd.read_csv(
    'data/SARC/SARC_preped_sampled_test.csv',
)

# top-level save directories whose checkpoints will be evaluated
top_model_dirs = [
    "model_saves/intermediate_SARC_01",
]

In [None]:
# For each save directory, collect what `parse_model_dir` finds there and
# hand it to `evaluate_model` together with the SARC test frame and the
# results CSV path.
for model_dir in top_model_dirs:
    checkpoints = parse_model_dir(model_dir)
    evaluate_model(
        SARC_df,
        checkpoints,
        'results/intermediate_results.csv',
    )

Model: model_saves/intermediate_SARC_01/E01_A0.75_F0.71


	Test 20000: 100%|██████████| 20000/20000 [14:00<00:00, 23.79test/s]


	- Accuracy: 0.74985
	- F1: 0.7291723055269854

Model: model_saves/intermediate_SARC_01/E02_A0.75_F0.73


	Test 20000: 100%|██████████| 20000/20000 [16:06<00:00, 20.69test/s]


	- Accuracy: 0.761
	- F1: 0.7458258002765075

Model: model_saves/intermediate_SARC_01/E03_A0.76_F0.73


	Test 20000: 100%|██████████| 20000/20000 [16:38<00:00, 20.02test/s]


	- Accuracy: 0.7604
	- F1: 0.7499739121360743

Model: model_saves/intermediate_SARC_01/E04_A0.75_F0.74


	Test 20000: 100%|██████████| 20000/20000 [16:44<00:00, 19.91test/s]


	- Accuracy: 0.75925
	- F1: 0.7586829048263419

Model: model_saves/intermediate_SARC_01/E05_A0.75_F0.73


	Test 20000: 100%|██████████| 20000/20000 [19:01<00:00, 17.53test/s]


	- Accuracy: 0.75285
	- F1: 0.7490480783875718

Model: model_saves/intermediate_SARC_01/E06_A0.75_F0.73


	Test 20000: 100%|██████████| 20000/20000 [16:45<00:00, 19.89test/s]


	- Accuracy: 0.7444
	- F1: 0.7376308766167111

Model: model_saves/intermediate_SARC_01/E07_A0.75_F0.73


	Test 20000: 100%|██████████| 20000/20000 [16:30<00:00, 20.20test/s]


	- Accuracy: 0.74475
	- F1: 0.743377067310109

Model: model_saves/intermediate_SARC_01/E08_A0.74_F0.73


	Test 20000: 100%|██████████| 20000/20000 [17:20<00:00, 19.22test/s]


	- Accuracy: 0.74535
	- F1: 0.7485807375228316

Model: model_saves/intermediate_SARC_01/E09_A0.75_F0.74


	Test 20000: 100%|██████████| 20000/20000 [16:48<00:00, 19.83test/s]


	- Accuracy: 0.74815
	- F1: 0.7519574530949917

Model: model_saves/intermediate_SARC_01/E10_A0.74_F0.73


	Test 20000: 100%|██████████| 20000/20000 [16:13<00:00, 20.54test/s]


	- Accuracy: 0.74685
	- F1: 0.7468626568671567



## Load & Predict

### Full Test

In [5]:
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline

# checkpoint directory that both the model and the tokenizer are loaded from
PATH = 'model_saves/intermediate_IMDB_01/E04_A0.94_F0.94/'

# local_files_only=True: read from disk only, never try to download
tokenizer = RobertaTokenizer.from_pretrained(PATH, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(PATH, local_files_only=True)

# define pipeline (top_k=2 requests scores for two labels per input)
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    top_k=2,
)

In [None]:
# Read the iSarcasmEval task A (English) test file and rename its columns
# to the `text` / `label` names used by the cells below.
df = pd.read_csv(
    'data/target_semEval2022_en/iSarcasmEval-main/test/task_A_En_test.csv'
).rename(columns={'tweet': 'text', 'sarcastic': 'label'})

df.head()

In [None]:
# texts to classify, as a plain Python list
test_input = df['text'].tolist()

# one pipeline result per input text, in input order
test_output = []

# run the pipeline one text at a time, showing the running index on the bar
with tqdm(test_input, unit="test") as bar:
    for idx, text in enumerate(bar):
        bar.set_description(f"Test {idx}")
        result = pipe(text)
        test_output.append(result[0])

In [None]:
# Keep only the label of the first entry in each pipeline result.
# NOTE(review): presumably the first entry is the highest-scoring class --
# confirm against the pipeline's output ordering.
predictions = [scores[0]['label'] for scores in test_output]

print(len(predictions))

In [None]:
# store the raw label strings, then strip the "LABEL_" text and cast what
# remains to integers
df['preds'] = predictions
df['preds'] = df['preds'].str.replace("LABEL_", "").astype(int)

df.tail()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) as a sanity
# check after adding the `preds` column.
df.info()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Compare the predicted class ids with the gold labels.
# NOTE(review): this cell was previously tagged "epoch 3", while the checkpoint
# path loaded earlier in the notebook is E04 -- confirm which epoch these
# scores belong to.
gold, predicted = df['label'], df['preds']
acc = accuracy_score(gold, predicted)
f1 = f1_score(gold, predicted)

print(acc, f1, sep="\n")

In [None]:
# scratch check: how the literal 1e-05 prints, and its Python type
value = 1e-05
print(value, type(value), sep="\n")