In [None]:
# !pip install spacy datasets -qq
# !pip install git+https://github.com/huggingface/transformers

In [None]:
# ! python -m spacy download en_core_web_sm

In [None]:
import os
import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer

In [None]:
# Library setups
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this runtime. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

# Load spaCy's small English pipeline; preprocess_emotion_data uses its
# tokenizer through the module-level name `nlp`.
nlp = spacy.load("en_core_web_sm")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Constants and paths
# Project folder on the mounted Drive.
# NOTE(review): the training cells read "Data/..." and write "Models/..."
# relative to the working directory — confirm the working directory is
# ROOT_PATH when they run.
ROOT_PATH = '/content/drive/MyDrive/Colab Notebooks/Allen_NLP_hackathon'
# Folder handed to preprocess_emotion_data as its `root` argument.
EMOTION_DATA = os.path.join(ROOT_PATH, 'Data', 'Emotion_Regression')
# Presumably where trained models are kept; not referenced by any other
# visible cell — TODO confirm it is still needed.
BERT_MODELS = os.path.join(ROOT_PATH, 'Models')

# Emotion names iterated by preprocess_emotion_data; each name is substituted
# into the dataset file names.
emotions = ['anger', 'fear', 'joy', 'sadness']

In [None]:
# Space out tokens and extract intensity scores for BERT
def _space_out_tokens(text, tokenizer):
  """Return `text` rebuilt with exactly one space between the tokens of `tokenizer(text)`."""
  return ' '.join(token.text for token in tokenizer(text))


def preprocess_emotion_data(root, emotion_names=None, tokenizer=None):
  """Convert the per-emotion tab-separated files into CSVs with spaced-out tokens.

  For every emotion and every split (train / val / test) the matching
  tab-separated file under `root` is read, only the 'Tweet' and
  'Intensity Score' columns are kept and renamed to 'sentence1' and 'label',
  each tweet is re-joined with single spaces between its tokens, and the
  result is written to '<root>/<split>_<emotion>.csv' without the index.

  Args:
    root: Directory that contains the 'training', 'development' and
      'test-gold' sub-folders; the output CSVs are written directly into it.
    emotion_names: Iterable of emotion names to process. Defaults to the
      module-level `emotions` list.
    tokenizer: Callable that maps a string to an iterable of objects exposing
      a `.text` attribute. Defaults to `nlp.tokenizer`.

  Raises:
    FileNotFoundError: If one of the expected input files does not exist.
    ValueError: If an input file lacks the 'Tweet' or 'Intensity Score' column.
  """
  # Resolve the defaults at call time so the notebook-level objects are only
  # required when the caller does not supply their own.
  if emotion_names is None:
    emotion_names = emotions
  if tokenizer is None:
    tokenizer = nlp.tokenizer

  for emotion in emotion_names:
    # The training files carry no '2018-' prefix, unlike the dev/test files.
    # Each name is formatted exactly once, so braces inside `root` are left
    # untouched.
    split_files = {
        'train': os.path.join(root, 'training', 'EI-reg-En-{}-train.txt'.format(emotion)),
        'val': os.path.join(root, 'development', '2018-EI-reg-En-{}-dev.txt'.format(emotion)),
        'test': os.path.join(root, 'test-gold', '2018-EI-reg-En-{}-test-gold.txt'.format(emotion)),
    }

    for split_type, filepath in split_files.items():
      df = pd.read_csv(filepath, sep='\t', header=0, usecols=['Tweet', 'Intensity Score'])
      df = df.rename(columns={'Tweet': 'sentence1', 'Intensity Score': 'label'})

      df['sentence1'] = df['sentence1'].apply(lambda sent: _space_out_tokens(sent, tokenizer))

      out_path = os.path.join(root, '{}_{}.csv'.format(split_type, emotion))
      df.to_csv(out_path, index=False)

  print('Done!')

# Generate the <split>_<emotion>.csv files inside EMOTION_DATA.
preprocess_emotion_data(EMOTION_DATA)

SyntaxError: ignored

In [None]:
# Work from the project root so the relative "Data/..." and "Models/..." paths
# passed to run_glue.py resolve inside the same folder that
# preprocess_emotion_data wrote its CSVs into. Using the absolute ROOT_PATH
# (rather than a path relative to the current directory) also keeps this cell
# safe to re-run.
# NOTE(review): assumes run_glue.py lives in ROOT_PATH — confirm.
os.chdir(ROOT_PATH)

In [None]:
# Invokes run_glue.py with no arguments.
# NOTE(review): presumably a leftover smoke test that the script is reachable
# from the working directory; the training cells pass the actual flags —
# confirm whether this cell is still needed.
!python run_glue.py

In [None]:
# Run repurposed GLUE script to generate models
# ANGER MODEL
!python run_glue.py \
--train_file "Data/Emotion_Regression/train_anger.csv" \
--validation_file "Data/Emotion_Regression/val_anger.csv" \
--test_file "Data/Emotion_Regression/test_anger.csv" \
--model_name_or_path bert-base-uncased \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-4 \
--num_train_epochs 1 \
--output_dir "Models/anger/" \
--overwrite_output_dir \
--logging_steps 50 \
--logging_first_step True \

In [None]:
# FEAR MODEL
!python run_glue.py \
--train_file "Data/Emotion_Regression/train_fear.csv" \
--validation_file "Data/Emotion_Regression/val_fear.csv" \
--test_file "Data/Emotion_Regression/test_fear.csv" \
--model_name_or_path bert-base-uncased \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-4 \
--num_train_epochs 1 \
--output_dir "Models/fear/" \
--overwrite_output_dir \
--logging_steps 50 \
--logging_first_step True \

In [None]:
# SADNESS MODEL
!python run_glue.py \
--train_file "Data/Emotion_Regression/train_sadness.csv" \
--validation_file "Data/Emotion_Regression/val_sadness.csv" \
--test_file "Data/Emotion_Regression/test_sadness.csv" \
--model_name_or_path bert-base-uncased \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-4 \
--num_train_epochs 1 \
--output_dir "Models/sadness/" \
--overwrite_output_dir \
--logging_steps 50 \
--logging_first_step True \

In [None]:
# JOY MODEL
!python run_glue.py \
--train_file "Data/Emotion_Regression/train_joy.csv" \
--validation_file "Data/Emotion_Regression/val_joy.csv" \
--test_file "Data/Emotion_Regression/test_joy.csv" \
--model_name_or_path bert-base-uncased \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-4 \
--num_train_epochs 1 \
--output_dir "Models/joy/" \
--overwrite_output_dir \
--logging_steps 50 \
--logging_first_step True \