<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/RtGender_Annotations_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load

In [None]:
# hugging face modules
! pip install datasets transformers seqeval
import datasets
import transformers
import seqeval 

In [None]:
!pip install keras_tuner

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
import seaborn as sns

from collections import Counter

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from keras.models import Sequential
from tensorflow.keras.layers import TextVectorization
import tensorflow.keras.backend as K
# for hyperparameter tunning
import keras_tuner as kt
from keras_tuner import HyperModel
import keras_tuner as kt
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# import sklearn to calculate the metrics
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# import bert from huggingface
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Hello I'm a [MASK] model.")


In [4]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
#mount from Google Drive 

try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


In [22]:
import os 
save_path = "/content/drive/MyDrive/w266/bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/w266/bert_base_uncased/tokenizer_config.json',
 '/content/drive/MyDrive/w266/bert_base_uncased/special_tokens_map.json',
 '/content/drive/MyDrive/w266/bert_base_uncased/vocab.txt',
 '/content/drive/MyDrive/w266/bert_base_uncased/added_tokens.json')

In [10]:
annotations = pd.read_parquet('/content/drive/MyDrive/w266/annotations.parquet.gzip')


# Helper Functions

In [13]:
#Posts, Responses, & Annotations: op_gender (M:0, W:1)
gender_binary_mappings = {"M": 0, "W": 1}
gender_labels = ["M", "W"]

# Annotations sentiment
# group mixed and neutral 
sentiment_mappings = {'Positive': 2, 'Mixed': 1, 'Neutral': 1, 'Negative':0}
sentiment_labels = ['Positive', 'Mixed/Neutral', 'Negative']


In [35]:
def convert_data_to_examples(data, mapping_dict, DATA_COLUMN, LABEL_COLUMN): 
  
  # identify x and y columns, map label columns to numeric
  X = data[DATA_COLUMN]
  y = data[LABEL_COLUMN].map(mapping_dict)

  #1st split: 70 train /30 test
  train_X, test_X, train_y, test_y = train_test_split(
  X, y, test_size = .3, random_state = 1222, stratify = y)

  #2nd split: test data 50/50 into test/dev
  test_X, dev_X, test_y, dev_y = train_test_split(
  test_X, test_y, test_size = .5, random_state = 1222, stratify = test_y)
  train_InputExamples = transformers.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                      text_a = train_X, 
                                      text_b = None,
                                      label = train_y)

  dev_InputExamples = transformers.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                  text_a = dev_X, 
                                  text_b = None,
                                  label = dev_y)
  
  test_InputExamples = transformers.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                    text_a = test_X, 
                                    text_b = None,
                                    label = test_y)
  
  return train_InputExamples, dev_InputExamples, test_InputExamples


## Split annotations for gender and sentiment analyses
Reformat for keras

In [36]:
train_InputExamples, dev_InputExamples, test_InputExamples = convert_data_to_examples(annotations, 
                                                                                      gender_binary_mappings,
                                                                                      'post_text', 
                                                                                      'op_gender')

InputExample(guid=None, text_a=2121    NEW POLLS show @realDonaldTrump is back on top...
1260    Happy Thanksgiving Canada, sorry we're a month...
882     Well folks please welcome into this world #6an...
4356    Almost back home after a full day in Billings....
4735    Please keep your thoughts and prayers with our...
                              ...                        
7611    Thanks for the FB! Badass numbers, and lovin' ...
1542                Some #BTS shots from tonight's Grimm.
8342    I got a two week pass to the Y. Guess who will...
8807    Thanks for the follow back! Thanks for keeping...
3739    Capitol switch board is at capacity--phones ri...
Name: post_text, Length: 10746, dtype: object, text_b=None, label=2121    0
1260    0
882     0
4356    0
4735    1
       ..
7611    0
1542    1
8342    1
8807    0
3739    1
Name: op_gender, Length: 10746, dtype: int64)

In [None]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

# PARKING LOT

In [29]:
# import helper functions from baseline model
%cd /content/drive/My\ Drive/w266/
from Annotations-BoW import tokenize_Xtrain_and_Xdev

SyntaxError: ignored