<a href="https://colab.research.google.com/github/jeanlucjackson/2016_presidential_political_tv_ads/blob/main/notebooks/JJ/230724_preprocessing_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Setup

## Installs

In [2]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Imports

In [46]:
from os import listdir
from os.path import isfile, join

import csv
from pprint import pprint
import string

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import transformers
from transformers import pipeline, BertTokenizer, TFBertModel

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import tensorflow as tf
from tensorflow import keras

from joblib import dump, load

%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
### FROM PLOT EMBED NOTEBOOK

# from os import listdir
# from os.path import isfile, join

# import csv
# import string

# import numpy as np
# import pandas as pd

# import matplotlib.pyplot as plt

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Conv1D, GlobalMaxPooling1D
# from tensorflow.keras.models import Model
# import tensorflow.keras.backend as K

# import transformers
# from transformers import BertTokenizer, TFBertModel

# from sklearn.preprocessing import RobustScaler, MinMaxScaler, FunctionTransformer
# from sklearn.pipeline import Pipeline

# from joblib import dump, load

# %matplotlib inline

## Google Drive Mounting

In [4]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Notebook Settings

### `pandas`

In [5]:
pd.set_option('display.max_colwidth', 300)

### GPU Management

In [14]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
try:
  GPU_ordinal = int(gpus[0].name[-1])
  print(f"GPU_ordinal: {GPU_ordinal}")
  print("Turning on Logging device placement")
  tf.debugging.set_log_device_placement(False)
  tf.test.gpu_device_name()

except:
  print("Not using GPU runtime. Consider changing runtime type.")
  GPU_ordinal = 'cpu'

[]
Not using GPU runtime. Consider changing runtime type.


# INPUT: Temporary Example

In [34]:
# Data inputs

input = {'plotInformation': '12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital. His grandmother often visits, and suggests he come live with her in the event of his mother passing. One day, he wins the lottery and buys himself a new beautiful home!',
        'writers': ['tinah', 'Steven Spielberg', 'A. Martin Zweiback', 'tinah'],
        'genre': ['Crime', ' Drama', ' Mystery', 'Sci-Fi'],
        'topics': [u'fun', u'friendship', u'success'],
        'adaptation': False,
        'releaseYear': u'2023',
        'releaseQuarter': u'4'}

In [35]:
# Convert to pandas dataframe
plot = input.get('plotInformation')
writer = input.get('writers')
genres = input.get('genre')
topics = input.get('topics')
based_on = input.get('adaptation')
releaseYear = input.get('releaseYear')
releaseQuarter = input.get('releaseQuarter')

df = pd.DataFrame({
        'plot': [plot],
        'writer': [writer],
        'genres': [genres],
        'topics': [topics],
        'based_on': [based_on],
        'releaseYear': [releaseYear],
        'releaseQuarter': [releaseQuarter]
    })
df

Unnamed: 0,plot,writer,genres,topics,based_on,releaseYear,releaseQuarter
0,"12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital. His grandmother often visits, and suggests he come live with her in the event of his mother passing. One day, he wins the lottery and b...","[tinah, Steven Spielberg, A. Martin Zweiback, tinah]","[Crime, Drama, Mystery, Sci-Fi]","[fun, friendship, success]",False,2023,4


# INPUT: Final Plot Column

In [36]:
# Final Plot column
plot_column = 'plot'

# Sentiment Analysis

In [37]:
#### Load sentiment model
sentiment_pipeline = pipeline("sentiment-analysis", device=GPU_ordinal)

#### Split plot into sentences
df['plot_sentences'] = df[plot_column].apply(sent_tokenize)

#### Sentiment by sentence
df['sentiment_outputs'] = df.plot_sentences.apply(sentiment_pipeline)
df['sentiment_labels'] = df.sentiment_outputs.apply(lambda x: [1 if output['label'] == 'POSITIVE' else 0 for output in x])
df['sentiment_scores'] = df.sentiment_outputs.apply(lambda x: [output['score'] for output in x])

#### Average Sentiment
df['average_sentiment'] = df.sentiment_labels.apply(lambda x: sum(x)/len(x))

#### Weighted-Average Sentiment
def weighted_average_cols(row, values_col, weights_col):
  values = list(row[values_col])
  weights = list(row[weights_col])
  numerator = [w * v for w, v in zip(weights, values)]
  denominator = sum(values)
  if denominator != 0:
    return sum(numerator) / sum(values)
  else:
    return 0

df['average_weighted_sentiment'] = df.apply(lambda row: weighted_average_cols(row, 'sentiment_labels', 'sentiment_scores'), axis=1)

#### Primary Sentiment
def primary_sentiment(row, avg_col, weighted_avg_col):
  if row[avg_col] == 0.5:
    # If there's a tie with 0.50, break it with weighted average
    if row[weighted_avg_col] > 0.5:
      return 'POSITIVE'
    else:
      return 'NEGATIVE'
  elif row[avg_col] > 0.5:
    return 'POSITIVE'
  else:
    return 'NEGATIVE'

df['primary_sentiment'] = df.apply(lambda x: primary_sentiment(x, 'average_sentiment', 'average_weighted_sentiment'), axis=1)

#### Sentiment Shifts
def sentiment_shifts(sents):
  sents = list(sents)
  shifts = 0
  this_sent = sents.pop(0)
  while sents:
    next_sent = sents[0]
    if next_sent != this_sent:
      shifts += 1
    this_sent = sents.pop(0)
  return shifts

df['sentiment_shifts'] = df.sentiment_labels.apply(sentiment_shifts)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [38]:
df

Unnamed: 0,plot,writer,genres,topics,based_on,releaseYear,releaseQuarter,plot_sentences,sentiment_outputs,sentiment_labels,sentiment_scores,average_sentiment,average_weighted_sentiment,primary_sentiment,sentiment_shifts
0,"12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital. His grandmother often visits, and suggests he come live with her in the event of his mother passing. One day, he wins the lottery and b...","[tinah, Steven Spielberg, A. Martin Zweiback, tinah]","[Crime, Drama, Mystery, Sci-Fi]","[fun, friendship, success]",False,2023,4,"[12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital., His grandmother often visits, and suggests he come live with her in the event of his mother passing., One day, he wins the lottery an...","[{'label': 'POSITIVE', 'score': 0.9982689619064331}, {'label': 'POSITIVE', 'score': 0.9840226173400879}, {'label': 'POSITIVE', 'score': 0.9997602105140686}]","[1, 1, 1]","[0.9982689619064331, 0.9840226173400879, 0.9997602105140686]",1.0,0.994017,POSITIVE,0


# Emotion Analysis

In [39]:
#### Load emotion model
emotion_pipeline = pipeline("text-classification", model='j-hartmann/emotion-english-distilroberta-base', device=GPU_ordinal)

#### Emotion by sentence
df['emotion_outputs'] = df.plot_sentences.apply(lambda x: [emotion_pipeline(sent) for sent in x])
df['emotion_labels'] = df.emotion_outputs.apply(lambda x: [output[0]['label'] for output in x])
df['emotion_scores'] = df.emotion_outputs.apply(lambda x: [output[0]['score'] for output in x])

#### Primary Emotions
def primary_emotion_calculator(row, emotion_labels_col, emotion_scores_col, top=2, ignore_neutral=False):
  emotions = list(row[emotion_labels_col])
  emotions_weighted = list(row[emotion_scores_col])

  if ignore_neutral and 'neutral' in emotions:
    ids = [pair[0] for pair in enumerate(emotions) if pair[1] == 'neutral']
    emotions = [em for em in emotions if em != 'neutral']
    emotions_weighted = [pair[1] for pair in enumerate(emotions_weighted) if pair[0] not in ids]

  if len(emotions) == 0:
    return None

  emo_counts = {k: 0 for k in emotions}
  emo_weighted_counts = {k: 0 for k in emotions}

  for emo, weight in zip(emotions, emotions_weighted):
    emo_counts.update(
        {emo: emo_counts[emo] + 1}
    )
    emo_weighted_counts.update(
        {emo: emo_weighted_counts[emo] + weight}
    )

  max_count = max(emo_counts.values())
  # max_emos = {emo: (count, emo_weighted_counts[emo]) for emo, count in emo_counts.items() if count == max_count}
  max_emos = {emo: (count, emo_weighted_counts[emo]) for emo, count in emo_counts.items()}

  # Return `top` number of emotions, sorted by weighted counts for tiebreaking
  return sorted(max_emos, key=lambda x: max_emos[x][1], reverse=True)[:top]


df['primary_nonneutral_emotions'] = df.apply(lambda x: primary_emotion_calculator(x,
                                                                                  'emotion_labels',
                                                                                  'emotion_scores',
                                                                                  2,
                                                                                  ignore_neutral=True),
                                             axis=1)


In [40]:
df

Unnamed: 0,plot,writer,genres,topics,based_on,releaseYear,releaseQuarter,plot_sentences,sentiment_outputs,sentiment_labels,sentiment_scores,average_sentiment,average_weighted_sentiment,primary_sentiment,sentiment_shifts,emotion_outputs,emotion_labels,emotion_scores,primary_nonneutral_emotions
0,"12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital. His grandmother often visits, and suggests he come live with her in the event of his mother passing. One day, he wins the lottery and b...","[tinah, Steven Spielberg, A. Martin Zweiback, tinah]","[Crime, Drama, Mystery, Sci-Fi]","[fun, friendship, success]",False,2023,4,"[12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital., His grandmother often visits, and suggests he come live with her in the event of his mother passing., One day, he wins the lottery an...","[{'label': 'POSITIVE', 'score': 0.9982689619064331}, {'label': 'POSITIVE', 'score': 0.9840226173400879}, {'label': 'POSITIVE', 'score': 0.9997602105140686}]","[1, 1, 1]","[0.9982689619064331, 0.9840226173400879, 0.9997602105140686]",1.0,0.994017,POSITIVE,0,"[[{'label': 'sadness', 'score': 0.7424512505531311}], [{'label': 'neutral', 'score': 0.544970691204071}], [{'label': 'joy', 'score': 0.6234496235847473}]]","[sadness, neutral, joy]","[0.7424512505531311, 0.544970691204071, 0.6234496235847473]","[sadness, joy]"


# Plot Embedding

## Download BERT model

In [41]:
# # BERT base CASED
bert_cased_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_cased_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

## Predict ROI with `plot_embed` Model

In [24]:
! ls "drive/MyDrive/210 Capstone Team Drive/models/plot_embedding/"

230709_plot_embed_class_300_epoch30.keras
230709_plot_embed_class_300_epoch4.keras
230709_plot_embed_regress_class_300_epoch10.keras
230710_plot_embed_regress_class_scaled_epoch30.keras
230710_plot_embed_regress_class_scaled_epoch8.keras
230710_roi_train_pipeline.pkl
230714_plot_embed_1993_LogMinMax_300-300-300_MSE_epoch8.keras
230714_plot_embed_2013_LogMinMax_100_MAE_epoch8.keras
230714_plot_embed_2013_LogMinMax_300-300-300_MSE_epoch8.keras
230714_plot_embed_2013_MinMaxScaler_epoch8.keras
230714_roi_train_pipeline_1993-2013.pkl
230714_roi_train_pipeline_2013.pkl


In [52]:
#### Load Embed Model
model_dir = "drive/MyDrive/210 Capstone Team Drive/models/plot_embedding/"
model_filepath = model_dir + "230714_plot_embed_2013_LogMinMax_300-300-300_MSE_epoch8.keras"

plot_embed_model = keras.models.load_model(model_filepath,
                                           custom_objects={'TFBertModel': transformers.TFBertModel})


#### Load SciKit-Learn Pipeline for transformation inversing
model_dir = "drive/MyDrive/210 Capstone Team Drive/models/plot_embedding/"
pipeline_filepath = model_dir + "230714_roi_train_pipeline_2013.pkl"

# Log Transform functions
def log_transform(x):
  return np.log(x + 1)

def inverse_log_transform(x):
  return np.exp(x) - 1

roi_train_pipe = load(pipeline_filepath)

# Scikit Learn Pipeline to do inverse of Log Transform and MinMaxScale transforming
def roi_inverse_transform(arr, train_pipe=roi_train_pipe):
  return train_pipe.inverse_transform(arr.reshape(-1, 1)).flatten()


#### Predict ROI from Plot
def predict_roi_from_plot(row, plot_col, roi_binary_col, roi_col, bert_tokenizer, plot_model):
  """
  To be used in df.apply() method by row.

  Function to call loaded model and predict:
    - ROI binary classification
    - ROI regression

  Takes as input:
    - This row
    - Column names
    - Loaded BERT model (bert-base-cased)
    - Loaded PlotEmbed model (bert_cls_regress_class_model)
  """

  raw_plot = row[plot_col]
  tokenized_plot = bert_tokenizer(raw_plot,
                                  max_length=512,
                                  truncation=True,
                                  padding='max_length',
                                  return_tensors='tf')

  # roi_binary = np.array(row[roi_binary_col])
  # roi = np.array(row[roi_col])

  pred_roi_binary_prob, pred_roi_transform = plot_model.predict(
      [tokenized_plot.input_ids,
       tokenized_plot.token_type_ids,
       tokenized_plot.attention_mask],
      verbose=0 #silent, do not show progress bars
  )

  pred_roi_binary_prob = pred_roi_binary_prob.flatten()[0]

  # Use Scikit roi_train_pipe to inverse_transform ROI predictions
  pred_roi = roi_inverse_transform(pred_roi_transform, roi_train_pipe)[0]

  # print(pred_roi_transform, pred_roi)

  # Convert probability to binary outcome
  pred_roi_binary = round(pred_roi_binary_prob)

  return pred_roi_binary, pred_roi_binary_prob, pred_roi

df[['pred_roi_binary', 'pred_roi_binary_prob', 'pred_roi']] = df.apply(lambda row: predict_roi_from_plot(row,
                                                                                                         plot_col=plot_column,
                                                                                                         roi_binary_col='roi_binary',
                                                                                                         roi_col='roi',
                                                                                                         bert_tokenizer=bert_cased_tokenizer,
                                                                                                         plot_model=plot_embed_model),
                                                                       axis=1,
                                                                       result_type='expand')



In [53]:
df

Unnamed: 0,plot,writer,genres,topics,based_on,releaseYear,releaseQuarter,plot_sentences,sentiment_outputs,sentiment_labels,...,average_weighted_sentiment,primary_sentiment,sentiment_shifts,emotion_outputs,emotion_labels,emotion_scores,primary_nonneutral_emotions,pred_roi_binary,pred_roi_binary_prob,pred_roi
0,"12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital. His grandmother often visits, and suggests he come live with her in the event of his mother passing. One day, he wins the lottery and b...","[tinah, Steven Spielberg, A. Martin Zweiback, tinah]","[Crime, Drama, Mystery, Sci-Fi]","[fun, friendship, success]",False,2023,4,"[12-year-old Conor OMalley has a close bond with his seriously ill mother and maintains the household during her regular chemotherapy treatments at the hospital., His grandmother often visits, and suggests he come live with her in the event of his mother passing., One day, he wins the lottery an...","[{'label': 'POSITIVE', 'score': 0.9982689619064331}, {'label': 'POSITIVE', 'score': 0.9840226173400879}, {'label': 'POSITIVE', 'score': 0.9997602105140686}]","[1, 1, 1]",...,0.994017,POSITIVE,0,"[[{'label': 'sadness', 'score': 0.7424512505531311}], [{'label': 'neutral', 'score': 0.544970691204071}], [{'label': 'joy', 'score': 0.6234496235847473}]]","[sadness, neutral, joy]","[0.7424512505531311, 0.544970691204071, 0.6234496235847473]","[sadness, joy]",0.0,0.448076,1.438755
