In [34]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text
import pickle

In [35]:
#import data
headlines_annotated = pd.read_csv("./data/headlines_annotated.tsv", sep="\t")

In [36]:
#select only relevant columns
headlines_annotated = headlines_annotated[['headline', 'dominant_emotion']]

In [37]:
headlines_annotated

Unnamed: 0,headline,dominant_emotion
0,Cops in One Village Have Been Convicted of 70 ...,anger
1,DIY penis enlargements are a 'nationwide probl...,negative_surprise
2,Dam breaking: New Epstein accuser comes forward,anger
3,David Beckham gets six-month driving ban for u...,negative_surprise
4,Dead sea turtle found with spear through head ...,sadness
...,...,...
4995,‘NOW do abortion. We’ll wait.’ Ilhan Omar doub...,negative_surprise
4996,‘Triad’ Thugs Use Clubs to Punish Hong Kong’s ...,negative_surprise
4997,'For real': High school football season begins...,positive_anticipation_including_optimism
4998,100 years on: the picture that changed our vie...,positive_surprise


In [38]:
headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
negative_surprise                            893
positive_surprise                            582
anger                                        460
annoyance                                    426
fear                                         419
sadness                                      386
disgust                                      382
negative_anticipation_including_pessimism    323
positive_anticipation_including_optimism     319
joy                                          264
guilt                                        175
trust                                        124
shame                                        119
pride                                         65
love_including_like                           63
Name: count, dtype: int64

In [39]:
#replace emotions with optimism and pessimism

replace_with_optimisim = ["positive_surprise", "positive_anticipation_including_optimism", "joy", 
                          "trust", "pride", "love_including_like"]

replace_with_pessimism = ["negative_surprise", "negative_anticipation_including_pessimism", "fear",
                          "anger", "annoyance", "sadness", "disgust", "guilt", "shame"]

optimism_dict = {k: "optimism" for k in replace_with_optimisim}

pessimism_dict = {k: "pessimism" for k in replace_with_pessimism}

headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace(optimism_dict)
headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace(pessimism_dict)

headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
pessimism    3583
optimism     1417
Name: count, dtype: int64

In [40]:
#convert to binary classification
headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace({"optimism": 1, "pessimism": 0})

  headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace({"optimism": 1, "pessimism": 0})


In [41]:
#separate into individual numpy arrays
X = headlines_annotated["headline"].values
y = headlines_annotated["dominant_emotion"].values

In [42]:
#oversample the positive class
ros = RandomOverSampler(random_state=69)
X_resampled, y_resampled = ros.fit_resample(X.reshape(-1, 1), y)


In [43]:
#split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=69)

In [44]:
#export to pickle
with open("./data/X_train.pkl", "wb") as f:
    pickle.dump(X_train, f)

with open("./data/X_test.pkl", "wb") as f:
    pickle.dump(X_test, f)
    
with open("./data/y_train.pkl", "wb") as f:
    pickle.dump(y_train, f)

with open("./data/y_test.pkl", "wb") as f:
    pickle.dump(y_test, f)
    
