In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/harshil0217/BERT_headline_classifier_v2.git
import os
os.chdir("BERT_headline_classifier_v2")

Cloning into 'BERT_headline_classifier_v2'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 58 (delta 24), reused 38 (delta 14), pack-reused 0[K
Receiving objects: 100% (58/58), 934.12 KiB | 2.50 MiB/s, done.
Resolving deltas: 100% (24/24), done.


In [3]:
!pwd

/content/BERT_headline_classifier_v2


In [4]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.model_selection import train_test_split

Mounted at /content/drive


In [None]:
import os


In [5]:
#import data
headlines_annotated = pd.read_csv("./data/finance_headlines.csv")

In [7]:
headlines_annotated

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [6]:
#select only relevant columns
headlines_annotated = headlines_annotated[['headline', 'dominant_emotion']]

KeyError: "None of [Index(['headline', 'dominant_emotion'], dtype='object')] are in the [columns]"

In [None]:
headlines_annotated

Unnamed: 0,headline,dominant_emotion
0,Cops in One Village Have Been Convicted of 70 ...,anger
1,DIY penis enlargements are a 'nationwide probl...,negative_surprise
2,Dam breaking: New Epstein accuser comes forward,anger
3,David Beckham gets six-month driving ban for u...,negative_surprise
4,Dead sea turtle found with spear through head ...,sadness
...,...,...
4995,‘NOW do abortion. We’ll wait.’ Ilhan Omar doub...,negative_surprise
4996,‘Triad’ Thugs Use Clubs to Punish Hong Kong’s ...,negative_surprise
4997,'For real': High school football season begins...,positive_anticipation_including_optimism
4998,100 years on: the picture that changed our vie...,positive_surprise


In [None]:
headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
negative_surprise                            893
positive_surprise                            582
anger                                        460
annoyance                                    426
fear                                         419
sadness                                      386
disgust                                      382
negative_anticipation_including_pessimism    323
positive_anticipation_including_optimism     319
joy                                          264
guilt                                        175
trust                                        124
shame                                        119
pride                                         65
love_including_like                           63
Name: count, dtype: int64

In [None]:
#replace emotions with optimism and pessimism

replace_with_optimisim = ["positive_surprise", "positive_anticipation_including_optimism"]

replace_with_pessimism = ["negative_surprise", "negative_anticipation_including_pessimism"]


optimism_dict = {k: "optimism" for k in replace_with_optimisim}

pessimism_dict = {k: "pessimism" for k in replace_with_pessimism}

headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace(optimism_dict)
headlines_annotated["dominant_emotion"] = headlines_annotated["dominant_emotion"].replace(pessimism_dict)

headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
pessimism              1216
optimism                901
anger                   460
annoyance               426
fear                    419
sadness                 386
disgust                 382
joy                     264
guilt                   175
trust                   124
shame                   119
pride                    65
love_including_like      63
Name: count, dtype: int64

In [None]:
#see value counts
headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
pessimism              1216
optimism                901
anger                   460
annoyance               426
fear                    419
sadness                 386
disgust                 382
joy                     264
guilt                   175
trust                   124
shame                   119
pride                    65
love_including_like      63
Name: count, dtype: int64

In [None]:
#convert emotions besides pessimism and optimism into "other"
headlines_annotated.loc[~headlines_annotated["dominant_emotion"].isin(["optimism", "pessimism"]), "dominant_emotion"] = "other"

In [None]:
#see value counts
headlines_annotated["dominant_emotion"].value_counts()

dominant_emotion
other        2883
pessimism    1216
optimism      901
Name: count, dtype: int64

In [None]:
#undersample the majority class
rus = RandomUnderSampler(random_state=69)
X_resampled, y_resampled = rus.fit_resample(headlines_annotated["headline"].values.reshape(-1, 1), headlines_annotated["dominant_emotion"].values)


In [None]:
#merge the two arrays back into pandas dataframe
headlines_resampled = pd.DataFrame(np.hstack((X_resampled, y_resampled.reshape(-1, 1))), columns=["headline", "dominant_emotion"])


In [None]:
headlines_resampled

Unnamed: 0,headline,dominant_emotion
0,Elizabeth Warren Lays Out a Theory of Change a...,optimism
1,Emojis That Don’t Exist but Should,optimism
2,Enbridge Is Behind This Front Group Pushing th...,optimism
3,Europe’s Dream: Escaping the Dictatorship of t...,optimism
4,"Getting 'California Sober' Showed Me a Kinder,...",optimism
...,...,...
2698,Rubio says progressive Democrats are 'fully' A...,pessimism
2699,"Segregation Is Back, and It’s Coming From the ...",pessimism
2700,Donald Trump goes completely off the deep end ...,pessimism
2701,"I Make $1,000 a Week Writing Essays for Lazy S...",pessimism


In [None]:
#see value counts
headlines_resampled["dominant_emotion"].value_counts()

dominant_emotion
optimism     901
other        901
pessimism    901
Name: count, dtype: int64

In [None]:
#checking the length of X
len(X_resampled)

2703

In [None]:
#split into training, testing, and validation sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=69)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=69)

In [None]:
#merge the X and ys together into pandas dataframe
train = pd.DataFrame(np.hstack((X_train, y_train.reshape(-1, 1))), columns=["headline", "dominant_emotion"])
test = pd.DataFrame(np.hstack((X_test, y_test.reshape(-1, 1))), columns=["headline", "dominant_emotion"])
val = pd.DataFrame(np.hstack((X_val, y_val.reshape(-1, 1))), columns=["headline", "dominant_emotion"])


In [None]:
#export to csv
train.to_csv("./data/train.csv", index=False)
test.to_csv("./data/test.csv", index=False)
val.to_csv("./data/val.csv", index=False)
