## Inicializacion

Importamos librerias a utilizar

In [1]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

Descargamos dataset

In [2]:
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

--2022-11-03 10:29:31--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.176.208, 142.250.80.112, 142.250.80.16, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.176.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘data/full_dataset/goemotions_1.csv.18’


2022-11-03 10:29:31 (51.7 MB/s) - ‘data/full_dataset/goemotions_1.csv.18’ saved [14174600/14174600]

--2022-11-03 10:29:32--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.40.144, 142.250.64.112, 142.250.176.208, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.40.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14173154 (14M) [application/octet-stream]
Saving t

## Preprocesamiento

In [3]:
df_pre_1 = pd.read_csv('data/full_dataset/goemotions_1.csv')
df_pre_2 = pd.read_csv('data/full_dataset/goemotions_2.csv')
df_pre_3 = pd.read_csv('data/full_dataset/goemotions_3.csv')

In [4]:
df0 = pd.concat([df_pre_1, df_pre_2, df_pre_3]).reset_index(drop=True)

In [5]:
df0 = df0[0:100000]

In [6]:
df0


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1.548084e+09,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,I said nothing of the sort.,edwqt3w,NeatRefrigerator,badunitedkingdom,t3_af96mh,t1_edwoxmr,1.547317e+09,41,False,0,...,0,0,0,0,0,0,0,0,0,0
99996,Damn that must feel great!,efeej4j,UncomfortableRun,seduction,t3_alh1lc,t3_alh1lc,1.548891e+09,40,False,0,...,0,0,0,0,0,0,0,0,0,0
99997,"It's givin' off that disco vibe, and if I unde...",eezotoc,BrotherChe,WatchPeopleDieInside,t3_ajtrfb,t1_eezl315,1.548479e+09,57,False,0,...,0,0,0,0,1,0,0,0,0,0
99998,"That's awful. I'm glad [NAME] doing better, bu...",eeye1pi,59926,MorbidReality,t3_ajmlwx,t1_eexvbz4,1.548445e+09,46,False,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
POSITIVE_EMOTIONS = [
    'admiration', 'amusement', 'approval', 'caring',
    'excitement', 'gratitude','joy', 'love', 'optimism'
]

NEGATIVE_EMOTIONS = [
    'anger', 'annoyance', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'fear', 'grief', 'sadness','remorse'
]

In [8]:
pos_sum_col = df0[POSITIVE_EMOTIONS].sum(axis=1)
neg_sum_col = df0[NEGATIVE_EMOTIONS].sum(axis=1)

positive_col, negative_col, del_rows_col = [], [], []

for i in range(df0.shape[0]):
    if (pos_sum_col[i] < neg_sum_col[i]):
        positive_col.append(0)
        negative_col.append(1)
    elif (neg_sum_col[i] < pos_sum_col[i]):
        positive_col.append(1)
        negative_col.append(0)
    else:
        del_rows_col.append(i)

In [9]:
positive_col = pd.Series(positive_col)
negative_col = pd.Series(negative_col)

In [10]:
text_col = df0['text'].drop(labels=del_rows_col).reset_index()['text']

In [11]:
frame = {
    'text': text_col,
    'positive_emotion': positive_col,
    'negative_emotion': negative_col
}

df = pd.DataFrame(frame)

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\s+', ' ', text)

    return text

In [13]:
corpus = []
for index, row in df.iterrows():
    og_text = row['text']
    cln_text = clean_text(og_text)
    corpus.append(cln_text)

In [14]:
vectorizer = CountVectorizer(stop_words='english')
docterm_matix = vectorizer.fit_transform(corpus).toarray()

In [15]:
features = vectorizer.get_feature_names_out()

In [16]:
X = docterm_matix

In [17]:
df.columns[-2:]

Index(['positive_emotion', 'negative_emotion'], dtype='object')

In [18]:
Y = []

EMOTIONAL_CATEGORIES = df.columns[-2:]

for index, row in df.iterrows():
    row_y = []
    for emotion in EMOTIONAL_CATEGORIES:
        row_y.append(row[emotion])
    Y.append(row_y)

Y = np.array(Y)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

## Modelación

In [20]:
max_features = 2000000
embedding_dim = 128
sequence_length = 500

kernel_init = tf.keras.initializers.HeNormal()
bias_init = tf.keras.initializers.Zeros()

model = keras.Sequential(
    [
        keras.Input(shape=(x_train.shape[1],)),
        #layers.Embedding(max_features, embedding_dim),
        #layers.Dropout(0.5),
        #layers.Bidirectional(layers.LSTM(20,dropout=0.6)),
        #layers.Conv1D(32, 10, padding="valid", activation="selu"),
        #layers.GlobalMaxPooling1D(),
        layers.Dense(6, activation="relu"),
        layers.Dense(6, activation="relu"),
        layers.Dense(6, activation="relu"),
        layers.Dense(6, activation="relu"),
        #layers.Dense(64, activation="relu"),
        #layers.Dropout(0.2),
        layers.Dense(y_train.shape[1], activation="sigmoid")
    ]
)

In [21]:
opt = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.BinaryCrossentropy()

model.compile(loss=loss_fn, optimizer=opt, metrics=['accuracy'])

In [22]:
model.fit(x_train, y_train, epochs=3, validation_split=0.1)

2022-11-03 10:29:55.658536: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 6725379112 exceeds 10% of free system memory.
2022-11-03 10:30:02.618515: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 6725379112 exceeds 10% of free system memory.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3c026eaf10>

In [24]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

2022-11-03 10:30:54.904226: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 3680757176 exceeds 10% of free system memory.
2022-11-03 10:30:58.743241: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 3680757176 exceeds 10% of free system memory.


Test loss: 0.48770761489868164
Test accuracy: 0.8014702200889587
