In [1]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split

In [2]:
# Base corpus: keep only the text and label columns, restricted to the rows
# whose label is 0.
df = (
    pd.read_csv('base_noticias.csv')
    .loc[lambda frame: frame.Tipo == 0, ['Noticia', 'Tipo']]
)

In [3]:
# Culimercio articles: keep only the article text and label every row with 1.
df_f1 = pd.read_csv('news_Culimercio.csv')[['text']].assign(tipo=1)

In [4]:
def _drop_last_fragment(text):
    """Return ``text`` without the fragment that follows its last period.

    The text is split into runs of non-period characters. When there is more
    than one run, the final run is discarded and the rest are re-joined with
    a single '.'; when there is exactly one run, it is returned as is.

    Values that are not strings (e.g. NaN from an empty CSV cell) and strings
    with no non-period characters are returned unchanged instead of raising.
    """
    if not isinstance(text, str):
        return text
    # Raw string: '.' needs no escaping inside a character class, and this
    # avoids Python's invalid-escape-sequence warning for '\.'.
    fragments = re.findall(r'[^.]+', text)
    if len(fragments) > 1:
        return '.'.join(fragments[:-1])
    return fragments[0] if fragments else text


df_new = [_drop_last_fragment(text) for text in df_f1.text]

# Assign the list positionally so the result does not depend on df_f1's index.
df_f1['text'] = df_new

df_f1.columns = ['Noticia','Tipo']

df = pd.concat([df,df_f1])

In [5]:
# Uninverso articles: keep only the text, label every row with 1, align the
# column names with the accumulated frame, and append.
df_f1 = (
    pd.read_csv('news_Uninverso.csv')[['text']]
    .assign(tipo=1)
    .rename(columns={'text': 'Noticia', 'tipo': 'Tipo'})
)
df = pd.concat([df, df_f1])

In [6]:
# 80/20 train/test partition; the fixed random_state keeps the split
# repeatable between runs.
df_train, df_test = train_test_split(
    df,
    train_size=0.8,
    random_state=5,
)

In [7]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
# Lowercase once and reuse the arrays for fitting and for both conversions.
train_texts = df_train.Noticia.str.lower().values
test_texts = df_test.Noticia.str.lower().values

# The backslash is written as '\\' so the literal has no invalid escape
# sequence. '\t' and '\n' are part of the filter set so that words separated
# only by a tab or a newline are still split into separate tokens (the
# tokenizer otherwise splits on spaces only).
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)

# The vocabulary is built from the training texts only.
tokenizer.fit_on_texts(train_texts)

# Training sequences are padded to the longest training sequence; test
# sequences are forced to that same length so both arrays share one width.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_texts))
x_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=x_train.shape[1])

In [16]:
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

In [33]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Size of the last dimension of the block's input/output;
            also used as ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...). Accepted so that ``from_config`` can rebuild
            the layer from the dictionary returned by ``get_config``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply attention and feed-forward sub-layers to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the current learning phase.

        Returns:
            A tensor produced by the second layer normalisation.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain, serialisable values are included; the sub-layers are
        rebuilt by ``__init__`` when the layer is restored.
        """
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

In [34]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position embedding holds.
        vocab_size: Number of distinct token ids the token embedding holds.
        embed_dim: Output size of both embeddings.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...). Accepted so that ``from_config`` can rebuild
            the layer from the dictionary returned by ``get_config``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        Positions are ``0 .. n-1`` where ``n`` is the size of the last
        dimension of ``x``.
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain, serialisable values are included; the embeddings are
        rebuilt by ``__init__`` when the layer is restored.
        """
        config = super().get_config()
        config.update({
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
        })
        return config

In [38]:
# +1 because the tokenizer numbers words from 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index)+1
embed_dim = 100
num_heads = 5
ff_dim = 100

# Input width equals the padded sequence length of the training matrix.
inputs = layers.Input(shape=(x_train.shape[1],))
embedding_layer = TokenAndPositionEmbedding(x_train.shape[1], vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Sigmoid, not softmax: softmax normalises across the units of the layer, so
# with a single unit it would output exactly 1.0 for every sample.
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [39]:
# The targets are 0/1 labels and the model ends in a single output unit, so
# binary cross-entropy is the matching loss ('msle' is a regression loss).
# The metric keeps the short name 'acc' so the validation metric is still
# logged as 'val_acc', which the training callbacks monitor.
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

In [40]:
# Rows per label in the training split, counted once.
class_counts = df_train.Tipo.value_counts()

# Each class is weighted by the number of training rows of the *other*
# class, so the less frequent class receives the larger weight.
weights = {0: class_counts[1], 1: class_counts[0]}

# Both callbacks track validation accuracy: training stops after 100 epochs
# without improvement, and only the best model so far is written to disk.
callbacks = [
    EarlyStopping(monitor='val_acc', patience=100, min_delta=0),
    ModelCheckpoint('lstm_model.h5', monitor="val_acc", save_best_only=True),
]

history = model.fit(
    x_train,
    df_train.Tipo,
    epochs=1000,
    batch_size=64,
    class_weight=weights,
    validation_data=(x_test, df_test.Tipo),
    callbacks=callbacks,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000

KeyboardInterrupt: 