In [6]:
# Obtained guidance from: https://www.youtube.com/watch?v=wp9BudYGZyA
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm.auto import tqdm
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [8]:
mx_political_df = pd.read_csv('mx_political.csv')
mx_political_df.head()

Unnamed: 0,user,tweet
0,AccionNacional,"¡Lo hicimos, Durango! El municipio sigue pintá..."
1,AccionNacional,¡Ganamos Durango con @EVillegasV!\n\nGracias a...
2,AccionNacional,¡Muchas felicidades @TereJimenezE por tu triun...
3,AccionNacional,RT @TereJimenezE: ¡Hoy ganaron las propuestas ...
4,AccionNacional,RT @EVillegasV: ¡Durango ganó con valor! Simpl...


In [9]:
mx_political_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19169 entries, 0 to 19168
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    19169 non-null  object
 1   tweet   19169 non-null  object
dtypes: object(2)
memory usage: 299.6+ KB


In [10]:
mx_political_df['user'].value_counts()

partidoverdemex    3200
PRI_Nacional       3199
AccionNacional     3198
PartidoMorenaMx    3197
PRDMexico          3192
MovCiudadanoMX     3183
Name: user, dtype: int64

In [11]:
# Transforming dtype of user to integer
user_map = {
    'partidoverdemex' : 0,
    'PRI_Nacional' : 1,
    'AccionNacional' : 2, 
    'PartidoMorenaMx' : 3, 
    'PRDMexico' : 4, 
    'MovCiudadanoMX': 5
}
user_name = mx_political_df['user'].map(user_map)

mx_political_df['user_name'] = user_name


In [12]:
mx_political_df.head()

Unnamed: 0,user,tweet,user_name
0,AccionNacional,"¡Lo hicimos, Durango! El municipio sigue pintá...",2
1,AccionNacional,¡Ganamos Durango con @EVillegasV!\n\nGracias a...,2
2,AccionNacional,¡Muchas felicidades @TereJimenezE por tu triun...,2
3,AccionNacional,RT @TereJimenezE: ¡Hoy ganaron las propuestas ...,2
4,AccionNacional,RT @EVillegasV: ¡Durango ganó con valor! Simpl...,2


In [13]:
mx_df = mx_political_df.copy()

In [14]:
mx_df = mx_df[['tweet', 'user_name']]

In [15]:
mx_df.head()

Unnamed: 0,tweet,user_name
0,"¡Lo hicimos, Durango! El municipio sigue pintá...",2
1,¡Ganamos Durango con @EVillegasV!\n\nGracias a...,2
2,¡Muchas felicidades @TereJimenezE por tu triun...,2
3,RT @TereJimenezE: ¡Hoy ganaron las propuestas ...,2
4,RT @EVillegasV: ¡Durango ganó con valor! Simpl...,2


In [16]:
mx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19169 entries, 0 to 19168
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      19169 non-null  object
 1   user_name  19169 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 299.6+ KB


In [17]:
'''

The user_name labels are:
0 - partidoverdemex
1 - pri_nacional
2 - accionnacional
3 - partidomorenamx
4 - prdmexico
5 - movciudadanomx
'''


'\n\nThe user_name labels are:\n0 - partidoverdemex\n1 - pri_nacional\n2 - accionnacional\n3 - partidomorenamx\n4 - prdmexico\n5 - movciudadanomx\n'

In [18]:
tokenizer = AutoTokenizer.from_pretrained("M47Labs/spanish_news_classification_headlines_untrained")

Downloading:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/713k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
mx_political_df['tweet'].iloc[320]

'Presentamos una denuncia contra @mario_delgado, dirigente nacional de Morena, por el financiamiento irregular a la campaña del candidato\xa0al gobierno de Tamaulipas, @Dr_AVillarreal, así como posibles actos de enriquecimiento ilícito y asociación delictuosa.\nhttps://t.co/KuoAZhDlN5'

In [20]:
token = tokenizer.encode_plus(
    mx_df['tweet'].iloc[320], 
    max_length = 280, 
    truncation = True, 
    padding = 'max_length',
    add_special_tokens = True, 
    return_tensors = 'tf'
)

In [21]:
token

{'input_ids': <tf.Tensor: shape=(1, 280), dtype=int32, numpy=
array([[    4, 26241,  1305,  1108, 11218,  1534,   968,  4352, 30933,
          948, 25156,  1017, 21868,  2596,  1008, 24790,  1532,  1017,
         1096,  1040, 20806, 19062,  1013,  1030,  5325,  1072,  9738,
         1091,  2966,  1008, 27733,  7693,  2761,  1017,   968,  4208,
          948, 23089,  3533, 30936, 28777,  1017,  1506,  1184,  6305,
         5261,  1008, 30739, 14275,  1042,  6468,  9631, 21100,  1009,
         1045,  4640,  5269,  1181,   972,   972,  1031,  1009,  1345,
          972,   991, 17700, 30963,   983, 30952, 30971, 30938, 30969,
        30993,     5,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1, 

In [22]:
X_input_ids = np.zeros((len(mx_df), 280))
X_attn_masks = np.zeros((len(mx_df), 280))

In [23]:
# 280 characters is the maximum length of each tweet (document)
X_input_ids.shape

(19169, 280)

In [24]:
def generate_training_data(df, ids, masks, tokenizer):
  for i, text in tqdm(enumerate(df['tweet'])):
    tokenized_text = tokenizer.encode_plus(
        text, 
        max_length = 280, 
        truncation=True, 
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    ids[i, :] = tokenized_text.input_ids
    masks[i, :] = tokenized_text.attention_mask

  return ids, masks

In [25]:
X_input_ids, X_attn_masks = generate_training_data(mx_df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [26]:
# Using these labels as OHE vector
labels = np.zeros((len(mx_df), 6))
labels.shape

(19169, 6)

In [27]:
labels[np.arange(len(mx_df)), mx_df['user_name'].values] = 1

In [28]:
labels

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

In [29]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [30]:
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(280,), dtype=tf.float64, name=None), TensorSpec(shape=(280,), dtype=tf.float64, name=None), TensorSpec(shape=(6,), dtype=tf.float64, name=None))>

In [31]:
def UserDatasetMapFunction(input_ids, attn_masks, labels):
  return {
      'input_ids': input_ids, 
      'attention_mask': attn_masks
  }, labels

In [32]:
dataset = dataset.map(UserDatasetMapFunction)

In [33]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(280,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(280,), dtype=tf.float64, name=None)}, TensorSpec(shape=(6,), dtype=tf.float64, name=None))>

In [34]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)

In [35]:
p = .8
train_size = int((len(mx_df)//16)*p)

In [36]:
train_size

958

In [37]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [38]:

model = TFAutoModelForSequenceClassification.from_pretrained("M47Labs/spanish_news_classification_headlines", from_pt = 'True')

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [39]:
input_ids = tf.keras.layers.Input(shape=(280,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(280,), name='attention_mask', dtype='int32')

model_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
intermediate_layer = tf.keras.layers.Dense(512, activation = 'relu', name = 'intermediate_layer')(model_embds)
output_layer = tf.keras.layers.Dense(6, activation='softmax', name='output_layer')(intermediate_layer)

twitter_user_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
twitter_user_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 280)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 280)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109850880   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 280,                                           

In [40]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [41]:
twitter_user_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [53]:
hist = twitter_user_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [54]:
twitter_user_model.save('twitter_user_model')



INFO:tensorflow:Assets written to: twitter_user_model/assets


INFO:tensorflow:Assets written to: twitter_user_model/assets


Prediction

In [65]:
twitter_user_model = tf.keras.models.load_model('twitter_user_model')

tokenizer = AutoTokenizer.from_pretrained("M47Labs/spanish_news_classification_headlines_untrained")

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=280, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['partidoverdemex', 'pri_nacional', 'accionnacional', 'partidomorenamx', 'prdmexico', 'movciudadanomx']):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [66]:
input_text = input('Enter tweet from official Twitter account of a Mexican political party: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(twitter_user_model, processed_data=processed_data)
print(f"Predicted Twitter account: {result}")

Enter tweet from official Twitter account of a Mexican political party: El 80 % de la contaminación en los océanos es causada por los humanos.   Juntos podemos cambiar esta situación.  En el Verde hemos sido los principales impulsores de la disminución del uso de plástico y otras propuestas para proteger la vida marina.   #DíaMundialDeLosOcéanos
Predicted Twitter account: partidoverdemex


In [49]:
twitter_user_model.save('tensorflow_model', save_format='h5')
