In [1]:
# import data

# 
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install cleantext

In [2]:
from cleantext import clean # helps to remove imoji in text
import pandas as pd
import re

# 
def clean( text ):
  '''clean tweet texts and remove links, usernamas'''
  text = text.lower()
  text = ' '.join( text.split() )
  text = ' '.join( [ re.sub("^@\w+", " ", t) for t in text.split(' ') ] ) # remove usernames
  # text = ' '.join( [ re.sub("^@\w+", " ", t) for t in text.split(' ') ] ) # remove hashtags
  text = ' '.join( [ re.sub("^http\w+", " ", t) for t in text.split(' ') ] ) # remove links
  text = clean(text, no_emoji=True)
  return ' '.join( text.split() )

# make classes
def make_label( class_ ):
  '''
  neu   - 0
  pos   - 1
  neg   - 2
  vpos  - 3
  vneg  - 4
  '''
  class_ = class_.lower()
  if class_ == 'vneg': return 4
  elif class_ == 'neu': return 0
  elif class_ == 'neg': return 2
  elif class_ == 'vpos': return 3
  elif class_ == 'pos': return 1

# feature processing
df = pd.read_csv("drive/MyDrive/datasets/traindata1.1.csv",engine="python")
df.drop(axis=1, inplace=True, columns=['UserID','Date/Time'] )
df.drop_duplicates(inplace=True)
# print( df.columns )
df['shona_cleaned'] = df['SN(Original Shona Tweet)'].apply( clean ) # clean shona tweets.
df['Label5'] = df['finalLabel5Classes'].apply( make_label )
df.head()

Unnamed: 0,topic,SearchTerm,emoticonBased,lexiconBased,Annotation 1,Annotation 2,Annotation 3,finalLabel5Classes,finalLabel3Classes,SN(Original Shona Tweet),ENGoogleTranslate,shona_cleaned,Label5
0,Education,Vana,POS,NEG,VNEG,VNEG,NEU,VNEG,NEG,@GombaGuru @__vigie 😂😂 ah mudhara inzwaiwo tsi...,'g oh mammal feel sorry for the kids wod out w...,😂😂 ah mudhara inzwaiwo tsitsi vana vatambura k...,4
1,Agriculture,kudya,UNK,UNK,NEU,NEU,VPOS,NEU,NEU,@ChinyandeGeorge @ngadziore @nancynjenge @tapc...,'my message is a response to your demand that ...,waigona kuhusunga kana kuti achihuisa pane zva...,0
2,Sanitation,Vanhu,UNK,POS,NEG,NEU,NEG,NEG,NEG,Munenge muchiseka vanhu vari single imi muchii...,'you are making fun of people who are single y...,munenge muchiseka vanhu vari single imi muchii...,2
3,Finance,uyu,UNK,UNK,NEU,NEU,VNEG,NEU,NEU,@Sharonrose918 @habeeb_zw uyu oita sei,'g is how to do',uyu oita sei,0
4,Home_Affairs,Baba,POS,POS,NEU,VPOS,VPOS,VPOS,POS,@baba_nyenyedzi Am honest and practical questi...,'g am honest and prectical quetical quems perm...,am honest and practical question perm sec . ku...,3


In [3]:
# imports
import numpy as np 
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# load and evaluate a saved model
from keras.models import load_model

In [4]:
# preprocessing
data = pd.DataFrame()
data['Shona'] = df['shona_cleaned']
# data['Label'] = LabelEncoder().fit_transform(df['finalLabel5Classes'])
data['Label'] = df['Label5']


# 
# X_train, X_test, y_train, y_test = train_test_split(data['Shona'].values, data['Label'].values, test_size=.2, random_state=42 )
train, test = train_test_split(data, test_size=.2, random_state=42 )
train.head()

Unnamed: 0,Shona,Label
9237,pakuzofananidza winky d nezvinhu zvakaita sana...,4
1762,phone yanga ine mwana 😭🤣🤣,2
2865,"dembare iri kutamba kwete kana ""ounce"" imwe ch...",2
5191,manje unozviitwa nani nhai sirivhiya iwe uchit...,2
4523,wangu vanhu havaite ava tenge tiri kuma terrac...,4


In [None]:
!pip install transformers

In [None]:
# load bert pre-trained and sequence classifier.
# will build model with sequence classifier and tokenizer with bert-tokenizer.
from transformers import BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures, TFBertForMultipleChoice
num_labels = 5
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels, problem_type="multi_label_classification")

In [8]:
# check summary of bert model
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 109,486,085
Trainable params: 109,486,085
Non-trainable params: 0
_________________________________________________________________


In [9]:
test.head()

Unnamed: 0,Shona,Label
2427,@ chitowamombe14 cde gwasai lecture yaisapera ...,0
3199,"atova nekutonga. misa mutongo, misa misa, ita ...",2
1164,akuhumana uyu,4
4099,kusvika wati eke . chinhu chawo wawe kuchida .,2
1027,usamhanye mwana weafrica🤣🤣🤣 ://t.co/bfbkflmab4,1


In [10]:
# create input sequence.
# 
# InputExample(guid=None, text_a='hello world', text_b=None, label=1)

def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  return train_InputExamples, validation_InputExamples

# train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'Shona', 'Label')


def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )




In [11]:
# run convert examples to dataset function.
DATA_COLUMN = 'Shona'
LABEL_COLUMN = 'Label'
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

train_data

<RepeatDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
# Adam for optimization. categorical crossentropy as the loss function.
# sparse categorical accuracy as our accuracy metric.
# sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits
epochs = 2
batch = 32
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
              )

model.fit(train_data, epochs=epochs, batch_size=batch, validation_data=validation_data)

Epoch 1/2
    104/Unknown - 4704s 45s/step - loss: 4.6483 - accuracy: 0.2293