In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install keras-metrics

Collecting keras-metrics
  Downloading https://files.pythonhosted.org/packages/32/c9/a87420da8e73de944e63a8e9cdcfb1f03ca31a7c4cdcdbd45d2cdf13275a/keras_metrics-1.1.0-py2.py3-none-any.whl
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


In [3]:
import numpy as np
import pandas as pd

import gensim
from gensim.models import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# **df_test preprocessing**

In [4]:
df_test = pd.read_csv ('/content/drive/My Drive/arabic sentiment analysis/arabic-sentiment-analysis/full_labeled_data.csv')
df_test

Unnamed: 0,text,sentiment
0,@nas_alharbi8 والله حسب الأرقام سيكون مخيب للآ...,Positive
1,"""الزعل بيغير ملامحك ، بيغير نظرة العين ، بيغير...",Neutral
2,الحب الحقيقي هو اقتسام بعض نفسك مع شخص أخر أقر...,Positive
3,@Mo_Fat7 النهضة في فتيل 😂,Positive
4,@halgawi @DmfMohe ليس حباً في ايران بقدر ماهو ...,Neutral
...,...,...
54996,@MCgovSA تم رفع بلاغ بتاريخ 24-8 وتم اعتماده و...,Neutral
54997,#تصنيف_دايم_للتخصصات_الصحيه٥ بالامس تحدثونا عن...,Neutral
54998,@MCgovSA اتمنى تحلون المشكله وش يعني لما نلغ...,Neutral
54999,@mhrsd_care حاب استفسر اذا منشآتي من ضمن المنش...,Neutral


In [5]:
df_test.columns = ['text' , 'label' ]
for i in range (len(df_test['label'])):
  if (df_test['label'][i] == 'Neutral'):
    df_test['label'][i] = 0
  elif (df_test['label'][i] == 'Positive'):
    df_test['label'][i] = 1
  elif (df_test['label'][i] == 'Negative'):
   df_test['label'][i] = 2

df_test

Unnamed: 0,text,label
0,@nas_alharbi8 والله حسب الأرقام سيكون مخيب للآ...,1
1,"""الزعل بيغير ملامحك ، بيغير نظرة العين ، بيغير...",0
2,الحب الحقيقي هو اقتسام بعض نفسك مع شخص أخر أقر...,1
3,@Mo_Fat7 النهضة في فتيل 😂,1
4,@halgawi @DmfMohe ليس حباً في ايران بقدر ماهو ...,0
...,...,...
54996,@MCgovSA تم رفع بلاغ بتاريخ 24-8 وتم اعتماده و...,0
54997,#تصنيف_دايم_للتخصصات_الصحيه٥ بالامس تحدثونا عن...,0
54998,@MCgovSA اتمنى تحلون المشكله وش يعني لما نلغ...,0
54999,@mhrsd_care حاب استفسر اذا منشآتي من ضمن المنش...,0


In [6]:
df_test = df_test.dropna()

In [7]:
df_test

Unnamed: 0,text,label
0,@nas_alharbi8 والله حسب الأرقام سيكون مخيب للآ...,1
1,"""الزعل بيغير ملامحك ، بيغير نظرة العين ، بيغير...",0
2,الحب الحقيقي هو اقتسام بعض نفسك مع شخص أخر أقر...,1
3,@Mo_Fat7 النهضة في فتيل 😂,1
4,@halgawi @DmfMohe ليس حباً في ايران بقدر ماهو ...,0
...,...,...
54996,@MCgovSA تم رفع بلاغ بتاريخ 24-8 وتم اعتماده و...,0
54997,#تصنيف_دايم_للتخصصات_الصحيه٥ بالامس تحدثونا عن...,0
54998,@MCgovSA اتمنى تحلون المشكله وش يعني لما نلغ...,0
54999,@mhrsd_care حاب استفسر اذا منشآتي من ضمن المنش...,0


In [8]:
df_train = df_test.sample(n=50000)
df_test =df_test.sample(n=5000)

In [9]:
# data preprocessing: clear and clean the data from nonsense letters and words

import re

lst= ['#','%','@','*', ':']
pat1 = r'@[A-Za-z0-9]+'

def tokenize(tweet):
    num = re.compile(r'[0-9]+')
    tweet = re.sub(num, "", str(tweet))
    tokens = tokenizer.tokenize(str(tweet))
    for i in lst:
        if i in tokens:
            tokens.remove(i)
    return tokens
                                
#df_total['tokens'] = df_total['text'].apply(tokenize)
df_test['tokens'] = df_test['text'].apply(tokenize)
df_train['tokens'] = df_train['text'].apply(tokenize)

In [10]:
# Data segmentation, 
#X_train, X_test, y_train, y_test = train_test_split(df_total['tokens'], df_total['label'], test_size=0.33, random_state=42)

## Use the trained Word2Vec model 

In [11]:
w2v = Word2Vec.load('/content/drive/My Drive/arabic sentiment analysis/arabic-sentiment-analysis/cbow.bin')

In [12]:
# Test word and sees it similarity
w2v.wv.most_similar('العهد')

[('ولي', 0.9438014030456543),
 ('عهده', 0.8968819975852966),
 ('عهدنا', 0.88190758228302),
 ('عهد', 0.8595527410507202),
 ('الامير', 0.8186361193656921),
 ('سيدي', 0.8035902976989746),
 ('لولي', 0.794856071472168),
 ('خادم', 0.7939093112945557),
 ('الشاب', 0.7750820517539978),
 ('حكم', 0.768715500831604)]

In [13]:
pretrained_weights = w2v.wv.vectors

In [14]:
vocab_size, emdedding_size = pretrained_weights.shape

In [15]:
def word2idx(word):
    return w2v.wv.vocab[word].index

def idx2word(idx):
    return w2v.wv.index2word[idx]

In [16]:
# To find importance of the word with respect to the corpus.
# see: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in df_train["text"]])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 388


In [17]:
# convert tokens to vector of words

def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
            continue
    if count != 0:
        vec /= count
    return vec

In [18]:
# Scale data and build words vector
from sklearn.preprocessing import scale

n_dim = 200

train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x, df_train["text"]))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x, df_test["text"]))])
test_vecs_w2v = scale(test_vecs_w2v)


  
50000it [00:37, 1320.27it/s]
  
5000it [00:03, 1296.96it/s]


In [19]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Activation, Flatten, Bidirectional
from keras.utils import to_categorical
import keras_metrics
df_train["label"] = df_train["label"].replace(np.nan, 0)
df_test["label"] = df_test["label"].replace(np.nan, 0)

# convent labels to categorical classes
cy_train = to_categorical(df_train["label"])
cy_test = to_categorical(df_test["label"])

In [20]:
from keras import backend as K


# calculate f score
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))



# LSTM

In [21]:
import keras 

In [22]:
lstm_model = Sequential()
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2 , return_sequences=True))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,))
lstm_model.add(Dense(3, activation='softmax'))
lstm_model.compile(optimizer=keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy', 
                       keras_metrics.precision(),
                      keras_metrics.recall(),
                      f1])

lstm_his = lstm_model.fit(train_vecs_w2v.reshape((train_vecs_w2v.shape[0],1,200)), cy_train, epochs=30, batch_size=32, verbose=2)
lstm_model.summary()

Epoch 1/30




1563/1563 - 18s - loss: 0.6771 - accuracy: 0.7452 - precision: 0.7577 - recall: 0.8577 - f1: 0.7291
Epoch 2/30
1563/1563 - 12s - loss: 0.6415 - accuracy: 0.7542 - precision: 0.7723 - recall: 0.9283 - f1: 0.7481
Epoch 3/30
1563/1563 - 11s - loss: 0.6311 - accuracy: 0.7586 - precision: 0.7772 - recall: 0.9277 - f1: 0.7527
Epoch 4/30
1563/1563 - 11s - loss: 0.6232 - accuracy: 0.7611 - precision: 0.7790 - recall: 0.9323 - f1: 0.7560
Epoch 5/30
1563/1563 - 11s - loss: 0.6182 - accuracy: 0.7628 - precision: 0.7776 - recall: 0.9317 - f1: 0.7576
Epoch 6/30
1563/1563 - 11s - loss: 0.6147 - accuracy: 0.7621 - precision: 0.7776 - recall: 0.9347 - f1: 0.7575
Epoch 7/30
1563/1563 - 12s - loss: 0.6106 - accuracy: 0.7637 - precision: 0.7803 - recall: 0.9319 - f1: 0.7599
Epoch 8/30
1563/1563 - 11s - loss: 0.6076 - accuracy: 0.7659 - precision: 0.7801 - recall: 0.9357 - f1: 0.7619
Epoch 9/30
1563/1563 - 12s - loss: 0.6041 - accuracy: 0.7675 - precision: 0.7808 - recall: 0.9300 - f1: 0.7632
Epoch 10/30


In [23]:
score = lstm_model.evaluate(test_vecs_w2v.reshape((test_vecs_w2v.shape[0], 1, test_vecs_w2v.shape[1])), cy_test, batch_size=128, verbose=2)
print('LSTM model acuracy: {}'.format(score[1]*100))





40/40 - 1s - loss: 0.5495 - accuracy: 0.7850 - precision: 0.7824 - recall: 0.9560 - f1: 0.7851
LSTM model acuracy: 78.50000262260437


In [24]:
y_pred = lstm_model.predict(test_vecs_w2v.reshape((test_vecs_w2v.shape[0], 1, test_vecs_w2v.shape[1])))
y_pred = np.argmax(y_pred , axis = 1)
y_test = np.argmax(cy_test, axis=1)

In [25]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test, list(y_pred)))

[[3231   73   38]
 [ 301  498   13]
 [ 631   19  196]]


In [26]:
import numpy as np
from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred, average=None))

[0.86102598 0.71041369 0.35864593]


In [27]:
from sklearn.metrics import recall_score
print(recall_score(y_test, y_pred, average=None))

[0.96678636 0.61330049 0.23167849]
