In [24]:
import pandas as pd
import numpy as np

df_main = pd.read_csv("full_labeled_data.csv", names = ["Tweet", "Sentiment"])
df_main = df_main.sample(frac=1)


x_train = df_main["Tweet"][:50000].to_numpy()
y_train = df_main["Sentiment"][:50000].to_numpy()

x_test = df_main["Tweet"][50000:].to_numpy()
y_test = df_main["Sentiment"][50000:].to_numpy()


In [3]:
!pip install gensim spacy nltk




In [4]:
import gensim
import re
import spacy

# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text


In [5]:
!wget "https://bakrianoo.s3-us-west-2.amazonaws.com/aravec/full_uni_cbow_300_twitter.zip"
!unzip "full_uni_cbow_300_twitter.zip"

--2021-02-22 01:21:16--  https://bakrianoo.s3-us-west-2.amazonaws.com/aravec/full_uni_cbow_300_twitter.zip
Resolving bakrianoo.s3-us-west-2.amazonaws.com (bakrianoo.s3-us-west-2.amazonaws.com)... 52.218.178.81
Connecting to bakrianoo.s3-us-west-2.amazonaws.com (bakrianoo.s3-us-west-2.amazonaws.com)|52.218.178.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2833686412 (2.6G) [application/zip]
Saving to: ‘full_uni_cbow_300_twitter.zip’


2021-02-22 01:22:35 (34.4 MB/s) - ‘full_uni_cbow_300_twitter.zip’ saved [2833686412/2833686412]

Archive:  full_uni_cbow_300_twitter.zip
  inflating: full_uni_cbow_300_twitter.mdl  
  inflating: full_uni_cbow_300_twitter.mdl.trainables.syn1neg.npy  
  inflating: full_uni_cbow_300_twitter.mdl.wv.vectors.npy  


In [6]:
model = gensim.models.Word2Vec.load("full_uni_cbow_300_twitter.mdl")
print("We've",len(model.wv.index2word),"vocabularies")

We've 1259756 vocabularies


In [7]:
%mkdir spacyModel


In [8]:
model.wv.save_word2vec_format("./spacyModel/aravec.txt")


In [9]:
!gzip ./spacyModel/aravec.txt


In [10]:
!python -m spacy  init-model ar spacy.aravec.model --vectors-loc ./spacyModel/aravec.txt.gz

⠙ Creating model...[2K[38;5;2m✔ Successfully created model[0m
⠙ Reading vectors from spacyModel/aravec.txt.gztcmalloc: large alloc 1511710720 bytes == 0x4098000 @  0x7f60e40e8001 0x7f60e1c664ff 0x7f60e1cb6b08 0x7f60e1cbaac7 0x7f60e1d591a3 0x50a4a5 0x50cc96 0x5095c8 0x50a2fd 0x50beb4 0x507be4 0x509900 0x50a2fd 0x50beb4 0x507be4 0x588d41 0x59fd0e 0x50d256 0x507be4 0x509900 0x50a2fd 0x50beb4 0x507be4 0x509900 0x50a2fd 0x50beb4 0x507be4 0x5161c5 0x50a12f 0x50beb4 0x507be4
1259756it [01:59, 10583.89it/s]
[2K[38;5;2m✔ Loaded vectors from spacyModel/aravec.txt.gz[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
1259946 entries, 1259756 vectors


In [11]:
nlp = spacy.load("./spacy.aravec.model/")

In [12]:
class Preprocessor:
    def __init__(self, tokenizer, **cfg):
        self.tokenizer = tokenizer

    def __call__(self, text):
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)

In [13]:
nlp.tokenizer = Preprocessor(nlp.tokenizer)

In [29]:
x_train2=[0]*50000
y_train2=[0]*len(y_train)

i=0
for i in range(len(y_train)):
  if y_train[i] == 'Positive':

    y_train2[i]=1

  elif y_train[i] == 'Negative':

    y_train2[i]=-1

  else:
    y_train2[i]=0



In [31]:
train_arrays=[0]*len(x_train)
for i in range(len(x_train)):
  train_arrays[i]=nlp(x_train[i]).vector

In [32]:
test_arrays=[0]*len(x_test)
for i in range(len(x_test)):
  test_arrays[i]=nlp(x_test[i]).vector


In [33]:
from sklearn.linear_model import LogisticRegression


classifier =  LogisticRegression(multi_class='multinomial',max_iter=1000, solver='lbfgs')
classifier.fit(train_arrays, y_train2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
x_test2=[0]*5002  
y_test2=[0]*5002

i=0
for i in range(len(x_test)):
  if y_test[i] == 'Positive':
    x_test2[i] = nlp(x_test[i]).vector
    y_test2[i]=1

  elif y_test[i] == 'Negative':
    x_test2[i] = nlp(x_test[i]).vector

    y_test2[i]=-1

  else:
    x_test2[i] = nlp(x_test[i]).vector
    y_test2[i]=0

In [35]:
prediction = classifier.predict(x_test2)


In [36]:
import sklearn
sklearn.metrics.recall_score(y_test2, prediction, average = None)
sklearn.metrics.f1_score(y_test2, prediction, average = None)

array([0.46577017, 0.91610443, 0.60258065])