# Arabic Tweets Sentiment Analysis with NLTK

In [1]:
import pandas as pd
import numpy as np
import random
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify import SklearnClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [2]:
df = pd.read_csv("Data/tweets_sent_ar.csv")
df['language'].unique()

array(['ar', 'en', 'no', '\tar', 'zr', 'en 2', 'fa'], dtype=object)

### Dropping English and Persian tweets, leaving only Arabic. I made some mistakes when I labelled the training set which I have to fix.

In [3]:
df.drop(df[df['language'].isin(["en", 'en 2', 'no', 'fa'])].index, inplace=True)
df['language'] = 'ar'

### Dropping the unnecessary characters and unifying the different veriety of characters. 

In [4]:
def remove_unnecessary(txt):
    txt = ' '.join(txt.split())
    
    replA = ("أ", "إ", "آ")    
    chars_to_drop = ( 'ّ', 'َ', 'ِ', 'ُ', 'ْ', 'ً', 'ٍ', 'ٌ', 'ـ' )
    chars_to_clean = ('…','#', '@', '-', '_')
    txt = txt.replace("ة","ه")
    for a in replA:
        txt = txt.replace(a, "ا")
    for d in chars_to_drop:
        txt = txt.replace(d,"")
    for c in chars_to_clean:
        txt = txt.replace(c," ")
    return txt.strip()

In [5]:
df['cleaned_text'] = df['cleaned_text'].apply(remove_unnecessary)

### Splitting the data into labelled and unlabelled sets

In [6]:
df_s = df[df['sentiment'].isna()]
df_t = df.dropna(subset=['sentiment'])

In [7]:
df_t.shape

(966, 19)

In [8]:
df_t.reset_index(drop = True, inplace = True)
df_s.reset_index(drop = True, inplace = True)

In [9]:
df_t.drop(df_t[df_t['sentiment'] == 2].index, inplace=True)
df_t.loc[df_t['sentiment'] == 0, 'sentiment'] = 'N'
df_t.loc[df_t['sentiment'] == 1, 'sentiment'] = 'O'
df_t.loc[df_t['sentiment'] == -1, 'sentiment'] = 'S'
df_t.reset_index(drop = True, inplace = True)
print(df_t['sentiment'].unique())
len(df_t)

['S' 'O' 'N']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


937

### Dropping the neutral tweets

In [10]:
df_train_final = df_t.drop(df_t[df_t['sentiment'] == 'N'].index)
df_train_final.reset_index(drop = True, inplace = True)

### Preparing the text for training

In [11]:
df_train_final['tokenized_text'] = df_train_final['cleaned_text'].apply(nltk.word_tokenize)
documents = [tuple(x) for x in df_train_final[['tokenized_text', 'sentiment']].to_numpy()]

In [12]:
random.shuffle(documents)
documents[:3]

[(['قال',
   'السعوديين',
   'عم',
   'يتمسخروا',
   'انو',
   'كيف',
   'بيتشيع',
   'الشهيد',
   'قاسم',
   'سليماني',
   'بسياره',
   'شيفروليه',
   'امريكيه',
   'الصنع',
   'وقال',
   'ليش',
   'ما',
   'بيشيعوه',
   'بسياره',
   'ايرانيه',
   'الصنع',
   '.',
   'طيب',
   'خلص',
   'تزعلوش',
   'بكرا',
   'بس',
   'الله',
   'ياخد',
   'بن',
   'سلمان',
   'ابقوا',
   'شيعوه',
   'عالجمل',
   'وهيك',
   'منكون',
   'لاول',
   'مره',
   'بالتاريخ',
   'منشوف',
   'فيها',
   'جمل',
   'حامل',
   'بغل'],
  'O'),
 (['الهلال',
   'الفيصلي',
   'قاسم',
   'سليماني',
   'هلاك',
   'الارهابي',
   'قاسم',
   'سليماني',
   'نطالب',
   'بالتشهير',
   'للمتحرش',
   'ليله',
   'عباديات'],
  'S'),
 (['قاسم',
   'سليماني',
   'اجمل',
   'خبر',
   'ممكن',
   'تسمعه',
   'باين',
   'سنه',
   '2020',
   'سنه',
   'افراح'],
  'S')]

In [13]:
all_words = []
for row in df_train_final['tokenized_text']:
    all_words += row

In [14]:
all_words = nltk.FreqDist(all_words)

In [15]:
def find_features(document):
    words = set(document)
    features = {}
    for w in all_words:
        features[w] = (w in words)
    return features

### Splitting the data into training and testing sets

In [16]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:500]
testing_set = featuresets[500:]

### Training different models

In [17]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB_classifier accuracy percent: 70.62937062937063


In [18]:
classifier = NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 78.32167832167832


In [19]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB_classifier accuracy percent: 70.62937062937063


In [20]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 83.91608391608392


In [21]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SGDClassifier_classifier accuracy percent: 83.21678321678321


In [22]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC_classifier accuracy percent: 86.01398601398601


In [23]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC_classifier accuracy percent: 81.11888111888112


In [24]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB_classifier accuracy percent: 86.01398601398601


### Creating a voting system for the classifiers 

In [25]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [26]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)


print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 83.91608391608392
Classification: O Confidence %: 85.71428571428571
Classification: S Confidence %: 100.0
Classification: S Confidence %: 100.0
Classification: S Confidence %: 100.0
Classification: S Confidence %: 100.0
Classification: S Confidence %: 100.0


### Classifying 

In [27]:
def sentiment(text):
    feats = find_features(text)
    v = voted_classifier.classify(feats)
    conf = voted_classifier.confidence(feats)
    if conf*100 < 75:
        return 'N'
    return v

In [28]:
df_s['sentiment'] = df_s['cleaned_text'].apply(sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
df_op = df_s[df_s['sentiment'] == 'O']
print("Oppose: ", df_op.shape[0])
df_su = df_s[df_s['sentiment'] == 'S']
print("Support: ", df_su.shape[0])
df_neu = df_s[df_s['sentiment'] == 'N']
print("Neutral: ", df_neu.shape[0])
print('====================')
print("""
Without ML
Oppose:  21537
Suport:  34007
Neutral:  58620""")

Oppose:  0
Support:  114140
Neutral:  24

Without ML
Oppose:  21537
Suport:  34007
Neutral:  58620


#### As you can see, the accuracy of the model is not impressive at all

In [30]:
df_final_arabic = pd.concat([df_s, df_t])
df_final_arabic.to_csv("Data/df_arabic_classified.csv", index = False)