In [None]:
import pandas as pd
import numpy as np
import random
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify import SklearnClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [None]:
# Load the tweet table from a CSV expected in the notebook's working directory.
df = pd.read_csv("tweets_sent_ar.csv")
# Display the distinct values of the `language` column; the next cell filters on them.
df['language'].unique()

In [None]:
# Remove, in place, every row whose `language` value is one of the listed codes.
df.drop(df[df['language'].isin(["en", 'en 2', 'no', 'fa'])].index, inplace=True)
# Overwrite `language` with 'ar' on all remaining rows.
# NOTE(review): this relabels whatever codes survived the filter above, not only rows
# that were already 'ar' — confirm that is intended.
df['language'] = 'ar'

In [None]:
def remove_unnecessary(txt):
    """Normalise one tweet's text before tokenisation.

    Steps, in order:
      1. teh marbuta is rewritten as heh, and the alef variants in `replA`
         are rewritten as a bare alef;
      2. every character in `chars_to_drop` (Arabic diacritic marks and the
         tatweel elongation character) is deleted;
      3. every character in `chars_to_clean` is replaced by a space;
      4. all runs of whitespace are collapsed to a single space and the ends
         are trimmed.

    Parameters
    ----------
    txt : str
        Raw text. Non-string input is not handled and fails on the first
        string method call.

    Returns
    -------
    str
        The normalised text, with no leading, trailing or repeated whitespace.
    """
    replA = ("أ", "إ", "آ")
    chars_to_drop = ( 'ّ', 'َ', 'ِ', 'ُ', 'ْ', 'ً', 'ٍ', 'ٌ', 'ـ' )
    chars_to_clean = ('…','#', '@', '-', '_')

    txt = txt.replace("ة","ه")
    for a in replA:
        txt = txt.replace(a, "ا")
    for d in chars_to_drop:
        txt = txt.replace(d,"")
    for c in chars_to_clean:
        txt = txt.replace(c," ")

    # Collapse whitespace last: the substitutions above insert spaces, so doing
    # it first (as before) left runs such as "a   b" for an input of "a - b".
    return ' '.join(txt.split())

In [None]:
# Run every `cleaned_text` value through remove_unnecessary and store the result back
# in the same column.
# NOTE(review): assumes every value is a str; a missing value (NaN) would fail inside
# remove_unnecessary on its string-method calls — confirm the column has no gaps.
df['cleaned_text'] = df['cleaned_text'].apply(remove_unnecessary)

In [None]:
# Split on whether a sentiment label is present:
#   df_s – rows with no label (these get a predicted label later in the notebook)
#   df_t – rows that already carry a label (the training pool)
# Both subsets are edited in later cells (index reset, row drops, new/overwritten
# columns), so take explicit copies: each is then an independent frame and pandas
# does not emit SettingWithCopyWarning when they are modified.
df_s = df[df['sentiment'].isna()].copy()
df_t = df.dropna(subset=['sentiment']).copy()

In [None]:
# (rows, columns) of the labelled subset.
df_t.shape

In [None]:
# Renumber both subsets from 0, discarding the row labels inherited from `df`.
# Rebinding to the returned frame (instead of inplace=True) keeps the cell safe to
# re-run and avoids mutating, in place, an object obtained by filtering `df`.
df_t = df_t.reset_index(drop=True)
df_s = df_s.reset_index(drop=True)

In [None]:
# Turn the numeric sentiment codes into the letter labels used by the rest of the
# notebook (later cells print 'O' as Oppose, 'S' as Support, 'N' as Neutral).
# Rows coded 2 are discarded.
SENTIMENT_LABELS = {0: 'N', 1: 'O', -1: 'S'}

# Filter into a fresh copy rather than dropping rows in place on a derived frame.
df_t = df_t[df_t['sentiment'] != 2].copy()

# Relabel in a single pass by building a new column. Writing strings into the
# numeric column cell by cell through .loc is an incompatible-dtype assignment that
# recent pandas versions deprecate. Codes missing from the table are left untouched,
# exactly as before, and re-running the cell is a no-op.
df_t['sentiment'] = df_t['sentiment'].map(lambda code: SENTIMENT_LABELS.get(code, code))

df_t = df_t.reset_index(drop=True)
print(df_t['sentiment'].unique())
len(df_t)

In [None]:
# Training frame for the two-class model: every labelled row whose sentiment is not
# 'N', renumbered from 0.
df_train_final = df_t.loc[df_t['sentiment'] != 'N'].reset_index(drop=True)

In [None]:
# Tokenise each cleaned tweet with NLTK's word tokenizer and keep the token lists
# alongside the frame.
df_train_final['tokenized_text'] = df_train_final['cleaned_text'].apply(nltk.word_tokenize)

# One (token_list, label) pair per training row, in row order.
documents = list(zip(df_train_final['tokenized_text'], df_train_final['sentiment']))

In [None]:
# Shuffle the (tokens, label) pairs before they are cut into train/test slices.
# A dedicated, seeded generator makes the split — and therefore every accuracy
# figure printed below — repeatable after Restart & Run All, without touching the
# global `random` state used elsewhere in the notebook.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)
documents[:3]

In [None]:
# Flatten the per-tweet token lists into one long list of tokens (duplicates kept,
# since the next cell counts them).
all_words = [token for tokens in df_train_final['tokenized_text'] for token in tokens]

In [None]:
# Replace the flat token list with an NLTK frequency distribution over it.
# From here on `all_words` is the vocabulary that find_features iterates over.
all_words = nltk.FreqDist(all_words)

In [None]:
def find_features(document):
    """Build a bag-of-words presence vector for one document.

    Parameters
    ----------
    document : iterable
        The document's tokens. Membership is tested against ``set(document)``,
        so a plain string would be treated as a collection of characters.

    Returns
    -------
    dict
        One entry per key of the module-level ``all_words``, mapping that word
        to True if it occurs in ``document`` and False otherwise.
    """
    present = set(document)
    return {word: (word in present) for word in all_words}

In [None]:
# Turn every (tokens, label) pair into (feature_dict, label).
featuresets = [(find_features(tokens), label) for tokens, label in documents]

# Fixed split: the first 500 shuffled examples train the models, the rest test them.
training_set, testing_set = featuresets[:500], featuresets[500:]

In [None]:
# Train scikit-learn's Bernoulli Naive Bayes through NLTK's SklearnClassifier wrapper
# and print its accuracy on the held-out split.
# NOTE(review): a later cell repeats exactly these three lines and rebinds the same
# name, so this cell looks redundant — consider deleting one of the two.
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

In [None]:
# Baseline: NLTK's own Naive Bayes implementation, trained on the training split and
# scored on the held-out split.
classifier = NaiveBayesClassifier.train(training_set)
print(
    "Original Naive Bayes Algo accuracy percent:",
    100 * nltk.classify.accuracy(classifier, testing_set),
)

# Disabled experiment with sklearn's SVC, kept for reference.
# NOTE(review): reason for disabling is not recorded — remove if no longer needed.
#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

In [None]:
# Bernoulli Naive Bayes (scikit-learn, default settings) wrapped for NLTK feature
# dicts; SklearnClassifier.train returns the wrapper itself, so it can be chained.
BernoulliNB_classifier = SklearnClassifier(BernoulliNB()).train(training_set)
print(
    "BernoulliNB_classifier accuracy percent:",
    100 * nltk.classify.accuracy(BernoulliNB_classifier, testing_set),
)

In [None]:
# Logistic regression (scikit-learn, default settings) wrapped for NLTK feature
# dicts; SklearnClassifier.train returns the wrapper itself, so it can be chained.
LogisticRegression_classifier = SklearnClassifier(LogisticRegression()).train(training_set)
print(
    "LogisticRegression_classifier accuracy percent:",
    100 * nltk.classify.accuracy(LogisticRegression_classifier, testing_set),
)

In [None]:
# Linear model fitted by stochastic gradient descent (scikit-learn, default
# settings) wrapped for NLTK feature dicts; train() returns the wrapper itself.
SGDClassifier_classifier = SklearnClassifier(SGDClassifier()).train(training_set)
print(
    "SGDClassifier_classifier accuracy percent:",
    100 * nltk.classify.accuracy(SGDClassifier_classifier, testing_set),
)

In [None]:
# Linear support-vector classifier (scikit-learn, default settings) wrapped for
# NLTK feature dicts; train() returns the wrapper itself.
LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_set)
print(
    "LinearSVC_classifier accuracy percent:",
    100 * nltk.classify.accuracy(LinearSVC_classifier, testing_set),
)

In [None]:
# Nu-parameterised support-vector classifier (scikit-learn, default settings)
# wrapped for NLTK feature dicts; train() returns the wrapper itself.
NuSVC_classifier = SklearnClassifier(NuSVC()).train(training_set)
print(
    "NuSVC_classifier accuracy percent:",
    100 * nltk.classify.accuracy(NuSVC_classifier, testing_set),
)

In [None]:
# Multinomial Naive Bayes (scikit-learn, default settings) wrapped for NLTK
# feature dicts; train() returns the wrapper itself.
MNB_classifier = SklearnClassifier(MultinomialNB()).train(training_set)
print(
    "MNB_classifier accuracy percent:",
    100 * nltk.classify.accuracy(MNB_classifier, testing_set),
)

In [None]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed set of already-trained classifiers.

    Every wrapped object must expose ``classify(features)``. The ensemble's
    label is the most common label among the individual predictions, as
    determined by ``statistics.mode``.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that take part in each vote."""
        self._classifiers = classifiers

    def _ballots(self, features):
        """Return each wrapped classifier's label for ``features``, in constructor order."""
        return [member.classify(features) for member in self._classifiers]

    def classify(self, features):
        """Return the label that most of the wrapped classifiers predict."""
        return mode(self._ballots(features))

    def confidence(self, features):
        """Return the fraction of classifiers (0–1) that voted for the winning label."""
        ballots = self._ballots(features)
        winner_count = ballots.count(mode(ballots))
        return winner_count / len(ballots)

In [None]:
# Combine the seven trained models into one majority-vote ensemble.
voted_classifier = VoteClassifier(
    classifier,
    NuSVC_classifier,
    LinearSVC_classifier,
    SGDClassifier_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
)

print(
    "voted_classifier accuracy percent:",
    100 * nltk.classify.accuracy(voted_classifier, testing_set),
)


# Spot-check: predicted label and vote share for the first six held-out examples.
for idx in range(6):
    sample_feats = testing_set[idx][0]
    print("Classification:", voted_classifier.classify(sample_feats), "Confidence %:",voted_classifier.confidence(sample_feats)*100)

In [None]:
def sentiment(text, min_confidence_pct=75):
    """Label one tweet with the voting ensemble, falling back to 'N' on a weak vote.

    Parameters
    ----------
    text : str or iterable of str
        The cleaned tweet. A string is tokenised with ``nltk.word_tokenize``
        (the tokenizer used to build the training documents); anything else is
        assumed to be an already-tokenised sequence and is used as is.
    min_confidence_pct : float, default 75
        Minimum share of agreeing classifiers, in percent, required to accept
        the ensemble's label.

    Returns
    -------
    The ensemble's majority label, or ``'N'`` when the winning label's vote
    share is below ``min_confidence_pct``.
    """
    # find_features tests membership against set(document). Passing the raw
    # string would build a set of single characters, so vocabulary words would
    # almost never match and every tweet would get a near-empty feature vector.
    tokens = nltk.word_tokenize(text) if isinstance(text, str) else text
    feats = find_features(tokens)
    v = voted_classifier.classify(feats)
    conf = voted_classifier.confidence(feats)
    if conf * 100 < min_confidence_pct:
        return 'N'
    return v

In [None]:
# Fill in the `sentiment` column of the unlabelled rows by running sentiment() on
# each `cleaned_text` value.
# sentiment() queries the ensemble twice per row (classify, then confidence), and
# each query runs every member classifier, so this cell can be slow on a large frame.
df_s['sentiment'] = df_s['cleaned_text'].apply(sentiment)

In [None]:
# Split the newly classified rows by predicted label; later cells sample from these.
df_op = df_s[df_s['sentiment'] == 'O']
df_su = df_s[df_s['sentiment'] == 'S']
df_neu = df_s[df_s['sentiment'] == 'N']

# Row counts per predicted label.
print("Oppose: ", len(df_op))
print("Support: ", len(df_su))
print("Neutral: ", len(df_neu))
print('====================')

# Hard-coded comparison figures.
# NOTE(review): presumably counts from an earlier, non-ML labelling pass — confirm
# their source, since they are not computed anywhere in this notebook.
print("""
Without ML
Oppose:  21537
Suport:  34007
Neutral:  58620""")

In [None]:
# Stack the machine-classified rows (df_s) on top of the originally labelled rows
# (df_t) and write the result to CSV without the index column.
# df_t, not df_train_final, is used here, so labelled rows marked 'N' are included.
df_final_arabic = pd.concat([df_s, df_t])
df_final_arabic.to_csv("df_arabic_classified.csv", index = False)

In [None]:
# Print up to 100 randomly chosen tweets that were labelled 'N', for a manual check.
# Rebind instead of resetting in place: df_neu was obtained by filtering df_s.
df_neu = df_neu.reset_index(drop=True)
# Draw from every row position (0 .. len-1); the previous range stopped one short,
# so the last row could never be picked. Capping the sample size avoids a
# ValueError from random.sample when fewer than 100 rows exist.
n = random.sample(range(len(df_neu)), min(100, len(df_neu)))
for i in n:
    # Single .loc lookup (row, column) rather than chained indexing.
    print(df_neu.loc[i, 'cleaned_text'])
    print("===============================")

In [None]:
# Print up to 100 randomly chosen tweets that were labelled 'O', for a manual check.
# Rebind instead of resetting in place: df_op was obtained by filtering df_s.
df_op = df_op.reset_index(drop=True)
# Draw from every row position (0 .. len-1); the previous range stopped one short,
# so the last row could never be picked. Capping the sample size avoids a
# ValueError from random.sample when fewer than 100 rows exist.
n = random.sample(range(len(df_op)), min(100, len(df_op)))
for i in n:
    # Single .loc lookup (row, column) rather than chained indexing.
    print(df_op.loc[i, 'cleaned_text'])
    print("===============================")

In [None]:
# Print up to 100 randomly chosen tweets that were labelled 'S', for a manual check.
# Rebind instead of resetting in place: df_su was obtained by filtering df_s.
df_su = df_su.reset_index(drop=True)
# Draw from every row position (0 .. len-1); the previous range stopped one short,
# so the last row could never be picked. Capping the sample size avoids a
# ValueError from random.sample when fewer than 100 rows exist.
n = random.sample(range(len(df_su)), min(100, len(df_su)))
for i in n:
    # Single .loc lookup (row, column) rather than chained indexing.
    print(df_su.loc[i, 'cleaned_text'])
    print("===============================")