# Sentiment Analysis in Arabic tweets using sklearn ML algorithms and 1,2,3 gram features

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import sklearn 
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import random

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print('\n'.join(os.listdir("../input")))

# Any results you write to the current directory are saved as output.

kaust-arabic-nlp
arabicnlp


# define functions

In [4]:

def load2(file):
#     pos_training = '../input/train_Arabic_tweets_positive_20190413.tsv'
#     neg_training = '../input/train_Arabic_tweets_negative_20190413.tsv'
    
#     pos_train_data, pos_train_labels = read_tsv(pos_train_file)
#     neg_train_data, neg_train_labels = read_tsv(neg_train_file)

    df_main = pd.read_csv(file)
    df_main = df_main.sample(frac=1)
    df_main['sentiment'] = df_main['sentiment'].str.lower()
    df_main = df_main.replace(['positive', 'negative', 'neutral'], [0, 1, 2])
    
    
    df_test = df_main[50000:]
    df_train = df_main[:50000]

    x_train = df_train['text'].tolist()
    y_train = df_train['sentiment'].tolist()

    x_test = df_test['text'].tolist()
    y_test = df_test['sentiment'].tolist()
    
   
    
    print("data sample")
#     print(df_main.head())
#     print('train data: # of pos:{}\t# of neg:{}\t'.format(y_train.count('pos'), y_train.count('neg')))
#     print('test data: # of pos:{}\t# of neg:{}\t'.format(y_test.count('pos'), y_test.count('neg')))
#     print('number of classese = ' + str())
    print('------------------------------------')
    return x_train, y_train, x_test, y_test

####################################################

def do_sa(n, my_classifier, name, my_data):
    x_train, y_train, x_test, y_test = my_data
    print('parameters')
    print('n grams:', n)
    print('classifier:', my_classifier.__class__.__name__)
    print('------------------------------------')

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', my_classifier),
    ])

# TfidfVectorizer(min_df=0.0001, max_df=0.95,
#                                  analyzer='word', lowercase=True,
#                                  ngram_range=(1, n))),
    pipeline.fit(x_train, y_train)
    feature_names = pipeline.named_steps['vect'].get_feature_names()

    y_predicted = pipeline.predict(x_test)

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=['pos', 'neg', 'neut']))

    # Print the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    print('# of features:', len(feature_names))
    print('sample of features:', random.sample(feature_names, 40))
    f1 = f1_score(y_test, y_predicted, average = None)
    accuracy = accuracy_score(y_test, y_predicted)
    precision = (precision_score(y_test, y_predicted, average="weighted"))
    recall =  (recall_score(y_test, y_predicted, average=None))
    return name, n, accuracy, precision, recall, (f1[0] + f1[1])/2


In [5]:
dataset = load2("../input/kaust-arabic-nlp/full_labeled_data.csv")
# dataset[0]

data sample
------------------------------------


In [6]:
labels = dataset[1]
print("TRAIN DATASET")
print("Positve, negative , neutral count = ")
print(labels.count(0), labels.count(1), labels.count(2))
print('-------------------------------------')
labels = dataset[3]
print("TEST DATASSET")
print("Positve, negative , neutral count = ")
print(labels.count(0), labels.count(1), labels.count(2))

TRAIN DATASET
Positve, negative , neutral count = 
8029 8051 33920
-------------------------------------
TEST DATASSET
Positve, negative , neutral count = 
792 769 3439


# Setup experiments 

In [9]:
ngrams = (1, 2, 3)
results = []


classifiers = [SVC(), LogisticRegression(),
               RandomForestClassifier(max_depth=20, n_estimators=10, max_features=3),
               KNeighborsClassifier(5)
               ]
for g in ngrams:
    dataset = load2("../input/kaust-arabic-nlp/full_labeled_data.csv")
    for alg in classifiers:
        alg_name = alg.__class__.__name__
        r = do_sa(g, alg, alg_name, dataset)
        results.append(r)
        

data sample
------------------------------------
parameters
n grams: 1
classifier: LinearSVC
------------------------------------
              precision    recall  f1-score   support

         pos       0.55      0.48      0.51       812
         neg       0.53      0.43      0.47       809
        neut       0.80      0.86      0.82      3379

    accuracy                           0.73      5000
   macro avg       0.62      0.59      0.60      5000
weighted avg       0.71      0.73      0.72      5000

[[ 393   66  353]
 [  76  345  388]
 [ 249  238 2892]]
# of features: 112635
sample of features: ['بالعناية', 'يكشف', 'التعاقدات', 'ومسيان', 'إحزاني', 'أتركك', 'النحاسية', 'دكاتره', 'مديني', 'الملهى', 'أحل', '63', 'بتجميع', 'julb9xyymg', 'فيلق', 'اطلعي', 'فهدنا', 'چوى', 'وكفيل', 'يعقل', '0vjtfmce3o', 'حرمته', 'استخيرو', 'البريد', 'ومشاءاللهه', 'لمختلف', 'الرشيدة', 'يأمر', 'متاخرين', 'والاطفال', 'جاهك', 'نيتكم', 'الحقـد', 'بريلش', 'حاجة', 'سلطان_الغنام', 'دنيئه', 'حيقابلك', 'الإنتظار',

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         pos       0.61      0.44      0.52       812
         neg       0.59      0.40      0.48       809
        neut       0.79      0.90      0.84      3379

    accuracy                           0.74      5000
   macro avg       0.66      0.58      0.61      5000
weighted avg       0.73      0.74      0.73      5000

[[ 361   47  404]
 [  57  327  425]
 [ 170  177 3032]]
# of features: 112635
sample of features: ['ب90000الف', 'gcfrphmlww', 'قيصري', 'الوتر_جنة_القلوب', 'نعيها', 'لبتس', 'واجعلهم', 'alsaifgallery', 'xopjmzjqj2', 'yarb44360835', 'توجل', 'لتغطية', 'مضحك', '٢٠٣٠', 'قبيلته', 'وتميل', 'أبلغت', 'أعر', 'nالحين', 'يستاهل', 'n2402347971', 'الفاتور', 'لمباشرة', 'الغموض', 'تسمعيها', 'جامعية', 'الانصاف', 'shaggiekay', 'يجزيكم', '0544779488', 'نوته', 'تاريخ٢', 'برياضة', '٧٧', 'والمحل', 'وطارت', 'استعملت', 'ارضيتي', 'سأخبركم', 'اتفضلي']
parameters
n grams: 1
classifier: RandomForestClassifier
--------------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         pos       0.00      0.00      0.00       812
         neg       0.00      0.00      0.00       809
        neut       0.68      1.00      0.81      3379

    accuracy                           0.68      5000
   macro avg       0.23      0.33      0.27      5000
weighted avg       0.46      0.68      0.55      5000

[[   0    0  812]
 [   0    0  809]
 [   0    0 3379]]
# of features: 112635
sample of features: ['الشد', 'زهران', 'الإستفاده', 'دمت', 'يخبر', 'الافتتاح', 'الوو', 'alshammry_saja', 'شاي', 'تنتحر', 'اللافت', 'y6nuiqxfjm', 'كفيلة', 'اقولهم', 'مستغانمي', 'احوالي', 'algadiarawi', 'ويخليه', 'coffeeraeq', 'tiribark_', 'لجوازي', 'غسلت', 'إنطفأ', 'ban7erhfc', 'وزادت', 'لمواجهه', 'حاسوبية', 'اكوون', 'أضخم', 'قليلا', 'يأسك', 'شقو', 'أكسب', 'تجر', 'بارجع', 'طفولته', 'لدخول', 'osuzdtplkm', 'بمكتب', 'أتجول']
parameters
n grams: 1
classifier: KNeighborsClassifier
------------------------------------
              precision   



              precision    recall  f1-score   support

         pos       0.56      0.48      0.51       796
         neg       0.52      0.42      0.46       830
        neut       0.79      0.85      0.82      3374

    accuracy                           0.72      5000
   macro avg       0.62      0.58      0.60      5000
weighted avg       0.71      0.72      0.71      5000

[[ 382   72  342]
 [  56  348  426]
 [ 250  248 2876]]
# of features: 112411
sample of features: ['a45xq3oo8v', 'اليوقا', 'mawhiba_care', 'xh5fzs8ie4', 'يبصره', 'abdulaziztf', 'التأثر', 'بلهيب', 'الطبيه', 'درهت', '893', 'اعيت', 'أهتمامگ', 'psycfact', 'بـعضنا', 'وعزل', 'جيكم', 'وحفظا', 'أدير', 'كشف', 'عمله', 'برعايه', 'نمبسط', 'يستانس', 'اللوجو', 'omaryou91129630', 'لث', 'احبـاب', 'rhlraetglc', 'e_balady', 'والتقاليد', 'نكتة', 'يهوي', 'بسلبياتك', 'فسوف', 'تـمـنـيت', 'ftj3fh1aph', 'المتفتح', 'الروم', 'رفح']
parameters
n grams: 2
classifier: SVC
------------------------------------
              precision    recall

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         pos       0.59      0.44      0.50       796
         neg       0.59      0.39      0.47       830
        neut       0.78      0.89      0.83      3374

    accuracy                           0.74      5000
   macro avg       0.66      0.57      0.60      5000
weighted avg       0.72      0.74      0.72      5000

[[ 348   54  394]
 [  48  326  456]
 [ 189  169 3016]]
# of features: 112411
sample of features: ['مخالفاتي', 'حقوقك', 'والمواطنين', 'wie0jqouex', 'nامر', 'أصدر', 'قراد', 'المتواصل', 'بنرفع', 'المشتكى', 'escapesaudi', 'احتري', 'لرسولنا', 'ماارتحت', 'نشتغل', 'المتبدل', 'واخد', 'shame', 'والسيطرة', 'ينفذ', 'صناعة_الترفيه', 'nهههههه', 'lo5h2jafkp', 'استشهاد', 'eztos9mfzl', 'توصف', 'جميييييله', 'العارض', 'ومساهماتهم', 'جعجعه', 'az__2133', 'قطـر', 'واستفيد', 'بزلة', 'خذاله', 'والإحتجاز', 'وتقب', 'هالصورة', '0019104', 'jbreel_faqyh90']
parameters
n grams: 2
classifier: RandomForestClassifier
--------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         pos       0.00      0.00      0.00       796
         neg       0.00      0.00      0.00       830
        neut       0.67      1.00      0.81      3374

    accuracy                           0.67      5000
   macro avg       0.22      0.33      0.27      5000
weighted avg       0.46      0.67      0.54      5000

[[   0    0  796]
 [   0    0  830]
 [   0    0 3374]]
# of features: 112411
sample of features: ['باسقاط', 'مابقدر', 'وماتغير', 'واخواتك', 'سكنها', 'معالفها', 'بالتوصيل', 'وإنتهى', 'التعرض', 'omarmuktar15', 'والطاعة', 'النهاءي', 'الكرتوني', 'تتصرف', 'nمابي', 'تجود', 'تطورنا', 'حجم', 'مسامحة', 'مآفي', 'رابطة_المتكممين', 'زيف', '٢٥٠٠', 'ستغرم', 'أسهب', 'بشفائه', 'rowadalkhaleej', 'بإعلان', 'ندري', 'o6imkl0bub', 'بالتوظيف', 'هتتركب', 'بدوخخخه', 'imamuelearn', 'انجلوس', 'مودريتش', 'متحمسة', 'ملوي', 'خسرناها', 'وجا']
parameters
n grams: 2
classifier: KNeighborsClassifier
------------------------------------
        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         pos       0.61      0.46      0.52       813
         neg       0.60      0.40      0.48       822
        neut       0.78      0.89      0.83      3365

    accuracy                           0.74      5000
   macro avg       0.67      0.58      0.61      5000
weighted avg       0.72      0.74      0.73      5000

[[ 371   42  400]
 [  54  328  440]
 [ 179  176 3010]]
# of features: 112511
sample of features: ['البطولية', 'القرمزية', 'الثور', 'إبصار', 'أتهم', 'تلخبطها', 'وذكريات', 'دمنه', '4q', 'والاغرب', 'انتحالف', 'بالسقف', 'فاتورة', 'حام', 'بموضوعية', 'جابه', 'بالسنة', 'ألهمني', 'بحياة', 'عقولهم', 'اعطاها', 'أرضاني', 'تعافت', 'w88jheeiac', 'vs3', 'حدوثها', 'املنا', 'البطانيات', 'سنتصل', 'وفرنسا', 'nأريد', 'الشخصية', 'للزلام', 'pc3', 'مـاتت', 'تهزر', 'الشآن', 'مستحقاته', 'تجمعنا', 'البلطان']
parameters
n grams: 3
classifier: RandomForestClassifier
------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         pos       0.00      0.00      0.00       813
         neg       0.00      0.00      0.00       822
        neut       0.67      1.00      0.80      3365

    accuracy                           0.67      5000
   macro avg       0.22      0.33      0.27      5000
weighted avg       0.45      0.67      0.54      5000

[[   0    0  813]
 [   0    0  822]
 [   0    0 3365]]
# of features: 112511
sample of features: ['إببرييل', 'يصبرها', 'مصارعة', 'ناااااس', 'هيتنسي', 'حسو', 'الندوة', 'الاجنبية', 'لخوياه', 'غريميو', 'القرار', 'اوده', 'باصصلي', 'أشوا', 'الموظف2460519040', 'والبدر', 'للدكه', 'مرتضي', 'مغلفة', 'nوكيفكم', 'nاسبوعين', 'وأقبلك', 'اجلالا', 'الخدم', 'بحوائج', 'i_2il', 'القوات', 'ازهرها', 'اصدار', 'مسموعة', 'بالحوار', '1jcryymga9', 'ميبقاش', 'تتعودين', 'خدو', 'واقولك', 'يدويا', 'aeyfq', 'غدت', 'بختهم']
parameters
n grams: 3
classifier: KNeighborsClassifier
------------------------------------
              precision    r

 #  Results Summary

In [18]:
print('{0:25}{1:10}{2:10}{3:15}{4:10}{5:10}{6:10}{7:10}{8:10}'.format('algorithm', 'ngram', 'accuracy', 'precision', 'p_rec','n_rec', 'u_rec', 'avg_rec','f1_pn'))
print('------------------------------------------------------------------------------------------------------------')
for r in results:
    print('{0:25}{1:10}{2:10.3f}{3:10.3f}{4:10.3f}{5:10.3f}{6:10.3f}{7:10.3f}{8:10.3f}'.format(r[0], r[1], r[2], r[3], r[4][0], r[4][1], r[4][2], sum(r[4])/3, r[5]))

algorithm                ngram     accuracy  precision      p_rec     n_rec     u_rec     avg_rec   f1_pn     
------------------------------------------------------------------------------------------------
LinearSVC                         1     0.726     0.713     0.484     0.426     0.856     0.589     0.493
SVC                               1     0.742     0.745     0.324     0.206     0.970     0.500     0.389
LogisticRegression                1     0.744     0.726     0.445     0.404     0.897     0.582     0.498
RandomForestClassifier            1     0.676     0.457     0.000     0.000     1.000     0.333     0.000
KNeighborsClassifier              1     0.673     0.618     0.166     0.061     0.941     0.389     0.170
LinearSVC                         2     0.721     0.707     0.480     0.419     0.852     0.584     0.490
SVC                               2     0.731     0.729     0.310     0.199     0.962     0.490     0.370
LogisticRegression                2     0.738     