# Arabic Text Classification
coded by: Haidhi Angkawijana Tedja <br>
email : haidhiangkawijana@gmail.com

In this project I did some experiment with several vectorization method such as TF-IDF, CountVectorized, and word2vec. The final result is word2vec not good enough if we use classic machine learning as model, but it's good enough if we use it with neural network like LSTM, RNN, etc. Decision tree also isn't good enough for text classification task, due to it's disadvantages *Curse of dimensionality*

The datasets I used:
1. SANAD : Single-label Arabic News Articles Dataset
2. HARD : hotel reviews in Arabic language
3. OCLAR : Opinion Corpus for Lebanese Arabic Reviews


# Depedencies

In [1]:
%%time

import re
import nltk
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from mlxtend.classifier import StackingClassifier

from gensim.models import Word2Vec

import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

CPU times: total: 9.02 s
Wall time: 22.6 s


# FUNCTION

In [5]:
#using base model only
def baseModelOnly(Xtrain,Xtest,ytrain,ytest, includeNB=True):
  logreg = LogisticRegression()
  logreg.fit(Xtrain, ytrain)
  logreg_acc = accuracy_score(ytest,logreg.predict(Xtest))

  svm = LinearSVC()
  svm.fit(Xtrain, ytrain)
  svm_acc = accuracy_score(ytest,svm.predict(Xtest))

  dt = DecisionTreeClassifier()
  dt.fit(Xtrain, ytrain)
  dt_acc = accuracy_score(ytest,dt.predict(Xtest))

  final_result = [logreg_acc,svm_acc,dt_acc]

  if includeNB == True:
    nb = MultinomialNB()
    nb.fit(Xtrain, ytrain)
    nb_acc = accuracy_score(ytest,nb.predict(Xtest))
    final_result.append(nb_acc)

  return final_result


In [6]:
def make_corpus(data_series):
  word_list = data_series.apply(lambda x: x.split())
  corpus = []
  for i in word_list:
    corpus.append(i)
  return corpus

In [7]:
def sentence2vectorSum(text, model, aggregate='sum'):
  text = text.split()
  sentence_vector = []
  for word in text:
    if word in model.wv.key_to_index:
      sentence_vector.append(model.wv[word])

  if sentence_vector != []:
    len_of_vector = len(sentence_vector[0])
    matrics = np.array(sentence_vector)
    max_vector = []
    min_vector = []
    average_vector = []
    sum_vector = []
    for num in range(0,len_of_vector):
        max_vector.append(max(matrics[:, num]))
        min_vector.append(min(matrics[:, num]))
        average_vector.append(np.mean(matrics[:, num]))
        sum_vector.append(np.sum(matrics[:, num]))

    if aggregate == 'sum':
        return max_vector
    elif aggregate == 'average':
        return average_vector
    elif aggregate == 'max':
        return max_vector
    elif aggregate == 'min':
        return min_vector

  else:
    VECTOR_SIZE = model.vector_size
    return np.array([0 for i in range(0,VECTOR_SIZE+1)])

# OCLAR

In [2]:
df = pd.read_csv("OCLAR_CLEAN.csv")

df = df.dropna()
df['category_encoded'] = df['rating']-1
df['category_encoded'].unique()

print(f"Jumlah Label : {len(df['rating'].unique())}")
print(len(df))
df.head()

Jumlah Label : 5
3895


Unnamed: 0,review,rating,clean,category_encoded
0,هذا الفندق ينقصه بعض الاشياء داخل الغرف مثلا ع...,2,ندق نقص شيء دخل غرف ثلا وضح قنو تلفزيونية عطل ...,1
1,لطيف ولكن الغرف الفندقية تحتاج صيانة كادر الخد...,4,لطف غرف ندق حاج صين كدر خدم يجب ستى طلب,3
2,مكان جميل جدا وحسن الخلق والضيافه,5,جمل وحس خلق ضيف,4
3,بحاجة الى اعادة تأهيل للمفروشات,3,بحج أهل فرش,2
4,فندق ممتاز ومعاملة راقية جدا,5,ندق متز عمل رقي,4


In [9]:
X = df['clean']
y = df['category_encoded']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [10]:
#tfidf
tfidf = TfidfVectorizer()
X_train_idf = tfidf.fit_transform(X_train)
X_test_idf = tfidf.transform(X_test)

#count vectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

tfidf_clf = baseModelOnly(X_train_idf,X_test_idf,y_train,y_test)
cv_clf = baseModelOnly(X_train_cv,X_test_cv,y_train,y_test)

oclar_base_model_result = pd.DataFrame({'TFIDF':tfidf_clf,'CV':cv_clf})
oclar_base_model_result.index = ['Logreg','SVM','Decision Tree','NB']
oclar_base_model_result.style.background_gradient(cmap ='Greens')


Unnamed: 0,TFIDF,CV
Logreg,0.617458,0.621309
SVM,0.599487,0.598203
Decision Tree,0.544288,0.563543
NB,0.593068,0.608472


## OCLAR Word2Vec

In [8]:
oclar_corpus = make_corpus(df.clean)
max_len = max([len(sentence) for sentence in oclar_corpus])
print('MAX LEN : {}'.format(max_len))

MAX LEN : 227


In [9]:
w2v_oclar = Word2Vec(min_count=10,window=15,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=25)
w2v_oclar.build_vocab(oclar_corpus, progress_per=10000)
w2v_oclar.train(oclar_corpus, total_examples=w2v_oclar.corpus_count, epochs=30, report_delay=1)

(78553, 695910)

In [10]:
AGG_FUNC = ['min','max','sum','average']

all_aggregate_result = []
for func in AGG_FUNC:
  oclar_w2v = [sentence2vectorSum(i, w2v_oclar, aggregate=func) for i in df['clean']]
  oclar_w2v = pd.DataFrame(oclar_w2v).drop(100, axis=1)

  X = oclar_w2v
  y = df['category_encoded']

  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
  result = baseModelOnly(X_train,X_test,y_train,y_test, includeNB=False)
  """
  Ntar jadi nya gini:
  [[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb]]
  setiap list itu buat satu aggregate function & setiap list itu dibuat perkolom
  """
  all_aggregate_result.append(result)


OCLAR_W2V_FINAL_RESULT = pd.DataFrame({AGG_FUNC[i]: all_aggregate_result[i] for i in range(len(AGG_FUNC))})
OCLAR_W2V_FINAL_RESULT.index = ['Logreg','SVM','Decision Tree']
OCLAR_W2V_FINAL_RESULT.style.background_gradient(cmap ='Greens')

Unnamed: 0,min,max,sum,average
Logreg,0.620026,0.582798,0.578947,0.577664
SVM,0.620026,0.582798,0.578947,0.577664
Decision Tree,0.554557,0.508344,0.509628,0.50706


# HARD

In [3]:
df = pd.read_csv('HARD_CLEAN.csv')
df = df.dropna()
df['category_encoded'] = df['rating']-1

print(f"Jumlah Label : {len(df['rating'].unique())}")
print(len(df))
df.head()

Jumlah Label : 4
105618


Unnamed: 0,review,rating,clean,category_encoded
0,“ممتاز”. النظافة والطاقم متعاون.,2,متز نظف طقم تعا,1
1,استثنائي. سهولة إنهاء المعاملة في الاستقبال. ل...,5,استثنائي سهل عمل اشئ,4
2,استثنائي. انصح بأختيار الاسويت و بالاخص غرفه ر...,5,استثنائي نصح أختيار اسي اخص غرف رقم نوع ارض,4
3,“استغرب تقييم الفندق كخمس نجوم”. لا شي. يستحق ...,1,غرب قيم ندق كخمس نجم شي سحق نجم,0
4,جيد. المكان جميل وهاديء. كل شي جيد ونظيف بس كا...,4,جيد جمل هاديء شي جيد نظف بس حوض سبح عمل هذي فت...,3


In [13]:
X = df['clean']
y = df['category_encoded']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [14]:
#tfidf
tfidf = TfidfVectorizer()
X_train_idf = tfidf.fit_transform(X_train)
X_test_idf = tfidf.transform(X_test)

#count vectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

tfidf_clf = baseModelOnly(X_train_idf,X_test_idf,y_train,y_test)
cv_clf = baseModelOnly(X_train_cv,X_test_cv,y_train,y_test)

hard_base_model_result = pd.DataFrame({'TFIDF':tfidf_clf,'CV':cv_clf})
hard_base_model_result.index = ['Logreg','SVM','Decision Tree','NB']
hard_base_model_result.style.background_gradient(cmap ='Greens')

Unnamed: 0,TFIDF,CV
Logreg,0.707205,0.693382
SVM,0.689689,0.675866
Decision Tree,0.618775,0.618728
NB,0.627438,0.693145


## HARD word2vec

In [15]:
hard_corpus = make_corpus(df.clean)
max_len = max([len(sentence) for sentence in hard_corpus])
print('MAX LEN : {}'.format(max_len))

MAX LEN : 395


In [16]:
w2v_hard = Word2Vec(min_count=10,window=15,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=25)
w2v_hard.build_vocab(hard_corpus, progress_per=10000)
w2v_hard.train(hard_corpus, total_examples=w2v_hard.corpus_count, epochs=30, report_delay=1)

(17203318, 52493160)

### Train

In [17]:
all_aggregate_result = []
for func in AGG_FUNC:
  hard_w2v = [sentence2vectorSum(i, w2v_hard, aggregate=func) for i in df['clean']]
  hard_w2v = pd.DataFrame(hard_w2v).drop(100, axis=1)

  X = hard_w2v
  y = df['category_encoded']

  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
  result = baseModelOnly(X_train,X_test,y_train,y_test, includeNB=False)
  """
  Ntar jadi nya gini:
  [[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb]]
  setiap list itu buat satu aggregate function & setiap list itu dibuat perkolom
  """
  all_aggregate_result.append(result)


HARD_W2V_FINAL_RESULT = pd.DataFrame({AGG_FUNC[i]: all_aggregate_result[i] for i in range(len(AGG_FUNC))})
HARD_W2V_FINAL_RESULT.index = ['Logreg','SVM','Decision Tree']
HARD_W2V_FINAL_RESULT.style.background_gradient(cmap ='Greens')

Unnamed: 0,min,max,sum,average
Logreg,0.593496,0.576359,0.578536,0.653475
SVM,0.559884,0.567364,0.551979,0.645332
Decision Tree,0.538014,0.526463,0.520025,0.512829


# SANAD

In [4]:
df = pd.read_csv("SANAD_CLEAN.csv")
df['category_encoded'] = df['category'].map({'culture':0,'finance':1,'medical':2,'politics':3,'religion':4,'sports':5,'tech':6})

print(f"Jumlah Label : {len(df['category'].unique())}")
print(len(df))
df.head()

Jumlah Label : 7
45500


Unnamed: 0,text,category,clean,category_encoded
0,بيروت: «الخليج» وحدها القدس تتصدر غلاف العدد 1...,culture,يرو خلج وحد قدس صدر غلف جلة درس فلسطينية كتب ا...,0
1,هل هناك قصة جديدة إماراتية؟ ما الإضافات التي ت...,culture,قصة جدد اماراتية اضف سجل قصة لمح الخ سئل بتت ف...,0
2,موقف مقدّر ذلك الذي اتخذه اتحاد كتاب وأدباء ال...,culture,قدر تحد كتب أدباء امر نشر صحف شرح حدث شهد نطق ...,0
3,"القاهرة - ""الخليج"":التقى وزير الثقافة المصري د...",culture,قهر خليجالتقى وزر ثقف صري جبر عصفور وزر ثقف لن...,0
4,مسقط: «الخليج» عائشة الفزاري واسمها المستعار «...,culture,سقط خلج عئش فزر وسم عار خفا روح شعر عمن صيل كت...,0


In [19]:
X = df['clean']
y = df['category_encoded']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [20]:
#tfidf
tfidf = TfidfVectorizer()
X_train_idf = tfidf.fit_transform(X_train)
X_test_idf = tfidf.transform(X_test)

#count vectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

tfidf_clf = baseModelOnly(X_train_idf,X_test_idf,y_train,y_test)
cv_clf = baseModelOnly(X_train_cv,X_test_cv,y_train,y_test)

sanad_base_model_result = pd.DataFrame({'TFIDF':tfidf_clf,'CV':cv_clf})
sanad_base_model_result.index = ['Logreg','SVM','Decision Tree','NB']
sanad_base_model_result.style.background_gradient(cmap ='Greens')

Unnamed: 0,TFIDF,CV
Logreg,0.970879,0.965714
SVM,0.975055,0.961758
Decision Tree,0.872418,0.870549
NB,0.95022,0.94956


## SANAD word2vec

In [9]:
sanad_corpus = make_corpus(df.clean)
max_len = max([len(sentence) for sentence in sanad_corpus])
print('MAX LEN : {}'.format(max_len))

MAX LEN : 3816


In [10]:
%%time
w2v_sanad = Word2Vec(min_count=10,window=15,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=25)
w2v_sanad.build_vocab(sanad_corpus, progress_per=10000)
w2v_sanad.train(sanad_corpus, total_examples=w2v_sanad.corpus_count, epochs=30, report_delay=1)

CPU times: total: 40min 24s
Wall time: 16min 9s


(163561258, 339972570)

In [17]:
%%time
AGG_FUNC = ['min','max','sum','average']

all_aggregate_result = []
for func in AGG_FUNC:
  sanad_w2v = [sentence2vectorSum(i, w2v_sanad, aggregate=func) for i in df['clean']]
  sanad_w2v = pd.DataFrame(sanad_w2v)

  X = sanad_w2v
  y = df['category_encoded']

  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
  result = baseModelOnly(X_train,X_test,y_train,y_test, includeNB=False)
  """
  Ntar jadi nya gini:
  [[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb],[logreg,svm,dt,nb]]
  setiap list itu buat satu aggregate function & setiap list itu dibuat perkolom
  """
  all_aggregate_result.append(result)


SANAD_W2V_FINAL_RESULT = pd.DataFrame({AGG_FUNC[i]: all_aggregate_result[i] for i in range(len(AGG_FUNC))})
SANAD_W2V_FINAL_RESULT.index = ['Logreg','SVM','Decision Tree']
SANAD_W2V_FINAL_RESULT.style.background_gradient(cmap ='Greens')

CPU times: total: 1h 9min 1s
Wall time: 1h 12min 48s


Unnamed: 0,min,max,sum,average
Logreg,0.877912,0.886703,0.884945,0.964505
SVM,0.836923,0.847912,0.691429,0.963516
Decision Tree,0.734396,0.716264,0.713736,0.889451
