In [1]:
# -*- coding: utf-8 -*-

import os
import sys
import re
import pickle
import string

import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import seaborn as sns

# Scikit Learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize.regexp import RegexpTokenizer

In [2]:
import camel_tools
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.generator import Generator
from camel_tools.morphology.reinflector import Reinflector

from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.dediac import dediac_ar

from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize

In [3]:
TEST_SIZE = 0.2
SEED = 42

In [4]:
with open('raw_data/stopwords.txt') as f:
    stopwords = f.read().split('\n')

In [5]:
with open('raw_data/Culture/0000.txt') as f:
    text = f.read()
## Analyzer  
# db = MorphologyDB.builtin_db()
# analyzer = Analyzer(db)

text_norm = normalize_unicode(text)
print(text, text_norm, sep='\n\n')

ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوان باقة الفن، وذلك عند الساعة السابعة من مساء اليوم في مقر المعهد في منطقة الفنون في حي الشويهين في الشارقة، وتتلاقى في المعرض إبداعات 62 طالباً من المنتسبين للدراسة في المعهد في كافة التخصصات الفنية .

ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوان باقة الفن، وذلك عند الساعة السابعة من مساء اليوم في مقر المعهد في منطقة الفنون في حي الشويهين في الشارقة، وتتلاقى في المعرض إبداعات 62 طالباً من المنتسبين للدراسة في المعهد في كافة التخصصات الفنية .


In [6]:
re_tokenizer = RegexpTokenizer(r'[\u0621-\u064A]+')
re_tokens = re_tokenizer.tokenize(text_norm)

tokens_filtered = list(filter(lambda x: (x not in string.punctuation) and (x not in stopwords), re_tokens))

mled = MLEDisambiguator.pretrained()
disambig = mled.disambiguate(tokens_filtered)

final_tokens = [d.analyses[0].analysis['lex'] for d in disambig]
print(*final_tokens, sep=',  ')

نَظَّم,  مَعْهَد,  شارِقَة,  فَنّ,  مَعْرِض,  فَنِّيّ,  عُنْوان,  باقَة,  فَنّ,  ذٰلِكَ,  ساعَة,  سابِع,  مَقَرّ,  مَعْهَد,  مِنْطَقَة,  فَنّ,  حَيّ,  الشويهين,  شارِقَة,  تَلاقَى,  مَعْرِض,  إِبْداع,  طالِب,  مُنْتَسِب,  دِراسَة,  مَعْهَد,  كافَّة,  تَخَصُّص,  فَنِّيّ


In [7]:
def tokenize(text):
    text_norm = normalize_unicode(text)
    
    re_tokenizer = RegexpTokenizer(r'[\u0621-\u064A]+')
    tokens_re = re_tokenizer.tokenize(text_norm)
    
    tokens_filtered = list(filter(lambda x: x not in stopwords, tokens_re))
        
    mled = MLEDisambiguator.pretrained()
    disambig = mled.disambiguate(tokens_filtered)
    tokens_lem = [d.analyses[0].analysis['lex'] for d in disambig]
    
    return tokens_lem

In [8]:
tokenize(text)

['نَظَّم',
 'مَعْهَد',
 'شارِقَة',
 'فَنّ',
 'مَعْرِض',
 'فَنِّيّ',
 'عُنْوان',
 'باقَة',
 'فَنّ',
 'ذٰلِكَ',
 'ساعَة',
 'سابِع',
 'مَقَرّ',
 'مَعْهَد',
 'مِنْطَقَة',
 'فَنّ',
 'حَيّ',
 'الشويهين',
 'شارِقَة',
 'تَلاقَى',
 'مَعْرِض',
 'إِبْداع',
 'طالِب',
 'مُنْتَسِب',
 'دِراسَة',
 'مَعْهَد',
 'كافَّة',
 'تَخَصُّص',
 'فَنِّيّ']

## Full Corpus

In [9]:
with open('raw_data/corpus_df.pkl', 'rb') as f:
    corpus_df = pickle.load(f)

In [10]:
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45500 entries, 0 to 45499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cls     45500 non-null  object
 1   text    45500 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [11]:
from sklearn.model_selection import train_test_split
X = corpus_df.text
y = corpus_df.cls

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)

In [12]:
sample = X_train.sample(frac=0.01, random_state=SEED)

In [13]:
countvec = CountVectorizer(
    lowercase=False,
    stop_words=stopwords,
    max_features=10000
)

In [14]:
X_train_cv = countvec.fit_transform(X_train)
X_test_cv = countvec.transform(X_test)



In [15]:
pd.DataFrame(X_train_cv.todense(), columns=countvec.vocabulary_)

Unnamed: 0,تشير,خبراء,برامج,الحماية,أجهزة,الكمبيوتر,الإمارات,ارتفاع,معدلات,الهجمات,...,بلاتيني,التطعيم,توام,مشغلي,المصاحبة,كردستان,النمل,دارفور,بوسكي,حديثي
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36395,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
36396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
nb = MultinomialNB()

nb.fit(X_train_cv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
y_pred = nb.predict(X_train_cv)
y_pred_test = nb.predict(X_test_cv)

In [18]:
from sklearn.metrics import classification_report

In [19]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

     culture       0.91      0.97      0.94      5200
     finance       0.99      0.91      0.95      5200
     medical       0.95      0.99      0.97      5200
    politics       0.97      0.98      0.97      5200
    religion       0.97      0.89      0.93      5200
      sports       1.00      0.98      0.99      5200
        tech       0.92      0.98      0.95      5200

    accuracy                           0.96     36400
   macro avg       0.96      0.96      0.96     36400
weighted avg       0.96      0.96      0.96     36400



In [20]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

     culture       0.91      0.96      0.93      1300
     finance       0.99      0.90      0.95      1300
     medical       0.94      0.98      0.96      1300
    politics       0.97      0.97      0.97      1300
    religion       0.97      0.88      0.92      1300
      sports       1.00      0.98      0.99      1300
        tech       0.90      0.98      0.94      1300

    accuracy                           0.95      9100
   macro avg       0.95      0.95      0.95      9100
weighted avg       0.95      0.95      0.95      9100



In [23]:
with open('outputs/first_model.pkl', 'wb') as m:
    pickle.dump(nb, m)