In [4]:
# -*- coding: utf-8 -*-

import os
import sys
import re
import pickle
import string

import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import seaborn as sns

# Scikit Learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize.regexp import RegexpTokenizer

from tools import misc

In [2]:
import camel_tools
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.generator import Generator
from camel_tools.morphology.reinflector import Reinflector

from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.dediac import dediac_ar

from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize

In [7]:
data = misc.load('raw_data/corpus_df.pkl')
X_train, X_test, y_train, y_test = train_test_split(data.text, data.cls, random_state=misc.SEED, test_size=0.2)

In [8]:
from pprint import pprint
from time import time

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

param_grid = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.1, 0.5, 1)
}

def main():
    grid_search = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1)
    
    print("Grid Searching...")
    print("PIPELINE:")
    print(*[f'\t{name}' for name, _ in pipeline.steps], sep='\n')
    print("PARAMS:")
    pprint(param_grid)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print(f'done in {time() - t0:.3f}')
    print()
    
    print(f'Best score: {grid_search.best_score_}')
    print("Best parameters:")
    best_parameters = grid_search.best_estimator_.get_params()
    pprint(best_parameters)
main()

Grid Searching...
PIPELINE:
	vect
	tfidf
	clf
PARAMS:
{'clf__alpha': (0.1, 0.5, 1),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0)}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  8.2min finished


done in 500.847

Best score: 0.960576923076923
Best parameters:
{'clf': MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True),
 'clf__alpha': 0.1,
 'clf__class_prior': None,
 'clf__fit_prior': True,
 'memory': None,
 'steps': [('vect',
            CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)),
           ('tfidf',
            TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
           ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))],
 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': T

In [5]:
with open('raw_data/Culture/0000.txt') as f:
    text = f.read()
## Analyzer  
# db = MorphologyDB.builtin_db()
# analyzer = Analyzer(db)

text_norm = normalize_unicode(text)
print(text, text_norm, sep='\n\n')

ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوان باقة الفن، وذلك عند الساعة السابعة من مساء اليوم في مقر المعهد في منطقة الفنون في حي الشويهين في الشارقة، وتتلاقى في المعرض إبداعات 62 طالباً من المنتسبين للدراسة في المعهد في كافة التخصصات الفنية .

ينظم معهد الشارقة للفنون معرضاً فنياً تحت عنوان باقة الفن، وذلك عند الساعة السابعة من مساء اليوم في مقر المعهد في منطقة الفنون في حي الشويهين في الشارقة، وتتلاقى في المعرض إبداعات 62 طالباً من المنتسبين للدراسة في المعهد في كافة التخصصات الفنية .


In [6]:
re_tokenizer = RegexpTokenizer(r'[\u0621-\u064A]+')
re_tokens = re_tokenizer.tokenize(text_norm)

tokens_filtered = list(filter(lambda x: (x not in string.punctuation) and (x not in stopwords), re_tokens))

mled = MLEDisambiguator.pretrained()
disambig = mled.disambiguate(tokens_filtered)

final_tokens = [d.analyses[0].analysis['lex'] for d in disambig]
print(*final_tokens, sep=',  ')

نَظَّم,  مَعْهَد,  شارِقَة,  فَنّ,  مَعْرِض,  فَنِّيّ,  عُنْوان,  باقَة,  فَنّ,  ذٰلِكَ,  ساعَة,  سابِع,  مَقَرّ,  مَعْهَد,  مِنْطَقَة,  فَنّ,  حَيّ,  الشويهين,  شارِقَة,  تَلاقَى,  مَعْرِض,  إِبْداع,  طالِب,  مُنْتَسِب,  دِراسَة,  مَعْهَد,  كافَّة,  تَخَصُّص,  فَنِّيّ


In [7]:
def tokenize(text):
    text_norm = normalize_unicode(text)
    
    re_tokenizer = RegexpTokenizer(r'[\u0621-\u064A]+')
    tokens_re = re_tokenizer.tokenize(text_norm)
    
    tokens_filtered = list(filter(lambda x: x not in stopwords, tokens_re))
        
    mled = MLEDisambiguator.pretrained()
    disambig = mled.disambiguate(tokens_filtered)
    tokens_lem = [d.analyses[0].analysis['lex'] for d in disambig]
    
    return tokens_lem

In [8]:
tokenize(text)

['نَظَّم',
 'مَعْهَد',
 'شارِقَة',
 'فَنّ',
 'مَعْرِض',
 'فَنِّيّ',
 'عُنْوان',
 'باقَة',
 'فَنّ',
 'ذٰلِكَ',
 'ساعَة',
 'سابِع',
 'مَقَرّ',
 'مَعْهَد',
 'مِنْطَقَة',
 'فَنّ',
 'حَيّ',
 'الشويهين',
 'شارِقَة',
 'تَلاقَى',
 'مَعْرِض',
 'إِبْداع',
 'طالِب',
 'مُنْتَسِب',
 'دِراسَة',
 'مَعْهَد',
 'كافَّة',
 'تَخَصُّص',
 'فَنِّيّ']

## Full Corpus

In [9]:
with open('raw_data/corpus_df.pkl', 'rb') as f:
    corpus_df = pickle.load(f)

In [10]:
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45500 entries, 0 to 45499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cls     45500 non-null  object
 1   text    45500 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [11]:
from sklearn.model_selection import train_test_split
X = corpus_df.text
y = corpus_df.cls

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)

In [12]:
sample = X_train.sample(frac=0.01, random_state=SEED)

In [13]:
countvec = CountVectorizer(
    lowercase=False,
    stop_words=stopwords,
    max_features=10000
)

In [14]:
X_train_cv = countvec.fit_transform(X_train)
X_test_cv = countvec.transform(X_test)



In [15]:
pd.DataFrame(X_train_cv.todense(), columns=countvec.vocabulary_)

Unnamed: 0,تشير,خبراء,برامج,الحماية,أجهزة,الكمبيوتر,الإمارات,ارتفاع,معدلات,الهجمات,...,بلاتيني,التطعيم,توام,مشغلي,المصاحبة,كردستان,النمل,دارفور,بوسكي,حديثي
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36395,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
36396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
