# Model Building

In [17]:
import pickle

import nltk

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

from tools import misc

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
with open('raw_data/corpus_df.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
PATTERN = r'[\u0621-\u064A]+'
MAX_FEATURES = 10_000
STOPWORDS_URL = 'https://raw.githubusercontent.com/mohataher/arabic-stop-words/master/list.txt'
TEST_SIZE = 0.2
SEED = 42

In [4]:
data.head()

Unnamed: 0_level_0,cls,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,sports,أعلن المدرب النمسوي لبوروسيا دورتموند بيتر شتو...
1,sports,ذكرت وسائل الإعلام البلغارية الجمعة ان العداءة...
2,sports,برز اسم نجم مانشستر يونايتد رايان غيغز (36 عام...
3,sports,قال مدرب نادي انتر ميلان الإيطالي خوزيه موريني...
4,sports,بيتر تشيك: حارس مرمى تشيكي ولد في 20 مايو عام ...


In [5]:
X = data['text']
y = data['cls']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=TEST_SIZE, 
                                                    random_state=SEED,)

### Vectorizers

In [7]:
stopwords = misc.get_stopwords(STOPWORDS_URL)

In [8]:
countvec = CountVectorizer(
    lowercase=False,
    stop_words=stopwords,
    strip_accents='unicode',
    token_pattern=PATTERN,
    max_features=MAX_FEATURES
)

tfidfvec = TfidfVectorizer(
    lowercase=False,
    stop_words=stopwords,
    strip_accents='unicode',
    token_pattern=PATTERN,
    max_features=MAX_FEATURES
)

In [9]:
# X_train_cv = countvec.fit_transform(X_train)
# X_test_cv = countvec.transform(X_test)
# cv_vocab = countvec.vocabulary_

# X_train_tf = tfidfvec.fit_transform(X_train)
# X_test_tf = tfidfvec.transform(X_test)
# tf_vocab = tfidfvec.vocabulary_

### Model Pipelines

In [10]:
pipe_nb = Pipeline([
    ('Count Vectorizer', countvec),
    ('Naive Bayes', MultinomialNB())
], verbose=True)

pipe_rfc = Pipeline([
    ('Count Vectorizer', countvec),
    ('Random Forest Classifier', RandomForestClassifier())
], verbose=True)

pipe_gbc = Pipeline([
    ('Count Vectorizer', countvec),
    ('Gradient Boosting Classifier', GradientBoostingClassifier())
], verbose=True)

pipe_svc = Pipeline([
    ('Count Vectorizer', countvec),
    ('Support Vector Classifier', SVC())
], verbose=True)

In [11]:
pipe_nb_tf = Pipeline([
    ('TF-IDF Vectorizer', tfidfvec),
    ('Naive Bayes', MultinomialNB())
], verbose=True)

pipe_rfc_tf = Pipeline([
    ('TF-IDF Vectorizer', tfidfvec),
    ('Random Forest Classifier', RandomForestClassifier())
], verbose=True)

pipe_gbc_tf = Pipeline([
    ('TF-IDF Vectorizer', tfidfvec),
    ('Gradient Boosting Classifier', GradientBoostingClassifier())
], verbose=True)

pipe_svc_tf = Pipeline([
    ('TF-IDF Vectorizer', tfidfvec),
    ('Support Vector Classifier', SVC())
], verbose=True)

In [12]:
pipelines_cv = [pipe_nb, pipe_rfc, pipe_gbc, pipe_svc]
pipelines_tf = [pipe_nb_tf, pipe_rfc_tf, pipe_gbc_tf, pipe_svc_tf]

In [13]:
pipelines_fitted_cv = []
pipelines_fitted_tf = []

for i in range(len(pipelines_cv)):
    cv_fitted = pipelines_cv[i].fit(X_train, y_train)
    pipelines_fitted_cv.append(cv_fitted)
    misc.save(cv_fitted, f'outputs/models/cv{i}.pkl')
    
    tf_fitted = pipelines_cv[i].fit(X_train, y_train)
    pipelines_fitted_tf.append(tf_fitted)
    misc.save(tf_fitted, f'outputs/models/tfidf{i}.pkl')

[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.8s
[Pipeline] ....... (step 2 of 2) Processing Naive Bayes, total=   0.1s
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.4s
[Pipeline] ....... (step 2 of 2) Processing Naive Bayes, total=   0.1s
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.6s
[Pipeline]  (step 2 of 2) Processing Random Forest Classifier, total=  56.8s
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.7s
[Pipeline]  (step 2 of 2) Processing Random Forest Classifier, total=  56.3s
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.5s
[Pipeline]  (step 2 of 2) Processing Gradient Boosting Classifier, total= 3.3min
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  16.1s
[Pipeline]  (step 2 of 2) Processing Gradient Boosting Classifier, total= 3.4min
[Pipeline] .. (step 1 of 2) Processing Count Vectorizer, total=  15.8s
[Pipeline]  (step 2 of 2) Processing Support 

In [16]:
import matplotlib.
import seaborn as sns

def plot_confusion_matrices(model, X_train, X_test, y_train, y_test):
    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)
    
    cm_train = confusion_matrix(y_train, y_hat_train, labels=misc.CLASSES)
    cm_test = confustion_matrix(y_test, y_hat_test, labels=misc.CLASSES)
    
    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(15, 8))
    
    sns.heatmap(
        
    )
    
    

array(['culture', 'finance', 'medical', 'politics', 'religion', 'sports',
       'tech'], dtype='<U8')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

fig, axes = plt.subplots(1, 2, figsize=(15, 8))
sns.heatmap(
    data=confusion_matrix(y_train, y_hat_trains[0], labels=pipelines_fitted[0].steps[1][1].classes_),
    annot=True,
    cmap='Blues',
    fmt='.0f',
    ax=axes[0]
)
axes[0].set_title('Training')
print(pipelines_fitted[0].steps[1][1].classes_)