In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import SGDClassifier
import nltk
from joblib import dump, load
from collections import Counter
import pickle

In [None]:
# Load the training corpus; later cells read its `text` and `dialect` columns.
data = pd.read_csv(filepath_or_buffer="../data/train_data.csv")

### Collecting stop words

In [None]:
# Flatten every sentence into one list of tokens, splitting on single spaces
# after trimming leading/trailing whitespace.
all_words = [
    token
    for sentence in data.text.values
    for token in sentence.strip().split(" ")
]

# Counter.most_common() yields (word, count) pairs from most to least frequent;
# dict() keeps that order, so iterating `word_counts` walks words by frequency.
word_counts = dict(Counter(all_words).most_common())

In [None]:
# Number of most frequent tokens to treat as stop words.
n_stop_words = 26

# `word_counts` is ordered from most to least frequent, so its first
# `n_stop_words` keys are the stop words. Slicing the key list replaces a
# manual loop that enumerated the entire vocabulary (with the index/word pair
# misleadingly named key/value) only to keep the first few entries.
stop_words = list(word_counts)[:n_stop_words]

len(stop_words)

26

### Encoding labels
Classes are encoded with int labels from 0 to 17

In [None]:
# Learn the dialect classes, then overwrite the `dialect` column with their
# integer ids.
# NOTE: this cell is not idempotent. Re-running it fits the encoder on the
# already-encoded integers, so `le.classes_` would no longer hold the original
# dialect codes. Reload `data` before re-running.
le = LabelEncoder()
le.fit(data["dialect"])
data["dialect"] = le.transform(data["dialect"])
data.head()

Unnamed: 0,text,length,dialect
0,لكن بالنهايه ينتفض يغير,4,4
1,يعني هذا محسوب علي البشر حيونه وحشيه وتطلبون م...,15,4
2,مبين من كلامه خليجي,4,4
3,يسلملي مرورك وروحك الحلوه,4,4
4,وين هل الغيبه اخ محمد,5,4


In [None]:
# Map each integer label back to the class it encodes (position in
# `le.classes_` is the encoded id).
labels_dict = dict(enumerate(le.classes_))
print(labels_dict)

{0: 'AE', 1: 'BH', 2: 'DZ', 3: 'EG', 4: 'IQ', 5: 'JO', 6: 'KW', 7: 'LB', 8: 'LY', 9: 'MA', 10: 'OM', 11: 'PL', 12: 'QA', 13: 'SA', 14: 'SD', 15: 'SY', 16: 'TN', 17: 'YE'}


### Preparing data for training
- Splitting training and test data
- Initializing label probabilities for the ComplementNB model used

In [None]:
# Hold out 10% of the rows as a test set. The split is stratified on the
# encoded dialect so both parts keep the class proportions, and seeded so it
# is reproducible.
split_options = dict(test_size=0.1, random_state=42, stratify=data["dialect"])
train_X, test_X, train_y, test_y = train_test_split(
    data["text"],
    data["dialect"],
    **split_options,
)


In [None]:
# Number of (non-null) texts per encoded dialect, computed over the full
# dataset (train and test rows together).
labels = data.groupby("dialect")["text"].count()

# Relative frequency of each class, as a plain NumPy array ordered by label id.
total_texts = labels.sum()
prob = (labels / total_texts).values

### Creating and training the model
Creating a ComplementNB model tuned with a 10-fold cross-validated grid search. Text is represented as a TF-IDF matrix of unigrams and bigrams.

In [None]:
# Hyper-parameter grid for ComplementNB.
# `class_prior` is deliberately not searched: scikit-learn documents it as
# "Not used" for ComplementNB (priors only matter in the single-class edge
# case), so searching over [prob, None] fitted every candidate twice for
# identical scores.
param_grid = {
    "norm": [True, False],
    "alpha": np.arange(0.1, 1.0, 0.2),  # 0.1, 0.3, 0.5, 0.7, 0.9
}

# 10-fold cross-validated grid search, scored by accuracy, using all CPU cores.
cv = GridSearchCV(ComplementNB(), param_grid, cv=10, n_jobs=-1, scoring="accuracy")

# TF-IDF over unigrams and bigrams (minus the collected stop words) feeding the
# grid search. The vectorizer is fitted once on the whole training set, before
# the search splits it into folds.
model = Pipeline(
    steps=[
        ("count", TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)),
        ("model", cv),
    ]
)

In [None]:
# Fit the vectorizer and run the grid search on the raw training texts.
train_texts = train_X.values
model.fit(train_texts, train_y)

### Evaluating and saving the best model
Generating predictions for the test set and evaluating model performance with the accuracy score and a classification report. Experiments are tracked by appending each run to a simple .txt file.
The best model is then saved manually.

In [None]:
# Score the fitted pipeline on the held-out test set.
preds = model.predict(test_X)
score = accuracy_score(test_y, preds)

# One log entry per run: the pipeline configuration followed by its accuracy.
pipeline_repr = str(model.named_steps)
score_text = pipeline_repr + "\n  accuracy %s \n\n" % (score)

# Append (never overwrite) so earlier experiments stay in the log.
with open("results.txt", "a", encoding="utf-8") as file:
    file.write(score_text)

print(score_text)
print(classification_report(test_y, preds))

{'count': TfidfVectorizer(ngram_range=(1, 2),
                stop_words=['من', 'في', 'ما', 'اللي', 'و', 'علي', 'الله', 'بس',
                            'يا', 'انا', 'كل', 'مش', 'ولا', 'لا', 'والله', 'هه',
                            'ان', 'لو', 'شي', 'انت', 'مع', 'عن', 'كان', 'الي',
                            'ع', '؟']), 'model': GridSearchCV(cv=10, estimator=ComplementNB(), n_jobs=-1,
             param_grid={'alpha': array([0.1, 0.3, 0.5, 0.7, 0.9]),
                         'class_prior': [array([0.05745602, 0.05739255, 0.03530048, 0.12581362, 0.03382534,
       0.06085059, 0.09204081, 0.06025747, 0.07958749, 0.0252065 ,
       0.04171755, 0.09540474, 0.06790894, 0.05862037, 0.03135218,
       0.03544055, 0.02019234, 0.02163246]),
                                         None],
                         'norm': [True, False]},
             scoring='accuracy')}
  accuracy 0.5371079643693506 


              precision    recall  f1-score   support

           0       0.49      0.40 

In [None]:
# Persist the fitted pipeline (vectorizer + grid-searched classifier).
dump(model, "../models/cnb_054.joblib")

# Persist the integer-id -> dialect-code mapping next to the model.
# The file must be opened in binary *write* mode: pickle.dump() writes bytes,
# and a handle opened with "rb" is read-only (and fails outright when the file
# does not exist yet).
with open("../models/labels_dict.pickle", "wb") as file:
    pickle.dump(labels_dict, file, protocol=pickle.HIGHEST_PROTOCOL)
  

['cnb_054.joblib']

In [None]:
# cnb = load("cnb_053.joblib")

In [None]:
# accuracy_score(test_y,preds)

0.5365608106629314