In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import regex as re
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
import pickle

In [5]:
file_path = "../data/train_data.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,text,length,dialect
0,لكن بالنهايه ينتفض يغير,4,IQ
1,يعني هذا محسوب علي البشر حيونه وحشيه وتطلبون م...,15,IQ
2,مبين من كلامه خليجي,4,IQ
3,يسلملي مرورك وروحك الحلوه,4,IQ
4,وين هل الغيبه اخ محمد,5,IQ


### Building vocab, word2idx, idx2word

In [6]:
all_words = []
for sen in data.text.values:
  for word in sen.split():
    all_words.append(word)

word_counts = dict(Counter(all_words).most_common())
vocab = set(all_words)
vocab_size = len(vocab)

In [7]:
idx2word = {k:v for k, v in enumerate(vocab,1)}
word2idx = {v:k for k, v in idx2word.items()}
len(word2idx)

384718

In [8]:
def create_sequence(text):
  seq = []
  for word in text.split():
    seq.append(word2idx[word])

  return seq

In [9]:
data["sequence"] = data.text.apply(create_sequence)
data.head()

Unnamed: 0,text,length,dialect,sequence
0,لكن بالنهايه ينتفض يغير,4,IQ,"[172935, 140888, 295891, 35701]"
1,يعني هذا محسوب علي البشر حيونه وحشيه وتطلبون م...,15,IQ,"[360600, 197728, 357121, 376645, 350968, 14251..."
2,مبين من كلامه خليجي,4,IQ,"[337280, 114447, 264086, 97023]"
3,يسلملي مرورك وروحك الحلوه,4,IQ,"[251918, 350104, 263315, 251639]"
4,وين هل الغيبه اخ محمد,5,IQ,"[259902, 20333, 209035, 234356, 297674]"


### Removing invalid data

In [None]:
n_words = 25
stop_words = list(word_counts.keys())[:n_words]
stop_idx = [word2idx[w] for w in stop_words]


In [None]:
def remove_stop_words(seq):
  s = seq
  for el in s:
    if el in stop_words:
      s.remove(s)

  return s

In [None]:
data.sequence = data.sequence.apply(remove_stop_words)

In [None]:
data["length"] = data.sequence.str.len()

### Preparing data for training

In [None]:
longest_seq = data.length.max()
shortest_seq = data.length.min()
avg_seq = data.length.mean()

print("Longest text: %s\nShortest text: %s\nAverage text length: %s\n"%(longest_seq,shortest_seq,avg_seq))

In [None]:
seq_size = 20
embedding_size = 300

In [None]:
lb = LabelBinarizer()
labels = lb.fit_transform(data.dialect)
sequences = pad_sequences(data.sequence,maxlen=seq_size)

class_labels = {k:v for k,v in enumerate(lb.classes_,0)}

In [None]:
train_X, remain_X, train_y, remain_y = train_test_split(sequences,labels,train_size=0.9,stratify=labels,random_state=42)
test_X, val_X, test_y, val_y = train_test_split(remain_X,remain_y, train_size=0.5, stratify=remain_y, random_state=42)

print("Train size: %s\nTest size: %s\nValidation size: %s\n"%(len(train_X),len(test_X),len(val_X)))

In [None]:
train_data = tf.data.Dataset.from_tensor_slices((train_X, train_y))
test_data = tf.data.Dataset.from_tensor_slices((test_X,test_y))
val_data = tf.data.Dataset.from_tensor_slices((val_X,val_y))

In [None]:
batch_size = 1000

train_data = train_data.shuffle(100).batch(batch_size)
test_data = test_data.batch(batch_size)
val_data = val_data.shuffle(100).batch(batch_size)

### Creating the model

In [None]:
training_hist = pd.DataFrame(columns=["summary","loss","acc"])

In [None]:
model = keras.Sequential()

model.add(keras.layers.Embedding(vocab_size+1,embedding_size,embeddings_initializer=keras.initializers.GlorotNormal()))
model.add(keras.layers.LSTM(100,return_sequences=True))
model.add(keras.layers.LSTM(50))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(100,activation="relu"))
model.add(keras.layers.Dense(18,activation="softmax"))

loss = keras.losses.CategoricalCrossentropy()
optimizer = keras.optimizers.Nadam(learning_rate=0.001)
model.compile(loss=loss,optimizer=optimizer, metrics=["accuracy"])
model.summary()

### Training, evaluating, and saving the best model
Training the model with early stopping and checkpoint monitored by validation accuracy. Evaluation is done through classification report.
All experiments are added to a dataframe along with their accuracy and loss values.
Saving best model and the word2idx dict for deployment

In [None]:
callbacks = [keras.callbacks.EarlyStopping(patience=5,monitor="val_accuracy",mode="max",restore_best_weights=True),
             keras.callbacks.ModelCheckpoint(filepath='./models/model{val_accuracy:.3f}.h5',save_best_only=True, monitor="val_accuracy",mode="max")]

In [None]:
history = model.fit(train_data,validation_data=val_data,epochs=500,callbacks=callbacks)

scores = model.evaluate(test_X,test_y)
stats =pd.DataFrame({"summary":[[layer.name for layer in model.layers]],"loss":scores[0],"acc":scores[1]})
training_hist = pd.concat((training_hist, stats), axis=0)

preds = np.argmax(model.predict(test_X),axis=1)
truth = np.argmax(test_y,axis=1)
print(classification_report(truth,preds))
training_hist.tail()

In [10]:
with open("../models/word2idx.pickle","wb") as handle:
    pickle.dump(word2idx,handle,protocol=pickle.HIGHEST_PROTOCOL)