In [91]:
from IPython.core.debugger import set_trace
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import re
import string
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
plt.style.use(style="seaborn")
%matplotlib inline

In [92]:
path_csv_file = "datas-zemberek.csv"

df = pd.read_csv(path_csv_file)
df.head()

Unnamed: 0,ictihat,NUM_LABEL
0,hüküm hüküm açık geri temyiz istek ret yerel m...,0
1,hüküm hüküm açık geri temyiz istek ret yerel m...,1
2,hüküm hüküm açık geri temyiz istek ret yerel m...,2
3,hüküm beraat yerel mahkeme bozmak üzerine verm...,1
4,hüküm beraat yerel mahkeme bozmak üzerine verm...,0


In [93]:
df.isnull().sum()

ictihat      0
NUM_LABEL    0
dtype: int64

In [94]:
X = df.ictihat
Y = df.NUM_LABEL
X.shape

(19269,)

In [95]:
Y.shape

(19269,)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=123)

In [97]:
freq_df = X_train.str.split(expand=True).stack().value_counts().reset_index()
 
freq_df.columns = ['Word', 'Frequency'] 
 
freq_df

Unnamed: 0,Word,Frequency
0,hüküm,69443
1,madde,64506
2,sayılı,55694
3,sanık,51577
4,temyiz,51276
...,...,...
7046,Rapid,1
7047,prepisium,1
7048,hemostazı,1
7049,sirkumsizyon,1


In [98]:
train_v_size = len(freq_df)
train_v_size

7051

In [99]:
#fit tokenizer on training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
#get train sequences
train_seqs = tokenizer.texts_to_sequences(X_train)
train_seqs_max_size = max([len(seq) for seq in train_seqs])
#get test sequences
test_seqs = tokenizer.texts_to_sequences(X_test)
test_seqs_max_size = max([len(seq) for seq in test_seqs])

In [100]:
train_padded = pad_sequences(train_seqs, maxlen=train_seqs_max_size, padding="post", truncating="post")
test_padded = pad_sequences(test_seqs, maxlen=train_seqs_max_size, padding="post", truncating="post")

In [101]:
train_seqs_max_size, test_seqs_max_size

(2020, 1518)

In [102]:
X_train_tokenized = [[word for word in document.split()] for document in X_train]
X_train_tokenized

[['i̇lk',
  'derece',
  'mahkeme',
  'Gaziantep',
  'ağır',
  'ceza',
  'mahkeme',
  'tarih',
  'sayılı',
  'karar',
  'hüküm',
  'sayılı',
  'Kanu',
  'madde',
  'uyarınca',
  'mahkûmiyet',
  'karar',
  'yönelik',
  'istinaf',
  'başvuru',
  'düzeltmek',
  'esas',
  'ret',
  'bölge',
  'adliye',
  'mahkeme',
  'vermek',
  'hüküm',
  'temyiz',
  'temyiz',
  'ede',
  'başvuru',
  'kararmak',
  'nitelik',
  'temyiz',
  'sebep',
  'göre',
  'dosya',
  'gerek',
  'sanık',
  'üzerine',
  'Atıl',
  'silâh',
  'örgüt',
  'suç',
  'suç',
  'sayılı',
  'Türk',
  'ceza',
  'kânun',
  'madde',
  'öngörmek',
  'hapis',
  'ceza',
  'alt',
  'sınır',
  'nazar',
  'sayılı',
  'ceza',
  'muhakeme',
  'kânun',
  'madde',
  'uyarınca',
  'sanık',
  'baro',
  'zorunlu',
  'Müdafii',
  'tayin',
  'etmek',
  'gerekmek',
  'Kanu',
  'sanık',
  'temyiz',
  'itiraz',
  'itibar',
  'yerinde',
  'görmek',
  'sair',
  'yön',
  'ince',
  'hüküm',
  'öncelikle',
  'sebep',
  'dolayı',
  'tarih',
  'yürürlük',
  'g

In [None]:
from gensim.models import Word2Vec, FastText
word_model = Word2Vec(X_train_tokenized, vector_size=100)

#build matrix 
embedding_matrix_w2v = np.random.random(((train_v_size) + 1, 100))
for word,i in tokenizer.word_index.items():  
    try:
        embedding_matrix_w2v[i] = word_model.wv[word]
    except:
        pass

# create layer
embedding_layer_w2v = Embedding((train_v_size) + 1, output_dim=100, 
                            weights=[embedding_matrix_w2v], trainable=True)

In [None]:
ft = FastText(vector_size=300)
ft.build_vocab(X_train_tokenized)
ft.train(tokenizer.word_index, total_examples=ft.corpus_count, epochs=10)

# build matrix
embedding_matrix_ft = np.random.random(((train_v_size) + 1, ft.vector_size))
for word,i in tokenizer.word_index.items(): 
    try:
        embedding_matrix_ft[i] = ft.wv[word]
    except:
        pass

# create layer
embedding_layer_ft = Embedding((train_v_size) + 1, output_dim=300, 
                            weights=[embedding_matrix_ft], trainable=True)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from keras.initializers import Constant
from keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import L2

In [None]:
def lstm_model(embeddings, classification=True):
    model = Sequential()
    model.add(embeddings)
    model.add(LSTM(64, dropout=0.1))
    model.add(Dense(1, activation="sigmoid"))

    adam_opt = Adam(learning_rate=3e-4)
    if classification:
        model.compile(loss="binary_crossentropy", optimizer=adam_opt, metrics=["accuracy"])
    else: 
        model.compile(loss="mean_squared_error", optimizer=adam_opt, metrics=["mse"])
        
    return model 

def train_model(model, train_padded, test_padded, y_train, y_test):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit( train_padded, y_train, epochs=20,
                            validation_data=(test_padded, y_test), callbacks=[early_stopping])
    
    return history
    
    
def evaluate_model(model, test_padded, y_test):
    results = model.evaluate(test_padded, y_test, batch_size=128)
    return results 

def plot_history(history, loss):
    if loss:
        x, y = 'loss', 'val_loss'
        title = 'Loss Vs Val_loss'
        y_label = 'Loss'
    else: 
        x, y = 'accuracy', 'val_accuracy'
        title = 'Accuracy Vs VAl_accuracy'
        y_label = 'Accuracy'
        
    plt = history[[x, y]].plot(lw=2, colormap='jet', marker='.', markersize=10, title=title)
    plt.set_xlabel('Epochs')
    plt.set_ylabel(y_label)
    

In [74]:
model = lstm_model(embedding_layer_w2v)    
history = train_model(model, train_padded, test_padded, y_train, y_test)

Epoch 1/20

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x0000027912F81940>
Traceback (most recent call last):
  File "C:\Users\murat\AppData\Local\Programs\Python\Python39\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
w2v_lstm_status_history = pd.DataFrame(history.history)
w2v_lstm_status_history

In [None]:
evaluate_model(model, test_padded, y_test)

In [None]:
model = lstm_model(embedding_layer_ft)    
history = train_model(model, train_padded, test_padded, y_train, y_test)

In [None]:
ft_lstm_status_history = pd.DataFrame(history.history)
ft_lstm_status_history

In [None]:
evaluate_model(model, test_padded, y_test)