In [None]:
from IPython.core.debugger import set_trace
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import re
import string
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
# Matplotlib 3.6 renamed its bundled "seaborn" style to "seaborn-v0_8" and later
# releases dropped the old name; use whichever this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
%matplotlib inline

In [None]:
# Read the CSV (path is relative to the working directory) into a DataFrame
# and preview its first rows.
path_csv_file = "latest_hukums_with_classes_csv_file1.csv"
df = pd.read_csv(path_csv_file)
df.head()

In [None]:
# Give every distinct YeniSuclar value an integer id, numbered in order of
# first appearance, and store the id of each row in NUM_LABEL.
label_map = {label: idx for idx, label in enumerate(df.YeniSuclar.unique())}
count = len(label_map)  # number of distinct labels
df['NUM_LABEL'] = df.YeniSuclar.map(label_map)
df.head()

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(how='any',axis=0)

In [None]:
# Per-column count of missing values still present in df.
df.isnull().sum()

In [None]:
# Turkish casing differs from the default Unicode mapping: "I" lowers to the
# dotless "ı" (U+0131) and "İ" (U+0130) lowers to a plain "i". str.lower() alone
# turns "I" into "i" and "İ" into "i" followed by a combining dot (U+0307).
_TR_LOWER_TABLE = str.maketrans({"I": "\u0131", "\u0130": "i"})
_TR_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def _turkish_stopwords():
    """Return the NLTK Turkish stop-word set, loading it on the first call only."""
    global _TR_STOPWORDS
    if _TR_STOPWORDS is None:
        _TR_STOPWORDS = frozenset(stopwords.words("turkish"))
    return _TR_STOPWORDS


def clean_text(text, stop_words=None):
    """Split ``text`` on whitespace and return its cleaned words as a list.

    A word is kept only if it is purely alphabetic (``str.isalpha``); kept words
    are lower-cased with Turkish rules for I/İ and then filtered against the
    stop-word collection.

    Args:
        text: String to clean.
        stop_words: Optional collection of lower-case words to remove. When
            ``None`` (the default) the NLTK Turkish stop-word list is used.

    Returns:
        List of lower-cased, alphabetic, non-stop-word tokens in original order.
    """
    if stop_words is None:
        stop_words = _turkish_stopwords()
    lowered = (
        word.translate(_TR_LOWER_TABLE).lower()
        for word in text.split()
        if word.isalpha()
    )
    return [word for word in lowered if word not in stop_words]

In [None]:
# Clean the text held in the third column (position 2) and re-join the kept
# words with single spaces.
# NOTE(review): presumably position 2 is the 'ictihat' column selected below;
# confirm, since this depends on the CSV's column order.
df.iloc[:, 2] = df.iloc[:, 2].apply(lambda x: " ".join(clean_text(x)))
# Keep only the text column and its integer label.
df = df[['ictihat', 'NUM_LABEL']]
df

In [None]:
# Features are the 'ictihat' text column; targets are the integer NUM_LABEL ids.
X = df.ictihat
Y = df.NUM_LABEL
X.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no stratify= is passed, so class proportions may differ between the splits.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=123)

In [None]:
# Count how often each whitespace-separated word occurs in the training texts.
# explode() yields one row per word directly, avoiding the dense
# (documents x longest-document) frame that split(expand=True) would allocate.
freq_df = X_train.str.split().explode().value_counts().reset_index()

freq_df.columns = ['Word', 'Frequency']

freq_df

In [None]:
# Number of distinct words counted in the training texts (rows of freq_df).
train_v_size = len(freq_df)
train_v_size

In [None]:
# Build the word index from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer ids.
train_seqs = tokenizer.texts_to_sequences(X_train)
test_seqs = tokenizer.texts_to_sequences(X_test)

# Length of the longest encoded document in each split.
train_seqs_max_size = max(map(len, train_seqs))
test_seqs_max_size = max(map(len, test_seqs))

In [None]:
# Both splits are padded/truncated at the end to the longest training sequence.
pad_options = dict(maxlen=train_seqs_max_size, padding="post", truncating="post")
train_padded = pad_sequences(train_seqs, **pad_options)
test_padded = pad_sequences(test_seqs, **pad_options)

In [None]:
# Longest encoded sequence in the training and test splits, respectively.
train_seqs_max_size, test_seqs_max_size

In [None]:
# Whitespace-tokenise every training document; the gensim models below take
# the corpus as a list of token lists.
X_train_tokenized = [document.split() for document in X_train]
# Preview a few documents only; echoing the whole corpus floods the cell output.
X_train_tokenized[:3]

In [None]:
from gensim.models import Word2Vec, FastText
word_model = Word2Vec(X_train_tokenized, vector_size=100)

# build matrix
# Size the matrix from the tokenizer itself so every id it can emit has a row;
# the extra row is index 0, which the tokenizer never assigns to a word.
w2v_vocab_size = len(tokenizer.word_index) + 1
embedding_matrix_w2v = np.random.random((w2v_vocab_size, word_model.vector_size))
for word, i in tokenizer.word_index.items():
    # Words the Word2Vec model holds no vector for keep their random row.
    # An explicit membership test (instead of a bare except) lets genuine
    # errors such as an out-of-range index surface.
    if word in word_model.wv:
        embedding_matrix_w2v[i] = word_model.wv[word]

# create layer
embedding_layer_w2v = Embedding(w2v_vocab_size, output_dim=word_model.vector_size,
                            weights=[embedding_matrix_w2v], trainable=True)

In [None]:
ft = FastText(vector_size=300)
ft.build_vocab(X_train_tokenized)
# Train on the same tokenised documents the vocabulary was built from.
# tokenizer.word_index is a dict of words; iterating it would hand gensim each
# word as a "sentence" made of single characters.
ft.train(X_train_tokenized, total_examples=ft.corpus_count, epochs=10)

# build matrix
# Size the matrix from the tokenizer itself so every id it can emit has a row;
# the extra row is index 0, which the tokenizer never assigns to a word.
ft_vocab_size = len(tokenizer.word_index) + 1
embedding_matrix_ft = np.random.random((ft_vocab_size, ft.vector_size))
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix_ft[i] = ft.wv[word]
    except KeyError:
        # No vector available for this word: keep its random row. Only the
        # lookup failure is tolerated; any other error is allowed to surface.
        pass

# create layer
embedding_layer_ft = Embedding(ft_vocab_size, output_dim=ft.vector_size,
                            weights=[embedding_matrix_ft], trainable=True)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from keras.initializers import Constant
from keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import L2

In [None]:
def lstm_model(embeddings, classification=True, num_classes=2):
    """Build and compile an Embedding -> LSTM(64) -> Dense model.

    Args:
        embeddings: Embedding layer used as the first layer of the model.
        classification: If True, compile with a cross-entropy loss and track
            accuracy; otherwise compile with mean squared error.
        num_classes: Number of target classes when ``classification`` is True.
            With the default of 2 the head is a single sigmoid unit trained with
            binary cross-entropy (labels must be 0/1). For more than 2 classes
            the head is a ``num_classes``-way softmax trained with sparse
            categorical cross-entropy, which accepts integer class ids.

    Returns:
        The compiled Sequential model.
    """
    multiclass = classification and num_classes > 2

    model = Sequential()
    model.add(embeddings)
    model.add(LSTM(64, dropout=0.1))
    if multiclass:
        model.add(Dense(num_classes, activation="softmax"))
    else:
        # NOTE(review): the sigmoid also bounds regression outputs to (0, 1);
        # confirm the regression targets are scaled to that range.
        model.add(Dense(1, activation="sigmoid"))

    adam_opt = Adam(learning_rate=3e-4)
    if multiclass:
        model.compile(loss="sparse_categorical_crossentropy", optimizer=adam_opt, metrics=["accuracy"])
    elif classification:
        model.compile(loss="binary_crossentropy", optimizer=adam_opt, metrics=["accuracy"])
    else:
        model.compile(loss="mean_squared_error", optimizer=adam_opt, metrics=["mse"])

    return model

def train_model(model, train_padded, test_padded, y_train, y_test, epochs=20, patience=3):
    """Fit ``model`` with early stopping on validation loss and return the fit history.

    Args:
        model: Compiled model exposing ``fit``.
        train_padded: Training inputs.
        test_padded: Inputs used as validation data during fitting.
        y_train: Training targets.
        y_test: Targets paired with ``test_padded``.
        epochs: Maximum number of epochs (default 20).
        patience: Epochs without ``val_loss`` improvement before stopping
            (default 3); the best weights are restored afterwards.

    Returns:
        The object returned by ``model.fit``.
    """
    # NOTE(review): the held-out set doubles as validation data, so the stopping
    # point and restored weights are chosen on the same data that is later used
    # for evaluation; consider carving a validation split out of the training set.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(train_padded, y_train, epochs=epochs,
                        validation_data=(test_padded, y_test), callbacks=[early_stopping])

    return history
    
    
def evaluate_model(model, test_padded, y_test, batch_size=128):
    """Evaluate ``model`` on the given inputs/targets and return the result.

    Args:
        model: Object exposing ``evaluate(x, y, batch_size=...)``.
        test_padded: Evaluation inputs.
        y_test: Evaluation targets.
        batch_size: Batch size forwarded to ``model.evaluate`` (default 128).

    Returns:
        Whatever ``model.evaluate`` returns.
    """
    return model.evaluate(test_padded, y_test, batch_size=batch_size)


        
    

In [None]:
# Build the LSTM on top of the Word2Vec-initialised embedding layer and train it.
# NOTE(review): with default arguments lstm_model ends in a single sigmoid unit
# with binary cross-entropy, while NUM_LABEL has one id per distinct YeniSuclar
# value; confirm the data has exactly two classes.
model = lstm_model(embedding_layer_w2v)    
history = train_model(model, train_padded, test_padded, y_train, y_test)

In [None]:
# Tabulate the metrics recorded in history.history for the Word2Vec run.
w2v_lstm_status_history = pd.DataFrame(history.history)
w2v_lstm_status_history

In [None]:
# Evaluate the Word2Vec-based model on the padded test sequences.
evaluate_model(model, test_padded, y_test)

In [None]:
# Build and train a second LSTM, this time on the FastText-initialised embedding layer.
# Note: this rebinds `model` and `history`, replacing the Word2Vec run's objects.
model = lstm_model(embedding_layer_ft)    
history = train_model(model, train_padded, test_padded, y_train, y_test)

Epoch 1/20

In [None]:
# Tabulate the metrics recorded in history.history for the FastText run.
ft_lstm_status_history = pd.DataFrame(history.history)
ft_lstm_status_history