# LSTM with Word2Vec

In [None]:
import os
# Set before the tensorflow import below so the setting is already in place
# when the library is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import tensorflow as tf
import gensim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import models, layers, optimizers
# `api` is used further down to download the pretrained Word2Vec vectors
# when no local copy exists yet.
import gensim.downloader as api

# Suppress "SettingWithCopyWarning"
# NOTE(review): this is a process-wide pandas option; it hides the warning for
# every later column assignment in the notebook, not only the ones below.
pd.options.mode.chained_assignment = None 

# https://medium.com/@claude.feldges/text-classification-with-tf-idf-lstm-bert-a-quantitative-comparison-b8409b556cb3

In [None]:
# Load the affiliation table. Later cells read its 'affiliation' (text) and
# 'city' (target) columns.
clean_spacy_mapaffil = pd.read_parquet(
    path="data/clean_spacy_mapaffil.parquet",
    engine="fastparquet",
)

In [None]:
# Number of leading rows of the affiliation table to work with. A later cell
# lowers this value once single-occurrence cities have been removed.
num_affiliations = 15_000

In [None]:
# Restrict the experiment to the first `num_affiliations` rows.
df = clean_spacy_mapaffil.head(num_affiliations)

# Cities that occur exactly once are dropped ahead of the stratified split
# performed later (stratification needs more than one row per class).
city_counts = df['city'].value_counts()
single_instance_cities = city_counts[city_counts == 1].index.tolist()

# Take an explicit copy: later cells overwrite 'city' and add 'label' on this
# frame, and writing into a boolean-mask slice of `df` is ambiguous in pandas
# (the SettingWithCopyWarning case).
filtered_df = df[~df['city'].isin(single_instance_cities)].copy()

# Keep the row counter in sync with the data that is actually left. This is
# the same value as subtracting len(single_instance_cities) whenever the table
# has at least `num_affiliations` rows, and stays correct when it has fewer.
# NOTE: re-running this cell without re-running the cell that sets
# `num_affiliations` shrinks the sample again, because head() reads the
# already-reduced value.
num_affiliations = len(filtered_df)

In [None]:
# Convert the city names to a categorical column once, then reuse it both as
# the stored 'city' column and as the source of the integer class codes.
city_categories = filtered_df['city'].astype('category')
filtered_df['city'] = city_categories
filtered_df['label'] = city_categories.cat.codes

In [None]:
# Model input is the affiliation text; the prediction target is the city.
X = filtered_df['affiliation']
y_class = filtered_df['city']

# Fit the binarizer on the city column and encode that same column in one
# step. `lab` is kept so predictions can be mapped back to city names.
lab = LabelBinarizer()
y = lab.fit_transform(y_class)

In [None]:
# Test fraction: distinct cities divided by the remaining row count,
# but never below 10%.
n_cities = filtered_df['city'].nunique()
calculated_test_size = n_cities / num_affiliations

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=max(calculated_test_size, 0.1),
    stratify=filtered_df['label'],  # stratify on the integer city codes
    random_state=42,
)

In [None]:
# Vocabulary budget passed to the tokenizer and fixed length of every
# padded sequence.
max_words = 5600
max_len = 40

# The vocabulary is learned from the training texts only. Note that the
# out-of-vocabulary token is spelled "<pad>" here.
tokenizer = kprocessing.text.Tokenizer(
    num_words=max_words,
    lower=True,
    split=' ',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    oov_token="<pad>",
)
tokenizer.fit_on_texts(X_train)

# word -> index mapping and its inverse (index -> word).
voc = tokenizer.word_index
reverse_voc = {index: word for word, index in voc.items()}

# Turn both splits into integer sequences padded/truncated to `max_len`.
sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_seq = kprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
X_test_seq = kprocessing.sequence.pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# Reuse a locally saved copy of the pretrained vectors when one exists;
# otherwise fetch them through the gensim downloader and save them to the
# working directory for next time.
w2v_cache_path = "word2vec-google-news-300.model"
if not os.path.exists(w2v_cache_path):
    w2v = api.load("word2vec-google-news-300")
    w2v.save(w2v_cache_path)
else:
    w2v = gensim.models.KeyedVectors.load(w2v_cache_path)

In [None]:
# Embedding lookup table: row i holds the Word2Vec vector of the word with
# tokenizer index i. Row 0 and the rows of words unknown to Word2Vec stay
# all-zero.
emb_matrix = np.zeros((max_words + 1, 300))

# Walk the fitted vocabulary instead of assuming indices 1..max_words all
# exist: when the training texts contain fewer than `max_words` distinct
# words, looking up a missing index in `reverse_voc` raised KeyError.
for word, index in voc.items():
    if 1 <= index <= max_words and word in w2v:
        emb_matrix[index, :] = w2v[word]

# Embedding dimensionality, read back from the matrix for the model cell.
emb_size = emb_matrix.shape[1]

In [None]:
# Network: frozen pretrained embedding -> LSTM -> dropout -> dense ->
# softmax over the cities.
sequence_shape = X_train_seq[0, :].shape
n_cities_out = len(filtered_df["city"].unique())

input_ = layers.Input(shape=sequence_shape, name='input')

# The embedding weights come from `emb_matrix` and are not trained.
embedding = layers.Embedding(
    max_words + 1,
    emb_size,
    weights=[emb_matrix],
    trainable=False,
    name='embedding',
)
x = embedding(input_)
x = layers.LSTM(units=15, dropout=0.2, name='lstm')(x)
x = layers.Dropout(rate=0.2, name='dropout')(x)
x = layers.Dense(units=64, activation='relu', name='dense')(x)
output = layers.Dense(
    units=n_cities_out,
    activation='softmax',
    name='classification',
)(x)

model = models.Model(inputs=input_, outputs=output)

opt = optimizers.Adam(learning_rate=0.01, beta_1=0.9)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in shuffled mini-batches of 64.
# NOTE(review): the held-out test split is passed as validation data, so the
# validation metrics in `history` are measured on the test set.
history = model.fit(
    x=X_train_seq,
    y=y_train,
    validation_data=(X_test_seq, y_test),
    epochs=10,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Report the validation accuracy recorded after the last training epoch.
final_val_accuracy = history.history['val_accuracy'][-1]
print('Accuracy: {:.1%}'.format(final_val_accuracy))