In [11]:
import os
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import load_model


In [12]:
df_a = pd.read_csv("./df_y4s2_final_new.csv")

In [13]:
df_a = df_a.dropna(subset='mentioned_location')

In [14]:
df_a['relevant'].value_counts()

relevant
False    1536
True      271
Name: count, dtype: int64

In [15]:
num_drop = int(df_a[df_a['relevant'] == False].shape[0]) - int(df_a[df_a['relevant'] == True].shape[0])
np.random.seed(int(time.time()))
false_rows = df_a[df_a['relevant'] == False]
rows_to_keep = false_rows.sample(n = 271, random_state = 42)

df_dropped = df_a[df_a['relevant'] != False]
df_a = pd.concat([df_dropped,rows_to_keep])
df_a['relevant'].value_counts()

relevant
True     271
False    271
Name: count, dtype: int64

In [16]:
df_a['word_length'] = df_a['translated_text'].str.len()

In [17]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natkn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from nltk.tokenize import word_tokenize

nltk.download('punkt')

for index, row in df_a.iterrows():
    text = row['translated_text']
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    processed_text = " ".join(filtered_words)  # Join the remaining words back into a single string
    df_a.at[index, 'processed_text'] = processed_text  # Store the processed text in a new column

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\natkn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Assuming df_a is your DataFrame and 'translated_text' is the column name
stop_words = set(stopwords.words("english"))
stop_words.add('http')


# Iterate over each row and process the text
for index, row in df_a.iterrows():
    text = row['translated_text']
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    processed_text = " ".join(filtered_words)  # Join the remaining words back into a single string
    df_a.at[index, 'processed_text'] = processed_text  # Store the processed text in a new column

# Now df_a contains a new column 'processed_text' with the processed text

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
df_a["processed_text"] = df_a["processed_text"].apply(lambda x: stem_words(x))

print(df_a['processed_text'])

0       use walk bt wat phra si laksi red line. distan...
1       walk central ladprao phahon yothin 34, bad. we...
2       use walk huai khwang sutthisan. summary, walk ...
3       korea, walk hard, matter much spent, gain weig...
4       banthat thong would worth walk around -make ne...
                              ...                        
924     best thailand!!!!!!!! good sidewalk, demolishe...
1664    sidewalk terribl condit weather terrible. actu...
1211            usual seat footpath front suea pa stadium
559     #pleas share. hawker watthana district humble,...
405     rt @chocolixirmania silom, saladaeng, flood fo...
Name: processed_text, Length: 542, dtype: object


In [12]:
df_a['word_length'] = df_a['processed_text'].str.len()
df_a['word_length'].sort_values(ascending=False)

2775    462
3584    386
3134    356
1664    354
3443    347
       ... 
118      37
182      34
286      34
562      31
431      27
Name: word_length, Length: 542, dtype: int64

In [17]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.layers import Embedding, SpatialDropout1D, Dense, Bidirectional, Flatten, LSTM
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras.optimizers import Adam

MAX_WORDS = 3500  # Memorized words
MAX_SEQUENCE_LENGTH = 462
EMBEDDING_DIM = 100
epochs = 50
batch_size = 32

tokenizer = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
#tokenizer.fit_on_texts(df_a.cleaned.values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df_a.processed_text.values)
X = tf.keras.utils.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
#Y = df_a['relevance_nlp'].values
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df_a['relevant'].values)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

#Grid Search

In [18]:
from sklearn.model_selection import GridSearchCV
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Embedding, Dense, SpatialDropout1D, Bidirectional, LSTM
from keras.backend import epsilon


# Function to create model
def create_model(dropout_rate=0.1, lstm_units=100, learning_rate=0.001):
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dense(64, activation='relu'))
    model.add(SpatialDropout1D(dropout_rate))
    model.add(Bidirectional(LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=0.2)))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[f1_m])
    return model

# Create KerasClassifier
model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, verbose=0)

# Define the grid search parameters
param_grid = {
    'dropout_rate': [0.1, 0.2, 0.3],
    'lstm_units': [50, 100, 150],
    'learning_rate': [0.001, 0.01, 0.1]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1', verbose=1)
grid_result = grid_search.fit(X_train, Y_train)

# Print results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


  model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, verbose=0)


Fitting 3 folds for each of 27 candidates, totalling 81 fits


ValueError: 
All the 81 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\wrappers\scikit_learn.py", line 248, in fit
    return super().fit(x, y, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\wrappers\scikit_learn.py", line 175, in fit
    history = self.model.fit(x, y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_filerwdwrbzw.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_file3918zo5b.py", line 10, in tf__f1_m
    precision = ag__.converted_call(ag__.ld(precision_m), (ag__.ld(y_true), ag__.ld(y_pred)), None, fscope)
NameError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\natkn\AppData\Local\Temp\ipykernel_21512\2042237453.py", line 8, in f1_m  *
        precision = precision_m(y_true, y_pred)  # Define your precision_m function

    NameError: name 'precision_m' is not defined

