In [1]:
import os
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
from keras.models import load_model

In [3]:
df_a = pd.read_csv("./df_y4s2_final_new.csv")

In [4]:
df_a = df_a.dropna(subset='mentioned_location')

In [5]:
df_a['relevant'].value_counts()

relevant
False    1536
True      271
Name: count, dtype: int64

In [6]:
num_drop = int(df_a[df_a['relevant'] == False].shape[0]) - int(df_a[df_a['relevant'] == True].shape[0])
np.random.seed(int(time.time()))
false_rows = df_a[df_a['relevant'] == False]
rows_to_keep = false_rows.sample(n = 271, random_state = 42)

df_dropped = df_a[df_a['relevant'] != False]
df_a = pd.concat([df_dropped,rows_to_keep])
df_a['relevant'].value_counts()

relevant
True     271
False    271
Name: count, dtype: int64

In [7]:
df_a['word_length'] = df_a['translated_text'].str.len()

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natkn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# sent_tokenize(data)
# nltk.download('punkt')

for index, row in df_a.iterrows():
    text = row['translated_text']
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    processed_text = " ".join(filtered_words)  # Join the remaining words back into a single string
    df_a.at[index, 'processed_text'] = processed_text  # Store the processed text in a new column

In [10]:

# Assuming df_a is your DataFrame and 'translated_text' is the column name
stop_words = set(stopwords.words("english"))
stop_words.add('http')


# Iterate over each row and process the text
for index, row in df_a.iterrows():
    text = row['translated_text']
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    processed_text = " ".join(filtered_words)  # Join the remaining words back into a single string
    df_a.at[index, 'processed_text'] = processed_text  # Store the processed text in a new column

# Now df_a contains a new column 'processed_text' with the processed text

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
df_a["processed_text"] = df_a["processed_text"].apply(lambda x: stem_words(x))

print(df_a['processed_text'])

0       use walk bt wat phra si laksi red line. distan...
1       walk central ladprao phahon yothin 34, bad. we...
2       use walk huai khwang sutthisan. summary, walk ...
3       korea, walk hard, matter much spent, gain weig...
4       banthat thong would worth walk around -make ne...
                              ...                        
924     best thailand!!!!!!!! good sidewalk, demolishe...
1664    sidewalk terribl condit weather terrible. actu...
1211            usual seat footpath front suea pa stadium
559     #pleas share. hawker watthana district humble,...
405     rt @chocolixirmania silom, saladaeng, flood fo...
Name: processed_text, Length: 542, dtype: object


In [11]:
df_a['word_length'] = df_a['processed_text'].str.len()
df_a['word_length'].sort_values(ascending=False)

2775    462
3584    386
3134    356
1664    354
3443    347
       ... 
118      37
182      34
286      34
562      31
431      27
Name: word_length, Length: 542, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.layers import Embedding, SpatialDropout1D, Dense, Bidirectional, Flatten, LSTM
from keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from keras.optimizers import Adam

MAX_WORDS = 3500  # Memorized words
MAX_SEQUENCE_LENGTH = 132
EMBEDDING_DIM = 100
epochs = 50
batch_size = 64

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
#tokenizer.fit_on_texts(df_a.processed_text.values)
#word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(df_a['processed_text'].values)
X = tf.keras.utils.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
#Y = df_a['relevant'].values
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df_a['relevant'].values)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

In [13]:
def recall_m(y_true, y_pred):
    y_true = K.cast(y_true, K.floatx())  # Cast y_true to the same data type as y_pred
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    y_true = K.cast(y_true, K.floatx())  # Cast y_true to the same data type as y_pred
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [14]:
import pandas as pd
import math
import keras
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import save_model
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from keras_tuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
from pickle import dump,load
import warnings
warnings.simplefilter("ignore", UserWarning)

  from kerastuner.engine.hyperparameters import HyperParameters


In [15]:
def build_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Int('input_unit', min_value=32, max_value=512, step=32), return_sequences=True, input_shape=(X_train.shape[1], 1)))
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units', min_value=32, max_value=512, step=32), return_sequences=True))
    model.add(LSTM(hp.Int('layer_2_neurons', min_value=32, max_value=512, step=32)))
    model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['f1'])
    return model


In [19]:
from kerastuner import Objective
from kerastuner.tuners import GridSearch

tuner = GridSearch(
    build_model,
    objective=Objective('f1', direction='max'),
    max_trials=2,
    directory='grid_search',
    project_name='my_grid_search'
)

tuner.search(
    x=X_train,
    y=Y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, Y_test)
)

Reloading Tuner from grid_search\my_grid_search\tuner0.json


# Grid Search

In [20]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

# Function to create LSTM model
def create_lstm_model(units=100, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units=units, input_shape=(X_train.shape[0], X_train.shape[1])))
    model.add(Dropout(0.2))  # Add dropout layer directly
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create KerasClassifier for use in scikit-learn GridSearchCV
lstm_classifier = KerasClassifier(build_fn=create_lstm_model, verbose=0)

# Get the available parameters
params = lstm_classifier.get_params().keys()
print("Available parameters:", params)

Available parameters: dict_keys(['model', 'build_fn', 'warm_start', 'random_state', 'optimizer', 'loss', 'metrics', 'batch_size', 'validation_batch_size', 'verbose', 'callbacks', 'validation_split', 'shuffle', 'run_eagerly', 'epochs', 'class_weight'])


In [21]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

# Function to create LSTM model
def create_lstm_model( optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units=100, input_shape=(X_train.shape[0], X_train.shape[1])))
    model.add(Dropout(0.2))  # Add dropout layer directly
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create KerasClassifier for use in scikit-learn GridSearchCV
lstm_classifier = KerasClassifier(build_fn=create_lstm_model, verbose=0)

# Define hyperparameters to tune
param_grid = {
    'optimizer': ['adam', 'rmsprop']
}

# Perform grid search
grid_search = GridSearchCV(estimator=lstm_classifier, param_grid=param_grid, cv=3)
grid_search_result = grid_search.fit(X_train, Y_train)

# Print best parameters and best score
print("Best Parameters: ", grid_search_result.best_params_)
print("Best Score: ", grid_search_result.best_score_)


ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_1" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(32, 132)


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(None, 132)


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_3" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(None, 132)


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(32, 132)


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_5" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(None, 132)


--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 760, in fit
    self._fit(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._fit_keras_model(
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\scikeras\wrappers.py", line 524, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\natkn\AppData\Local\Temp\__autograph_generated_fileuiam9_z2.py", line 15, in tf__train_function
    retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
ValueError: in user code:

    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\natkn\anaconda3\envs\gpt-twitter\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_6" is incompatible with the layer: expected shape=(None, 433, 132), found shape=(None, 132)



# Idiot GridSearch

In [18]:
Y_train.shape

(433,)

In [17]:
X_train.shape

(433, 132)

In [16]:
from sklearn.base import BaseEstimator
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.optimizers import Adam
from keras.models import Sequential
from sklearn.metrics import fbeta_score, make_scorer
from keras.layers import Embedding, Dense, SpatialDropout1D, Bidirectional, LSTM
import tensorflow.keras.backend as K

# Define your data-specific constants
MAX_WORDS = 3500
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 132

# Function to create model
def create_model(dropout_rate=0.1, LSTM_units=100, learning_rate=0.001):
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dense(64, activation='relu'))
    model.add(SpatialDropout1D(dropout_rate))
    model.add(Bidirectional(LSTM(units=LSTM_units, dropout=dropout_rate, recurrent_dropout=0.2)))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  # Replace 'f1_m' with 'accuracy' or define 'f1_m'
    return model

class CustomKerasClassifier(BaseEstimator):
    def __init__(self, dropout_rate=0.1, LSTM_units=100, learning_rate=0.001, **kwargs):
        self.dropout_rate = dropout_rate
        self.LSTM_units = LSTM_units
        self.learning_rate = learning_rate
        self.kwargs = kwargs

    def fit(self, X, Y):
        self.model = create_model(dropout_rate=self.dropout_rate, LSTM_units=self.LSTM_units,
                                  learning_rate=self.learning_rate, **self.kwargs)
        self.model.fit(X, Y, **self.kwargs)
        return self

    def predict(self, X):
        return self.model.predict(X)

# Create CustomKerasClassifier instance
model = CustomKerasClassifier()


# Define the grid search parameters
param_grid = {
    'dropout_rate': [0.1, 0.2, 0.3],
    'LSTM_units': [50, 100, 150],  # Uncomment if you want to search over LSTM_units
    'learning_rate': [0.01, 0.001, 0.0001]  # Uncomment if you want to search over learning_rate
}

f1 = make_scorer(fbeta_score, beta = 1)
# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring = f1 , cv=3, verbose=1)  # Change scoring to 'accuracy' or 'f1' as appropriate scoring=f1
grid_result = grid_search.fit(X_train, Y_train)

# Print results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Fitting 3 folds for each of 27 candidates, totalling 81 fits


KeyboardInterrupt: 

use this updated

In [None]:
from sklearn.base import BaseEstimator
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Embedding, Dense, SpatialDropout1D, Bidirectional, LSTM
import tensorflow.keras.backend as K
# Define your data-specific constants
MAX_WORDS = 3500
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 132

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Function to create model
def create_model(dropout_rate=0.1, LSTM_units=100, learning_rate=0.001):
    model = Sequential()
    model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dense(64, activation='relu'))
    model.add(Bidirectional(LSTM(units=LSTM_units, dropout=dropout_rate, recurrent_dropout=0.1)))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[f1])
    return model

class CustomKerasClassifier(BaseEstimator):
    def __init__(self, dropout_rate=0.1, LSTM_units=100, learning_rate=0.001, **kwargs):
        self.dropout_rate = dropout_rate
        self.LSTM_units = LSTM_units
        self.learning_rate = learning_rate
        self.kwargs = kwargs

    def fit(self, X, Y):
        self.model = create_model(dropout_rate=self.dropout_rate, LSTM_units=self.LSTM_units,
                                  learning_rate=self.learning_rate, **self.kwargs)
        self.model.fit(X, Y, **self.kwargs)
        return self

    def predict(self, X):
        return self.model.predict(X)

# Create CustomKerasClassifier instance
model = CustomKerasClassifier()

# Define the grid search parameters
param_grid = {
    'dropout_rate': [0.1, 0.08, 0.09],
    'LSTM_units': [50, 100, 150],  # Uncomment if you want to search over LSTM_units
    'learning_rate': [0.001, 0.0009, 0.0008]  # Uncomment if you want to search over learning_rate
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1)  # Change scoring to 'accuracy'
grid_result = grid_search.fit(X_train, Y_train)

# Print results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))