In [2]:
#load the encoded_tweets.csv file and store it as a df
import ast
import pandas as pd
import numpy as np

df = pd.read_csv("embeddings_as_string.csv")

print("\nDataFrame loaded from CSV:\n", df)
print("\nType of df['embedding'].iloc[0]:", type(df['embedding'].iloc[0]))  # <class 'str'>

df["embedding"] = df["embedding"].apply(
    lambda x: np.array(ast.literal_eval(x), dtype=np.float32)
)

print("\nAfter converting string back to NumPy array:\n", df)
print("\nType of df['embedding'].iloc[0]:", type(df['embedding'].iloc[0]))  # <class 'numpy.ndarray'>
print("dtype:", df['embedding'].iloc[0].dtype)  # float32


DataFrame loaded from CSV:
                                                embedding  sentiment_encoded
0      [-0.009687530808150768, -0.11519767343997955, ...                  0
1      [0.021411411464214325, -0.054183561354875565, ...                  0
2      [-0.04290156811475754, -0.0567781999707222, 0....                  0
3      [0.03292088955640793, 0.22727343440055847, 0.1...                  2
4      [0.0845080241560936, 0.3749231994152069, 0.003...                  1
...                                                  ...                ...
39821  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...                  1
39822  [-0.2560422420501709, 0.00734885036945343, -0....                  2
39823  [0.014292838983237743, 0.053937748074531555, -...                  2
39824  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...                  2
39825  [-0.07798190414905548, 0.015587124042212963, 0...                  2

[39826 rows x 2 columns]

Type of df['embedding'].iloc[0]:

In [5]:
df.head()

Unnamed: 0,embedding,sentiment_encoded
0,"[-0.009687531, -0.11519767, 0.0016011447, 0.06...",0
1,"[0.021411411, -0.05418356, -0.049089134, -0.20...",0
2,"[-0.042901568, -0.0567782, 0.06105573, 0.10129...",0
3,"[0.03292089, 0.22727343, 0.10493607, -0.464905...",2
4,"[0.084508024, 0.3749232, 0.0037533038, 0.09657...",1


In [3]:
df.dtypes

embedding            object
sentiment_encoded     int64
dtype: object

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset into train (70%) and temp (30%)
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Split the temp dataset into validation (20% of original) and test (10% of original)
val_df, test_df = train_test_split(temp_df, test_size=1/3, random_state=42)

# Print the sizes of the splits to verify
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 27878
Validation set size: 7965
Test set size: 3983


In [6]:
import numpy as np
import pandas as pd

def prepare_lstm_data(df, label_col='sentiment_encoded', embed_col='embedding'):
    """
    df:       DataFrame with at least 2 columns: [label_col, embed_col]
    label_col: name of the sentiment/label column
    embed_col: name of the embedding column (a numerical vector or numeric data)
    """
    # 1) Extract labels
    y = df[label_col].values  # shape -> (num_samples,)

    # 2) Extract numeric features (assuming 'embedding' column contains numeric vectors)
    #    If 'embedding' is already stored as a vector (list/np.array) per row, convert each row to np.array:
    X = np.array(df[embed_col].tolist())  # shape -> (num_samples, embedding_dim)

    # 3) Reshape to 3D for LSTM: (samples, timesteps=1, features=embedding_dim)
    #    If each row is just one “step” with that embedding:
    X = X.reshape((X.shape[0], 1, X.shape[1]))

    return X, y

# -------------------------------------------------------
# Example usage with train_df and val_df
# -------------------------------------------------------
import numpy as np
import pandas as pd

# Keras / TensorFlow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Sklearn for additional metrics
from sklearn.metrics import classification_report, confusion_matrix
X_train, y_train = prepare_lstm_data(train_df,
                                     label_col='sentiment_encoded',
                                     embed_col='embedding')

X_val, y_val = prepare_lstm_data(val_df,
                                 label_col='sentiment_encoded',
                                 embed_col='embedding')

print("X_train shape:", X_train.shape)  # (28000, 1, embedding_dim) for example
print("y_train shape:", y_train.shape)  # (28000,)

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)




X_train shape: (27878, 1, 200)
y_train shape: (27878,)
X_val shape: (7965, 1, 200)
y_val shape: (7965,)


In [7]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

def build_lstm_model(input_shape):
    model = Sequential()
    
    # First LSTM layer with L2 regularization
    model.add(LSTM(64, return_sequences=True, kernel_regularizer=regularizers.l2(1e-4), input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    # Second LSTM layer
    model.add(LSTM(32, return_sequences=False, kernel_regularizer=regularizers.l2(1e-4)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    # Dense output layer
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)))

    # Compile the model
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=1e-4),
        metrics=['accuracy']
    )
    return model

In [8]:
# Note: X_train.shape[1:] is (timesteps, features)
model = build_lstm_model(X_train.shape[1:])
model.summary()


  super().__init__(**kwargs)


In [None]:
import tensorflow as tf
import datetime
from tensorflow.keras.callbacks import TensorBoard

# Create a TensorBoard callback
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Fit the model with the TensorBoard callback
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=10,
    validation_data=(X_val, y_val),
    callbacks=[tensorboard_callback],
    verbose=1
)


Epoch 1/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3158 - loss: -11.7371 - val_accuracy: 0.3189 - val_loss: -12.9368
Epoch 2/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3150 - loss: -14.2482 - val_accuracy: 0.3043 - val_loss: -15.9401
Epoch 3/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3139 - loss: -15.9624 - val_accuracy: 0.2987 - val_loss: -17.8180
Epoch 4/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3100 - loss: -18.5431 - val_accuracy: 0.3075 - val_loss: -19.7066
Epoch 5/10
[1m872/872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3148 - loss: -21.0191 - val_accuracy: 0.3058 - val_loss: -21.9626
Epoch 6/10
[1m292/872[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1s[0m 2ms/step - accuracy: 0.3056 - loss: -23.4379

KeyboardInterrupt: 

In [None]:
!tensorboard --logdir logs/fit