In [None]:
# Mount Google Drive at /content/drive so the project files stored there can
# be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Use an absolute path rooted at the Drive mount point ('/content/drive').
# A path relative to the current working directory only resolves the first
# time this cell runs; once the working directory has been changed, running
# the cell again would raise FileNotFoundError.
# NOTE(review): assumes the original relative path was resolved from
# /content, i.e. the parent of the mount point used above.
DATA_DIR = '/content/drive/MyDrive/CIS 5300 - Final Project/Milestone 2/Data'
os.chdir(DATA_DIR)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the three pre-split CSV files (train / dev / test, in that order) and
# drop the 'content_category' column from each of them.
train_df, dev_df, test_df = (
    pd.read_csv(csv_name).drop(columns=['content_category'])
    for csv_name in (
        'train_data_with_features.csv',
        'dev_data_with_features.csv',
        'test_data_with_features.csv',
    )
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For every split, the 'text' column is the model input and the 'generated'
# column is the target label.
X_train, y_train = train_df['text'], train_df['generated']
X_dev, y_dev = dev_df['text'], dev_df['generated']
X_test, y_test = test_df['text'], test_df['generated']

print("Separated text data (X) and labels (y) for training, development, and test sets.")

Separated text data (X) and labels (y) for training, development, and test sets.


In [None]:
# Upper bound on the token indices produced when encoding text; the same
# value is passed as the Embedding layer's input_dim further down.
max_words = 20000
# Placeholder token that stands in for any word outside the kept vocabulary.
oov_tok = '<oov>'

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

print(f"Tokenizer initialized with a max vocabulary of {max_words} words and OOV token: '{oov_tok}'.")
# word_index lists every distinct token seen while fitting; the num_words cap
# is applied only when texts are converted to sequences, so this count can be
# far larger than max_words.
print(f"Vocabulary built. Total words in vocabulary: {len(tokenizer.word_index)}.")
# Report the effective vocabulary so the two numbers above are not confused:
# indices 1 .. max_words - 1 survive encoding (index 1 is the OOV token).
print(
    f"Only the {min(len(tokenizer.word_index), max_words - 1)} most frequent tokens "
    f"(OOV token included) are kept when encoding; all others map to '{oov_tok}'."
)

Tokenizer initialized with a max vocabulary of 20000 words and OOV token: '<oov>'.
Vocabulary built. Total words in vocabulary: 268477.


In [None]:
# Encode each split with the fitted tokenizer, in train / dev / test order.
X_train_sequences, X_dev_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_dev, X_test)
)

print("Text data converted to numerical sequences for training, development, and test sets.")

Text data converted to numerical sequences for training, development, and test sets.


To choose a maximum sequence length for padding, we analyze the distribution of sequence lengths in the training data (`X_train_sequences`). We compute the mean, the median, and the 90th, 95th, and 99th percentiles, then set `max_len` to the 95th percentile so that most texts are kept in full while excessively long sequences are truncated.



In [None]:
import numpy as np

# Number of tokens in every encoded training document.
train_sequence_lengths = list(map(len, X_train_sequences))

# Central tendency of the length distribution.
mean_len = np.mean(train_sequence_lengths)
median_len = np.median(train_sequence_lengths)
# Upper-tail percentiles, computed in a single call.
p90_len, p95_len, p99_len = np.percentile(train_sequence_lengths, [90, 95, 99])

print(f"Mean sequence length: {mean_len:.2f}")
print(f"Median sequence length: {median_len:.2f}")
print(f"90th percentile sequence length: {p90_len:.2f}")
print(f"95th percentile sequence length: {p95_len:.2f}")
print(f"99th percentile sequence length: {p99_len:.2f}")

# Pad/truncate to the 95th percentile length (truncated to an integer).
max_len = int(p95_len)
print(f"Chosen max_len for padding: {max_len}")

Mean sequence length: 394.21
Median sequence length: 363.00
90th percentile sequence length: 618.00
95th percentile sequence length: 725.00
99th percentile sequence length: 960.00
Chosen max_len for padding: 725


In [None]:
# Bring every split to exactly max_len tokens: shorter sequences get padding
# appended at the end, longer ones are cut at the end.
X_train_padded, X_dev_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_sequences, X_dev_sequences, X_test_sequences)
)

print(f"Padded X_train_sequences shape: {X_train_padded.shape}")
print(f"Padded X_dev_sequences shape: {X_dev_padded.shape}")
print(f"Padded X_test_sequences shape: {X_test_padded.shape}")

Padded X_train_sequences shape: (389788, 725)
Padded X_dev_sequences shape: (48723, 725)
Padded X_test_sequences shape: (48724, 725)


In [None]:
import numpy as np

# Convert the label columns of all three splits to NumPy arrays.
y_train, y_dev, y_test = map(np.array, (y_train, y_dev, y_test))

print(f"Converted y_train to NumPy array with shape: {y_train.shape}")
print(f"Converted y_dev to NumPy array with shape: {y_dev.shape}")
print(f"Converted y_test to NumPy array with shape: {y_test.shape}")

Converted y_train to NumPy array with shape: (389788,)
Converted y_dev to NumPy array with shape: (48723,)
Converted y_test to NumPy array with shape: (48724,)


In [None]:
# Keras building blocks for the Bi-LSTM text classifier defined below.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, SpatialDropout1D
# NOTE(review): Adam is imported here, but the compile() calls visible in this
# notebook pass the optimizer as the string 'adam' instead of an Adam instance.
from tensorflow.keras.optimizers import Adam

print("Keras layers and optimizers imported successfully.")

Keras layers and optimizers imported successfully.


In [None]:
# Size of each learned token embedding vector; also used by the tuning loop.
embedding_dim = 128

# Assemble the network one layer at a time:
# embedding -> spatial dropout -> bidirectional LSTM -> dropout -> sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))
model.add(SpatialDropout1D(0.2))
# return_sequences=False: only the final LSTM output is passed on.
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Build with the padded sequence length so summary() can report shapes.
model.build(input_shape=(None, max_len))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Bi-LSTM model created and compiled successfully.")
model.summary()

Bi-LSTM model created and compiled successfully.


In [None]:
# Candidate configurations for the tuning loop. Each entry carries the LSTM
# width, the two dropout rates, the batch size and the number of epochs.
hyperparameters = [
    dict(lstm_units=64, spatial_dropout_rate=0.2, dropout_rate=0.2, batch_size=128, epochs=3),
    dict(lstm_units=128, spatial_dropout_rate=0.1, dropout_rate=0.1, batch_size=256, epochs=3),
    dict(lstm_units=128, spatial_dropout_rate=0.2, dropout_rate=0.3, batch_size=64, epochs=3),
]

# Filled by the tuning loop with one metrics record per configuration.
results = list()

print("Hyperparameter combinations defined and results list initialized.")

Hyperparameter combinations defined and results list initialized.


In [None]:
# Train one Bi-LSTM per hyperparameter set and record its metrics on the
# development split.
from tensorflow.keras import backend as K
from tensorflow.keras.utils import set_random_seed

# Every configuration is seeded identically so the sets are compared from the
# same random starting state and a re-run reproduces the same initialisation.
SEED = 42

# Start from an empty list: without this, re-running the cell would append a
# second copy of every record and shift the "Set i" numbering printed below.
results = []

for i, params in enumerate(hyperparameters):
    print(f"\n--- Training with Hyperparameter Set {i+1} ---")
    print(params)

    # Release the Keras global state left behind by the previously built
    # model before creating the next one, then reseed.
    K.clear_session()
    set_random_seed(SEED)

    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim),
        SpatialDropout1D(params['spatial_dropout_rate']),
        Bidirectional(LSTM(params['lstm_units'], return_sequences=False)),
        Dropout(params['dropout_rate']),
        Dense(1, activation='sigmoid')
    ])

    model.build(input_shape=(None, max_len))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    history = model.fit(
        X_train_padded, y_train,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_data=(X_dev_padded, y_dev),
        verbose=1
    )

    # The single sigmoid unit yields one probability per row, shaped (n, 1);
    # threshold at 0.5 and flatten so predictions line up with the 1-D labels.
    y_pred_probs = model.predict(X_dev_padded)
    y_pred = (y_pred_probs > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_dev, y_pred)
    precision = precision_score(y_dev, y_pred)
    recall = recall_score(y_dev, y_pred)
    f1 = f1_score(y_dev, y_pred)

    # Keep the per-epoch history alongside the final dev metrics.
    results.append({
        'hyperparameters': params,
        'dev_accuracy': accuracy,
        'dev_precision': precision,
        'dev_recall': recall,
        'dev_f1_score': f1,
        'training_history': history.history
    })

print("\nHyperparameter tuning complete. Results:")
for i, res in enumerate(results):
    print(f"Set {i+1}: Accuracy={res['dev_accuracy']:.4f}, F1-score={res['dev_f1_score']:.4f}")


--- Training with Hyperparameter Set 1 ---
{'lstm_units': 64, 'spatial_dropout_rate': 0.2, 'dropout_rate': 0.2, 'batch_size': 128, 'epochs': 1}
[1m3046/3046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9537s[0m 3s/step - accuracy: 0.9562 - loss: 0.1221 - val_accuracy: 0.9801 - val_loss: 0.0665
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 214ms/step

Hyperparameter tuning complete. Results:
Set 1: Accuracy=0.9801, F1-score=0.9734


In [None]:
# Pick the record with the highest development F1-score. On ties the earliest
# record wins; with no records the result is None and best_f1 stays at -1.
best_model_result = max(results, key=lambda entry: entry['dev_f1_score'], default=None)
best_f1 = best_model_result['dev_f1_score'] if best_model_result else -1

if not best_model_result:
    print("No results found for hyperparameter tuning.")
else:
    print("\n--- Best Hyperparameter Set ---")
    print(f"Hyperparameters: {best_model_result['hyperparameters']}")
    print(f"Development Accuracy: {best_model_result['dev_accuracy']:.4f}")
    print(f"Development Precision: {best_model_result['dev_precision']:.4f}")
    print(f"Development Recall: {best_model_result['dev_recall']:.4f}")
    print(f"Development F1-score: {best_model_result['dev_f1_score']:.4f}")


--- Best Hyperparameter Set ---
Hyperparameters: {'lstm_units': 64, 'spatial_dropout_rate': 0.2, 'dropout_rate': 0.2, 'batch_size': 128, 'epochs': 1}
Development Accuracy: 0.9801
Development Precision: 0.9700
Development Recall: 0.9768
Development F1-score: 0.9734
