In [2]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (

In [29]:
import numpy as np
import re
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import datetime
from gensim.models import KeyedVectors
from nltk.corpus import stopwords                   #Stopwords corpus
nltk.download('stopwords')
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/imanandrea/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load dataset
df = pd.read_csv("/mnt/c/Users/imana/Desktop/Masters/Foundations of Artificial Intelligence - AI701/AI_AES_project/dataset/processed_essays.csv")  # Update this with the path to your CSV file

# Text Preprocessing
def preprocess_text_simple(text):
    text = text.lower()
    text = re.sub(r'\b\w\b', '', text)  # Remove single characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    return text.strip()


def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)

def essay_to_sentences(essay_v, remove_stopwords):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(essay_v.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    try:
        index2word_set = set(model.wv.index_to_key)
    except(AttributeError):
        index2word_set = set(model.index_to_key)

    for word in words:
        if word in index2word_set:
            num_words += 1
            try:
                featureVec = np.add(featureVec,model.wv.get_vector(word))
            except(AttributeError):
                featureVec = np.add(featureVec,model.get_vector(word))

    featureVec = np.divide(featureVec,num_words)
    return featureVec


def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs


In [3]:
df['processed_text'] = df['essay'].apply(preprocess_text_simple)

# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
df = df.drop(['label', 'essay'], axis=1)

print(df)  # Should show only 0s and 1s

                                         processed_text  label_encoded
0     dear local newspaper, think effects computers ...              0
1     dear @caps1 @caps2, believe that using compute...              0
2     dear, @caps1 @caps2 @caps3 more and more peopl...              0
3     dear local newspaper, @caps1 have found that m...              0
4     dear @location1, know having computers has pos...              0
...                                                 ...            ...
8871  the mood of this memoir is nonfiction. the moo...              1
8872  the mood was created by the author in the memo...              1
8873  in the memoir "narciso rodriguez", the mood cr...              1
8874  the mood created @caps3 the author, narciso ro...              1
8875  the author created such specific mood for this...              1

[8876 rows x 2 columns]


In [4]:
print(df['label_encoded'].value_counts())  # Should show only 0s and 1s

label_encoded
1    5303
0    3573
Name: count, dtype: int64


In [5]:
# Split dataset: 70% train, 15% dev, 15% test
train, temp = train_test_split(df, test_size=0.3, stratify=df['label_encoded'], random_state=42)
test, dev = train_test_split(temp, test_size=0.5, stratify=temp['label_encoded'], random_state=42)

In [22]:
# Get labels
y_train = train['label_encoded']
y_dev = dev['label_encoded']
y_test = test['label_encoded']


In [32]:
model = Sequential()

# LSTM layer with L2 regularization, batch normalization, and fewer units
model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2, input_shape=(1, 300), kernel_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(**kwargs)


In [8]:
word2vec_path = '/mnt/c/Users/imana/Desktop/Masters/Foundations of Artificial Intelligence - AI701/AI_AES_project/models/word2vecmodel_overall_score.bin'
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Step 3: Generate Feature Vectors for Essays
clean_train_essays = [essay_to_wordlist(essay, remove_stopwords=True) for essay in train['processed_text']]
clean_test_essays = [essay_to_wordlist(essay, remove_stopwords=True) for essay in test['processed_text']]
clean_dev_essays = [essay_to_wordlist(essay, remove_stopwords=True) for essay in dev['processed_text']]

In [9]:
num_features = 300
trainDataVecs = getAvgFeatureVecs(clean_train_essays, w2v_model, num_features)
testDataVecs = getAvgFeatureVecs(clean_test_essays, w2v_model, num_features)
devDataVecs = getAvgFeatureVecs(clean_dev_essays, w2v_model, num_features)

In [10]:
print(trainDataVecs.shape)

(6213, 300)


In [11]:
# Reshape data for LSTM
trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))
devDataVecs = np.reshape(devDataVecs, (devDataVecs.shape[0], 1, devDataVecs.shape[1]))

In [25]:
y_train = np.array(y_train)
y_dev = np.array(y_dev)
y_test = np.array(y_test)

In [13]:
print("Shape of trainDataVecs:", trainDataVecs.shape)  # Expected shape: (num_samples, 1, 300)
print("Shape of y_train:", y_train.shape)              # Expected shape: (num_samples,)
print("Shape of devDataVecs:", devDataVecs.shape)
print("Shape of y_dev:", y_dev.shape)

Shape of trainDataVecs: (6213, 1, 300)
Shape of y_train: (6213,)
Shape of devDataVecs: (1332, 1, 300)
Shape of y_dev: (1332,)


In [33]:
# Define EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
log_dir = "logs/classifier/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=train['label_encoded'])
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Train the model
history = model.fit(
    trainDataVecs, y_train,
    validation_data=(devDataVecs, y_dev),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, tensorboard_callback],
    class_weight=class_weight_dict
)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(testDataVecs, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Classification report
y_test_pred = (model.predict(testDataVecs) > 0.5).astype("int32").flatten()
y_test_true = y_test
print(classification_report(y_test_true, y_test_pred, target_names=label_encoder.classes_))

Epoch 1/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.7615 - loss: 1.7151 - val_accuracy: 0.9857 - val_loss: 1.5361
Epoch 2/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9366 - loss: 1.1942 - val_accuracy: 0.9880 - val_loss: 1.1629
Epoch 3/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9636 - loss: 0.8833 - val_accuracy: 0.9880 - val_loss: 0.7731
Epoch 4/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9733 - loss: 0.6842 - val_accuracy: 0.9880 - val_loss: 0.5337
Epoch 5/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9822 - loss: 0.5427 - val_accuracy: 0.9895 - val_loss: 0.4161
Epoch 6/10
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9812 - loss: 0.4425 - val_accuracy: 0.9895 - val_loss: 0.3441
Epoch 7/10
[1m195/19

In [None]:
pred = model.predict(testDataVecs)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


array([0.03990596], dtype=float32)

In [58]:
essay = 1250
print('essay: ',test['processed_text'].iloc[essay],'\n','label:', test['label_encoded'].iloc[essay])
print('predicted:', pred[essay])

essay:  think we can all agree that computer usage is very controversal issue. in my opinion, believe that computers have negative effect on people. for instance, it' not safe and children can get into all sorts of things on the internet. also, people spend too much time in front the computer now days, @caps1, its major distraction and also negetive effect on kids. school work. it' now or never! do we dicide that computers have negetive effect? you decide! isn' every parents biggest concern the safety of their children? when on the internet, kids are capable of accessing anything and everything. sometimes kids don' even look for bad things, they just pop up. would you want your child veiwing things that you have no control over? also, websites like @caps2.com one one of the greatest concerns when it comes to internet safety. although you are supposed to be at least @num1 to have @caps2, most kids lie about their age. did you know that @num2 out of @num3 @caps2 users lie about their age