In [1]:
# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pandas as pd

df = pd.read_csv("./labelled_posts.csv")

# Drop reposts (i.e. engagement & comments = 0)
no_reposts_df = df[(df['engagement'] != 0) | (df['comments'] != 0)].copy()

# Lowercase all words
no_reposts_df.loc[:, "content"] = no_reposts_df["content"].apply(lambda x : str.lower(x).replace("\n\n", "").replace("\n", ""))


texts, labels = no_reposts_df["content"], no_reposts_df["personal_exp"]


In [2]:
from tensorflow.keras import layers, Sequential
from tensorflow import expand_dims

max_features = no_reposts_df["content"].str.len().max()
sequence_length = int(0.01 * max_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=21)

vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(X_train)


def vectorize_text(text):
    text = expand_dims(text, -1)
    return vectorize_layer(text)


X_train_vectorized = vectorize_layer(X_train)


In [3]:

model = Sequential([
    # layers.Embedding(max_features + 1, 16),
    # layers.GlobalAveragePooling1D(),
    # layers.Dense(1)
    layers.Embedding(max_features + 1, 16, input_length=sequence_length),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
]
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 16)            44608     
                                                                 
 flatten (Flatten)           (None, 432)               0         
                                                                 
 dense (Dense)               (None, 64)                27712     
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 74433 (290.75 KB)
Trainable params: 74433 (290.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [4]:
model.compile(loss="binary_crossentropy",
              optimizer='adam',
              metrics=["accuracy"])

In [5]:
epochs = 12
history = model.fit(
    X_train_vectorized, y_train,
    epochs=epochs)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [6]:
X_test_vectorized = vectorize_layer(X_test)
loss, accuracy = model.evaluate(X_test_vectorized, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.761904776096344


In [8]:
df = pd.read_csv("./my_posts_labelled.csv")

# Drop reposts (i.e. engagement & comments = 0)
no_reposts_df = df[(df['engagement'] != 0) | (df['comments'] != 0)].copy()

# Lowercase all words
no_reposts_df.loc[:, "content"] = no_reposts_df["content"].apply(lambda x : str.lower(x).replace("\n\n", "").replace("\n", ""))


texts, labels = no_reposts_df["content"], no_reposts_df["personal_exp"]
X_test_vectorized = vectorize_layer(texts)
loss, accuracy = model.evaluate(X_test_vectorized, labels)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.27272728085517883


# From ChatGPT

In [7]:
# Encode labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Tokenization and Padding
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=20)  # Adjust maxlen as needed

# Load and train Word2Vec model (replace 'sentences' with your dataset)
word2vec_model = gensim.models.Word2Vec(sentences=sequences, vector_size=100, window=5, min_count=1, sg=0)

# Create an embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 100))  # Embedding size is 100
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels_encoded, test_size=0.2, random_state=42)

# Build the neural network model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=2)

# Evaluate the model on the test data
y_pred = model.predict(X_test)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

# Convert one-hot encoded labels back to original labels
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred_binary)

# Evaluate and print model performance
accuracy = accuracy_score(y_test_original, y_pred_original)
conf_matrix = confusion_matrix(y_test_original, y_pred_original)
classification_rep = classification_report(y_test_original, y_pred_original)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


NameError: name 'gensim' is not defined