In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

2023-07-15 17:30:05.411698: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the dataset
data = pd.read_csv("human_annotations_same.csv")  
data = data.rename(columns={'human_label1': 'feelLonely'}).drop('human_label2', axis=1)

# Select relevant columns
data = data[["text", "feelLonely"]]

# Write the modified data to the CSV file
data.to_csv('human_annotations_same1.csv', index=False)

In [3]:
# Preprocessing
# Text vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["text"])
y = data["feelLonely"]

# Split the dataset into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert sparse matrices to dense arrays
X_train = X_train.toarray()
X_val = X_val.toarray()

In [4]:
# Create the neural network model
model = Sequential()
model.add(Dense(64, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(32, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(2, activation="softmax"))

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])  

# Define early stopping criteria
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

In [5]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Test Loss: 0.42868825793266296
Test Accuracy: 0.7964338660240173


In [6]:
# Perform tenfold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", np.mean(scores))

NameError: name 'StratifiedKFold' is not defined

In [None]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=["Not Lonely", "Lonely"], yticklabels=["Not Lonely", "Lonely"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Calculate classification metrics
classification_metrics = classification_report(y_test, y_pred)

# Print the classification metrics
print(classification_metrics)