<a href="https://colab.research.google.com/github/SilverCobra-prog/sstp_loneliness_classifiers/blob/main/loneliness_class_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Download the NLTK resources used for text normalization: tokenizer models,
# the English stop-word list, and the WordNet data behind the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer models from 'punkt_tab'. On releases that do not know that
# package, nltk.download() reports the failure and returns False rather than
# raising, so the extra request is harmless there.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# A set keeps the per-token stop-word membership test cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Read the labelled dataset (a CSV shared through Google Drive) into a DataFrame.
drive_file_id = '12M_H4oziPEU5V0ee46wMbLvIPuEHj1HK'
url = 'https://drive.google.com/uc?id={}'.format(drive_file_id)
data = pd.read_csv(url)

# Expose 'human_label1' under the name 'feelLonely' and discard 'human_label2'.
data = data.rename(columns={'human_label1': 'feelLonely'})
data = data.drop('human_label2', axis=1)

# Keep only the model input column and its label column.
data = data[["text", "feelLonely"]]

In [None]:
def correct_spelling(text):
    """Return `text` after running TextBlob's spelling correction over it."""
    corrected = TextBlob(text).correct()
    return str(corrected)


def remove_punctuation(text):
    """Return `text` with every punctuation character deleted.

    Letters, digits and whitespace are kept; everything else is removed
    without inserting a replacement character.
    """
    # The regex word class also matches the underscore, so "not a word char and
    # not whitespace" alone would let '_' through; it is listed explicitly.
    return re.sub(r'[^\w\s]|_', '', text)

# Strip punctuation from every document and store the result back in the
# "text" column.
# NOTE(review): normalize_text is defined further down but is not applied to
# the data in the visible cells -- confirm whether that is intended.
cleaned_text = data["text"].apply(remove_punctuation)
data["text"] = cleaned_text

def convert_to_lowercase(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

def normalize_text(text):
    """Clean `text` and return it as a space-separated string of lemmas.

    Steps, in order: strip punctuation, correct spelling, lowercase, split
    into sentences, tokenize each sentence into words, drop English stop
    words, and lemmatize the remaining tokens.
    """
    cleaned = convert_to_lowercase(correct_spelling(remove_punctuation(text)))

    # NOTE(review): punctuation is already gone by the time the text is
    # segmented, so sent_tokenize probably has no sentence terminators left
    # to split on -- confirm this ordering is intended.
    lemmas = []
    for sentence in sent_tokenize(cleaned):
        for token in word_tokenize(sentence):
            if token in stop_words:
                continue
            lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

In [None]:
# Text vectorization and train/validation/test split.
#
# The raw text is split BEFORE the TF-IDF vectorizer is fitted, and the
# vectorizer is fitted on the training portion only. Fitting it on the whole
# dataset would let vocabulary and document-frequency statistics from the
# validation and test documents leak into the training features.
texts = data["text"]
y = data["feelLonely"]

# 80% train; the remaining 20% is halved into validation and test.
text_train, text_temp, y_train, y_temp = train_test_split(texts, y, test_size=0.2, random_state=42)
text_val, text_test, y_val, y_test = train_test_split(text_temp, y_temp, test_size=0.5, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
# Validation and test documents are only transformed with the train-fitted vocabulary.
X_val = vectorizer.transform(text_val)
X_test = vectorizer.transform(text_test)

# Feature matrix for the full dataset, kept so code that refers to X still works.
X = vectorizer.transform(texts)

In [None]:
def _to_cnn_input(features):
    """Return `features` as a dense array of shape (samples, features, 1).

    Accepts either a sparse matrix (anything exposing ``toarray``) or an
    array-like. Arrays that already have the trailing channel axis come back
    with the same shape, so re-running this cell does not fail.
    """
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    return dense.reshape(dense.shape[0], dense.shape[1], 1)


# Convert every split, including the test set: the evaluation code feeds
# X_test to the model in the same (samples, features, 1) layout, and a sparse
# matrix cannot be reshaped to three dimensions.
X_train = _to_cnn_input(X_train)
X_val = _to_cnn_input(X_val)
X_test = _to_cnn_input(X_test)

In [None]:
# Create the 1D CNN.
#
# The inputs are TF-IDF weights reshaped to (samples, features, 1): real
# values, not integer token ids. An Embedding layer performs an integer index
# lookup, so it is not used here; the convolution reads the TF-IDF values
# directly, treating the feature axis as the sequence and the trailing axis
# as a single channel.
n_features = X_train.shape[1]

model = Sequential()
model.add(tf.keras.Input(shape=(n_features, 1)))
model.add(Conv1D(128, 5, activation='relu'))
# Collapse the feature axis, keeping the strongest response of each filter.
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: output is the predicted probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Stop once validation loss has failed to improve for 3 consecutive epochs and
# roll the weights back to the best epoch seen.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

In [None]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, callbacks=[early_stopping])

# Evaluate the model on the held-out test set.
# X_test may still be a sparse matrix at this point, and a sparse matrix cannot
# be reshaped to three dimensions, so densify it first (a dense array is used
# as-is) and then add the trailing channel axis used for the training data.
X_test_cnn = X_test.toarray() if hasattr(X_test, "toarray") else np.asarray(X_test)
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], X_test_cnn.shape[1], 1)

loss, accuracy = model.evaluate(X_test_cnn, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
# Plot the confusion matrix
# Predict on the test set first. The features must be dense and shaped
# (samples, features, 1), the same layout the model is trained on.
X_test_cnn = X_test.toarray() if hasattr(X_test, "toarray") else np.asarray(X_test)
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], X_test_cnn.shape[1], 1)

# Turn the sigmoid probabilities into hard 0/1 labels at the 0.5 threshold.
y_pred = (model.predict(X_test_cnn) > 0.5).astype(int).ravel()

# labels=[0, 1] pins the row/column order to the tick labels below.
# NOTE(review): assumes feelLonely is encoded as 0 = not lonely, 1 = lonely -- confirm.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=["Not Lonely", "Lonely"], yticklabels=["Not Lonely", "Lonely"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Calculate classification metrics
# The report needs hard class predictions for the test set, so compute them
# here: densify/reshape the features to the (samples, features, 1) layout the
# model is trained on, then threshold the sigmoid probabilities at 0.5.
X_test_cnn = X_test.toarray() if hasattr(X_test, "toarray") else np.asarray(X_test)
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], X_test_cnn.shape[1], 1)
y_pred = (model.predict(X_test_cnn) > 0.5).astype(int).ravel()

classification_metrics = classification_report(y_test, y_pred)
print(classification_metrics)