In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

**Data Preparation**

In [None]:
!pip install emoji

In [None]:
!pip install contractions

In [None]:
import pandas as pd
import re
import nltk
import string
import emoji
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK data this notebook relies on. 'stopwords' backs the
# stopwords.words('english') call below; 'punkt_tab' is presumably the
# tokenizer data word_tokenize needs -- confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
import pandas as pd

# Folder on Drive that holds the raw dataset. The reduced dataset is written
# here too, because the following cell reads it back from this exact folder
# (writing to the working directory left that read pointing at a missing or
# stale file).
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/GAT"
SAMPLES_PER_CLASS = 1000

# Load the dataset
df = pd.read_csv(f"{PROJECT_DIR}/Mental-Health-Twitter.csv")

# Draw the same number of rows from each class so the labels are balanced.
# sample() raises ValueError if a class has fewer than SAMPLES_PER_CLASS rows.
positive_samples = df[df['label'] == 1].sample(n=SAMPLES_PER_CLASS, random_state=42)
negative_samples = df[df['label'] == 0].sample(n=SAMPLES_PER_CLASS, random_state=42)

# Combine and shuffle
reduced_df = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42)

# Save to new CSV (same location the next cell loads from)
reduced_df.to_csv(f"{PROJECT_DIR}/reduced_dataset.csv", index=False)


In [None]:
# Read reduced_dataset.csv from the Drive project folder; `df` is rebound to it
# and every later cell works on this frame.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/GAT/reduced_dataset.csv')

In [None]:
# Preview the first rows as plain text, without the index column.
print(df.head().to_string(index=False))


In [None]:
# When the raw schema is present, rename 'post_text' to 'text' and keep only the
# two columns the rest of the notebook uses. Otherwise the frame is left as is.
if {'label', 'post_text'}.issubset(df.columns):
    df = df.rename(columns={'post_text': 'text'})[['text', 'label']]
print("Columns:", df.columns)
print(df.head().to_string(index=False))

# Shuffle the rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
## Data cleaning (text)
def clean_text(text):
  """Normalise one raw tweet before tokenisation.

  Steps, in order: drop @mentions, #hashtags and URLs; strip emojis; expand
  contractions; remove ASCII punctuation; lowercase; collapse whitespace.

  Args:
    text: Raw tweet. A missing value (None or float NaN) is treated as an
      empty tweet; any other non-string value is converted with str().

  Returns:
    The cleaned, lowercased text with single spaces between words.
  """
  # A missing CSV cell arrives as None or float NaN (NaN is the only float that
  # is not equal to itself); re.sub would raise TypeError on either.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)
  text = re.sub(r'@\w+|#\w+|http\S+|www\S+', '', text)  # remove mentions, hashtags and URLs
  text = emoji.replace_emoji(text, replace='')  # remove emojis
  text = contractions.fix(text)  # expand contractions
  text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
  text = text.lower()
  # Removing tokens above leaves runs of spaces behind; squeeze them to one.
  return ' '.join(text.split())

In [None]:
## Data Preprocessing
# English stop-word list held as a set, and the stemmer instance; both are
# module-level so preprocess() below can reuse them for every tweet.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
  """Tokenise `text`, drop stop words, stem what is left and re-join with spaces.

  Reads the module-level `stop_words` collection and `stemmer` object.
  """
  kept_stems = (
    stemmer.stem(token)
    for token in word_tokenize(text)
    if token not in stop_words
  )
  return ' '.join(kept_stems)


In [None]:
# Run both text stages over the tweets and keep only the final text (exposed
# again under the name 'text') together with its label.
df = (
    df.assign(processed_text=df['text'].apply(clean_text).apply(preprocess))
      [['processed_text', 'label']]
      .rename(columns={'processed_text': 'text'})
)
print(df.head().to_string())

# Persist the preprocessed data so later stages can reload it.
df.to_csv('Preprocessed_Mental_Health_Tweets.csv', index=False)

# Sanity checks: row count and class balance.
print(len(df))
print(df['label'].value_counts())

**Embedding Extraction**

In [None]:
pip install transformers torch

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import DebertaTokenizer, DebertaModel
from tqdm import tqdm

In [None]:
# Pretrained tokenizers first, then the matching encoders. Each tokenizer/model
# pair is loaded from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
deberta_tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

bert_model = BertModel.from_pretrained('bert-base-uncased')
roberta_model = RobertaModel.from_pretrained('roberta-base')
deberta_model = DebertaModel.from_pretrained('microsoft/deberta-base')

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put every encoder in evaluation mode and move it to the chosen device.
for encoder in (bert_model, roberta_model, deberta_model):
  encoder.eval()
  encoder.to(device)

device

In [None]:
# keep_default_na=False: a tweet that is empty after preprocessing is stored as an
# empty CSV field, which pandas would otherwise read back as NaN (and which the
# embedding step would then turn into the literal text "nan"). It also stops
# real tokens such as "na" or "null" from being parsed as missing values.
df = pd.read_csv("Preprocessed_Mental_Health_Tweets.csv", keep_default_na=False)

In [None]:
MAX_LEN = 64  # every input is truncated / padded to this many tokens

def get_embedding(text, tokenizer, model):
  """Return one mean-pooled embedding vector for `text`.

  The text is tokenised to exactly MAX_LEN positions (truncated or padded), run
  through `model` without gradients, and the last hidden states are averaged
  over the real tokens only. Padding positions are excluded through the
  tokenizer's attention mask; averaging over all MAX_LEN positions would mix
  padding states into the vector, and more so the shorter the tweet is.

  Args:
    text: Input text; non-string values are converted with str().
    tokenizer: Callable returning a mapping of tensors (expected to contain
      'attention_mask') when called with return_tensors='pt'.
    model: Callable accepting those tensors as keyword arguments and returning
      an object with a `last_hidden_state` of shape (1, MAX_LEN, hidden_size).

  Returns:
    A CPU tensor of shape (hidden_size,).
  """
  if not isinstance(text, str):
    text = str(text)
  inputs = tokenizer(text, return_tensors='pt', max_length=MAX_LEN, truncation=True, padding='max_length')  # preparing text for model
  inputs = {k: v.to(device) for k, v in inputs.items()}
  with torch.no_grad():
    outputs = model(**inputs)
    last_hidden = outputs.last_hidden_state  # shape: (1, MAX_LEN, hidden_size)
    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
      # No mask supplied by the tokenizer: fall back to a plain mean over all positions.
      avg_pooled = last_hidden.mean(dim=1)
    else:
      mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)  # (1, MAX_LEN, 1)
      token_count = mask.sum(dim=1).clamp(min=1)  # guard against an all-zero mask
      avg_pooled = (last_hidden * mask).sum(dim=1) / token_count  # (1, hidden_size)
  return avg_pooled.squeeze(0).cpu()  # drop the batch dimension -> (hidden_size,)


# Collect one pooled vector per tweet from each of the three encoders.
bert_embeddings = []
roberta_embeddings = []
deberta_embeddings = []

# (destination list, tokenizer, model) triples, visited in this order for every tweet
encoder_runs = (
  (bert_embeddings, bert_tokenizer, bert_model),
  (roberta_embeddings, roberta_tokenizer, roberta_model),
  (deberta_embeddings, deberta_tokenizer, deberta_model),
)

print("Extracting embeddings...")

for text in tqdm(df['text']):
  for destination, enc_tokenizer, enc_model in encoder_runs:
    destination.append(get_embedding(text, enc_tokenizer, enc_model))

print("Embeddings extracted.")

In [None]:
# Stack each per-tweet list into a single (num_samples, hidden_size) tensor.
bert_tensor = torch.stack(bert_embeddings)
roberta_tensor = torch.stack(roberta_embeddings)
deberta_tensor = torch.stack(deberta_embeddings)

# The three encoders must all produce 768-wide vectors before they can be averaged.
per_encoder = (bert_tensor, roberta_tensor, deberta_tensor)
assert all(matrix.shape[1] == 768 for matrix in per_encoder)

# Ensemble: put the encoders side by side on a new last axis ...
stacked = torch.stack(per_encoder, dim=2)  # size: (num_samples, hidden_size, 3)

# ... and average across that axis to get one fused vector per tweet.
fused_embeddings = stacked.mean(dim=2)

# Save the feature matrix and the matching labels.
label_tensor = torch.tensor(df['label'].values)
torch.save(fused_embeddings, "Feature_Matrix.pt")
torch.save(label_tensor, "Labels.pt")

print("Embeddings extracted and saved.")

**Graph Construction**

In [None]:
pip install networkx torch-geometric

In [None]:
import networkx as nx
from torch_geometric.data import Data
import numpy as np
import torch
from tqdm import tqdm

In [None]:
# Load the feature matrix and labels from the same relative paths the embedding
# stage saved them to, so this works whatever the working directory is.
# Both files hold plain tensors, so the safer weights_only=True loader suffices.
feature_matrix = torch.load("Feature_Matrix.pt", weights_only=True)  # shape: (N, hidden_size)
labels = torch.load("Labels.pt", weights_only=True)                  # shape: (N,)

print("Feature matrix:", feature_matrix.shape)
print("Labels:", labels.shape)

In [None]:
# Constructing subgraphs: the nodes are processed in consecutive batches and
# edges are only created between nodes of the same batch.
BATCH_SIZE = 500
# An edge is kept only when the dot-product similarity is strictly above this value.
# NOTE(review): the similarities are unnormalised dot products; if most of them are
# positive this threshold yields a near-complete graph per batch -- consider cosine
# similarity with a higher cut-off.
EDGE_WEIGHT_THRESHOLD = 0

all_subgraphs = []
num_samples = feature_matrix.shape[0]

print("constructing subgraphs...")

for start_idx in tqdm(range(0, num_samples, BATCH_SIZE)):
  end_idx = min(start_idx + BATCH_SIZE, num_samples)
  batch_feats = feature_matrix[start_idx:end_idx]  # (C, hidden_size)

  adj_matrix = torch.mm(batch_feats, batch_feats.T)  # pairwise dot products, (C, C)

  G = nx.Graph()
  G.add_nodes_from(range(start_idx, end_idx))  # node ids are global row indices

  # Select all pairs i < j at once instead of a Python double loop. triu_indices
  # lists the pairs row by row, i.e. in the order the nested loops visited them,
  # so edges are inserted in the same order as before.
  batch_len = adj_matrix.shape[0]
  rows, cols = torch.triu_indices(batch_len, batch_len, offset=1)
  weights = adj_matrix[rows, cols]
  keep = weights > EDGE_WEIGHT_THRESHOLD
  G.add_weighted_edges_from(zip(
      (rows[keep] + start_idx).tolist(),
      (cols[keep] + start_idx).tolist(),
      weights[keep].tolist(),
  ))

  all_subgraphs.append(G)

print("Subgraphs constructed.")

print("Merging subgraphs...")
full_graph = nx.compose_all(all_subgraphs)  # merging subgraphs into a single graph
print("Subgraphs merged.")

In [None]:
# Convert to PyTorch Geometric format.
# networkx yields every undirected edge once, as (u, v). PyG reads edge_index as
# directed source -> target pairs, so each edge is added in both directions;
# otherwise messages would flow one way only. reshape(-1, 2) keeps the result
# well-formed (shape (2, 0)) when the graph has no edges.
edge_pairs = torch.tensor(list(full_graph.edges), dtype=torch.long).reshape(-1, 2)
edge_index = torch.cat([edge_pairs, edge_pairs.flip(1)], dim=0).t().contiguous()
x = feature_matrix
y = labels

data = Data(x=x, edge_index=edge_index, y=y)

# Save graph data to the Drive folder the training section loads it from.
torch.save(data, "/content/drive/MyDrive/Colab Notebooks/GAT/Graph_Data.pt")

print("✅ Graph construction complete.")
print("Graph stats:")
print(f"- Nodes: {data.num_nodes}")
print(f"- Edges: {data.num_edges}")  # counts directed entries, i.e. 2x the undirected edges
print(f"- Feature dim: {data.num_node_features}")


In [None]:
#!pip install torch-scatter
!pip install torch-sparse
!pip install scikit-learn
#pip install torch-geometric

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import random
import numpy as np

In [None]:
# Load the graph data
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects. It is used here because the file holds a torch_geometric Data object
# rather than plain tensors; only load files produced by this notebook.
data: Data = torch.load("/content/drive/MyDrive/Colab Notebooks/GAT/Graph_Data.pt", weights_only=False)
# Use the GPU when available and move the whole graph there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = data.to(device)

In [None]:
def generate_masks(num_nodes, train_ratio=0.6, val_ratio=0.2, seed=42):
    """Randomly split node indices into disjoint train / val / test boolean masks.

    Args:
        num_nodes: Total number of nodes to split.
        train_ratio: Fraction of nodes for training (size is floored).
        val_ratio: Fraction of nodes for validation (size is floored).
        seed: Seed for the shuffle; the same seed always gives the same split.

    Returns:
        (train_mask, val_mask, test_mask): CPU bool tensors of shape
        (num_nodes,). The test mask receives every node not used by the
        other two.

    Raises:
        ValueError: If a ratio is outside [0, 1] or the two ratios sum to more than 1.
    """
    if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError("train_ratio and val_ratio must each be within [0, 1]")
    if train_ratio + val_ratio > 1 + 1e-9:  # tolerance for float rounding
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    # A private RandomState gives the same permutation as seeding the global
    # NumPy RNG with `seed`, without resetting that global state for other code.
    indices = np.random.RandomState(seed).permutation(num_nodes)
    train_end = int(train_ratio * num_nodes)
    val_end = train_end + int(val_ratio * num_nodes)

    def _mask_for(selected):
        # Boolean mask that is True exactly at the selected node indices.
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[torch.as_tensor(selected, dtype=torch.long)] = True
        return mask

    train_mask = _mask_for(indices[:train_end])
    val_mask = _mask_for(indices[train_end:val_end])
    test_mask = _mask_for(indices[val_end:])

    return train_mask, val_mask, test_mask

In [None]:
# generate_masks builds its masks on the CPU; move them to the device that holds
# the node features so masking does not mix devices.
data.train_mask, data.val_mask, data.test_mask = (
    mask.to(data.x.device) for mask in generate_masks(data.num_nodes)
)


In [None]:
#define GAT model
from torch.nn import Linear, Dropout

class GAT(torch.nn.Module):
    """Two-layer graph attention network that outputs one score per class for each node.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Per-head output size of the first attention layer.
        out_channels: Number of output scores per node.
        heads: Number of attention heads in the first layer.
        dropout: Rate passed to both attention layers and to the dropout
            applied between them.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=1, dropout=0.5):
        super().__init__()
        # The second layer is sized for hidden_channels * heads inputs, i.e. the
        # first layer's heads laid side by side.
        second_layer_in = hidden_channels * heads
        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        self.gat2 = GATConv(second_layer_in, out_channels, heads=1, concat=False, dropout=dropout)
        self.dropout = Dropout(dropout)

    def forward(self, x, edge_index):
        """Return raw (unnormalised) class scores for every node."""
        hidden = self.dropout(F.elu(self.gat1(x, edge_index)))
        return self.gat2(hidden, edge_index)

In [None]:
# Seed the RNGs before the model is built so weight initialisation and dropout
# start from a fixed state on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 4 attention heads of 128 channels each in the first layer, 2 output classes.
model = GAT(in_channels=data.num_node_features, hidden_channels=128, out_channels=2, heads=4).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()


In [None]:
def train():
    """Run one full-graph optimisation step and return the training loss as a float.

    Uses the module-level `model`, `optimizer`, `loss_fn` and `data`. The forward
    pass covers every node, but only nodes selected by `data.train_mask`
    contribute to the loss.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    logits = model(data.x, data.edge_index)
    step_loss = loss_fn(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

In [None]:
@torch.no_grad()
def evaluate():
    """Score the current model on the train, val and test splits.

    Uses the module-level `model` and `data`. One forward pass over the whole
    graph is made in eval mode; predictions are the arg-max class per node.

    Returns:
        dict mapping 'train' / 'val' / 'test' to a dict with the keys
        'acc', 'f1', 'precision' and 'recall'.
    """
    model.eval()
    predictions = model(data.x, data.edge_index).argmax(dim=1)

    scorers = (
        ('acc', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    report = {}
    for split_name in ('train', 'val', 'test'):
        selector = data[f'{split_name}_mask']
        # The metric functions receive CPU tensors.
        truth = data.y[selector].cpu()
        guess = predictions[selector].cpu()
        report[split_name] = {key: scorer(truth, guess) for key, scorer in scorers}
    return report

In [None]:
# Train for 1000 epochs (epoch runs from 1 to 1000). Every 50th epoch, run
# evaluate() and print the latest training loss with validation accuracy and F1.
for epoch in range(1, 1001):
    loss = train()
    if epoch % 50 == 0:
        metrics = evaluate()
        print(f"[Epoch {epoch:02d}] Loss: {loss:.4f} | "
              f"Val Acc: {metrics['val']['acc']:.4f} | "
              f"F1: {metrics['val']['f1']:.4f}")

In [None]:
# Step 5: Final test evaluation
# evaluate() scores all three splits; only the 'test' entry is printed here,
# one metric per line.
final_metrics = evaluate()
print("\n✅ Final Test Metrics:")
for metric, value in final_metrics['test'].items():
    print(f"{metric.capitalize()}: {value:.4f}")