In [2]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from sklearn.cluster import SpectralClustering
from collections import defaultdict

In [3]:
# Load the Sarcasm Headlines Dataset records.
# The file is read as JSON Lines: every non-empty line is parsed as one JSON object.
dataset_path = '/kaggle/input/newsheadlines/Sarcasm_Headlines_Dataset.json'  # Adjust path if needed
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# and skip blank lines (e.g. a trailing empty line), which json.loads would reject.
with open(dataset_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Turn the parsed records into a table, then pull out the headline text
# as a Series for the vectorisation step.
df = pd.DataFrame(data)
headlines = df.loc[:, 'headline']

In [5]:
# Step 1: TF-IDF encoding of the headlines.
# English stop words are dropped before weighting; the vocabulary is learned
# from the same headlines that are transformed.
vectorizer = TfidfVectorizer(
    stop_words='english',
)
headline_vectors = vectorizer.fit_transform(
    headlines,
)

In [6]:
# Step 2: Pairwise cosine similarity between every pair of headline vectors.
# The result is a dense square matrix (one row and one column per headline),
# which the later steps index element-wise.
similarity_matrix = cosine_similarity(
    headline_vectors,
    dense_output=True,  # the default, spelled out because downstream code needs a dense array
)

In [7]:
# Step 3: Create a graph based on similarity
# An undirected edge (i, j) with i < j is added whenever the similarity of the
# pair is strictly greater than `threshold`; the similarity is stored as the
# edge's 'weight'. Headlines with no qualifying neighbour are not added as nodes.
threshold = 0.3  # Set threshold for similarity
graph = nx.Graph()
num_rows, num_cols = similarity_matrix.shape
for i in range(num_rows):
    # Scan only the part of row i to the right of the diagonal, and let the
    # array comparison find the hits instead of testing each element in Python.
    upper_row = similarity_matrix[i, i + 1:num_cols]
    hit_offsets = (upper_row > threshold).nonzero()[0]
    # Offsets are relative to column i + 1; convert back to plain-int node ids.
    graph.add_weighted_edges_from(
        (i, i + 1 + int(offset), upper_row[offset]) for offset in hit_offsets
    )

In [8]:
# Step 4: Spectral clustering on the full similarity matrix.
# affinity='precomputed' tells scikit-learn to treat the matrix itself as the
# affinity between samples; random_state pins the stochastic parts of the fit.
n_clusters = 5  # Set desired number of clusters
clustering = SpectralClustering(
    n_clusters=n_clusters,
    affinity='precomputed',
    random_state=42,
)
clustering.fit(similarity_matrix)
labels = clustering.labels_  # one cluster label per headline



In [9]:
# Step 5: Prepare the adjacency list for Graclus format
# adjacency_list: keyed by the 0-based node id, each value is a list of
#                 (1-based neighbour id, weight) pairs, recorded in both directions.
# edges:          one (1-based u, 1-based v, weight) triple per undirected edge.
adjacency_list = defaultdict(list)
edges = []
for src, dst, w in graph.edges(data='weight'):
    src_id, dst_id = src + 1, dst + 1  # shift to 1-based ids
    adjacency_list[src].append((dst_id, w))
    adjacency_list[dst].append((src_id, w))
    edges.append((src_id, dst_id, w))

In [10]:
# Step 6: Sizes needed for the .graph file header.
# The node count is the number of headlines (rows of the similarity matrix),
# so nodes without any edge are counted as well; each undirected edge counts once.
num_nodes, _ = similarity_matrix.shape
num_edges = len(edges)

In [11]:
# Save to a specific path on Kaggle (working directory)
output_path = '/kaggle/working/output.graph'

# Graclus reads METIS-style graph files: a header "<nodes> <edges> <fmt>" followed
# by exactly one line per vertex (in id order) listing that vertex's neighbours as
# "neighbour weight" pairs. Writing one "u v w" edge per line, as before, does not
# match that layout, so the per-vertex adjacency list is written instead.
# Edge weights in this format are integers, so the similarities are scaled and
# rounded; with weight_scale = 1000 a similarity of 0.345 is written as 345.
weight_scale = 1000

with open(output_path, "w") as f:
    f.write(f"{num_nodes} {num_edges} 1\n")  # '1' indicates weighted graph
    for node in range(num_nodes):
        # adjacency_list is keyed by 0-based node id and already stores 1-based
        # neighbour ids; .get avoids inserting keys for isolated nodes, which
        # still need their own (empty) line.
        neighbours = adjacency_list.get(node, ())
        fields = []
        for neighbour_id, weight in neighbours:
            # Keep weights strictly positive after rounding.
            int_weight = max(1, int(round(float(weight) * weight_scale)))
            fields.append(f"{neighbour_id} {int_weight}")
        f.write(" ".join(fields) + "\n")

print(f"Graph file saved to {output_path}")

Graph file saved to /kaggle/working/output.graph
