In [21]:
import pandas as pd
import ast
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Read the CSV file
df = pd.read_csv('/Users/bandhaviparvathaneni/Downloads/combined_movie_genres_tags.csv')

# Step 2: Extract tags from the 'tagsWithName' column
tags = []
for tags_list_str in df['tagsWithName']:
    tags_list = ast.literal_eval(tags_list_str)  # Convert string representation of list to actual list
    tags.extend(tags_list)

# Print the length of the tags list to check if it's not empty
print("Number of tags:", len(tags))

# Step 3: Calculate Semantic Similarity
# Initialize CountVectorizer to convert tags to vectors
vectorizer = CountVectorizer()
tag_vectors = vectorizer.fit_transform(tags)

# Print the shape of the tag_vectors matrix to verify it's not empty
print("Shape of tag_vectors:", tag_vectors.shape)

# Calculate cosine similarity between tag vectors
similarity_matrix = cosine_similarity(tag_vectors)

# Print the similarity matrix to inspect the values
print("Similarity matrix:", similarity_matrix)

# Step 4: Construct a Network
# Create an empty graph
G = nx.Graph()

# Add nodes for each tag
for tag in tags:
    G.add_node(tag)

# Add edges between similar tags
for i in range(len(tags)):
    for j in range(i + 1, len(tags)):
        similarity = similarity_matrix[i, j]
        if similarity > 0.0001:  # Adjust threshold as needed
            G.add_edge(tags[i], tags[j], weight=similarity)

# Print the number of nodes and edges in the graph
print("Number of nodes in the graph:", G.number_of_nodes())
print("Number of edges in the graph:", G.number_of_edges())

# Step 5: Apply Clustering Algorithm
clusters = list(nx.algorithms.community.greedy_modularity_communities(G))

# Print the clusters
for i, cluster in enumerate(clusters):
    print(f'Cluster {i+1}: {cluster}')

Number of tags: 32896
Shape of tag_vectors: (32896, 4396)
Similarity matrix: [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
Number of nodes in the graph: 4436
Number of edges in the graph: 21390
Cluster 1: frozenset({'costume drama', 'scifi cult', 'marriage', 'burton', 'tim allen', 'memories', 'high brow', 'immigrant life', 'salt lake city', 'unsimulated sex scenes', 'bfi modern classic', 'setting:la', 'paris', 'subgenre:cop buddies', 'classic monster', 'immigrant', 'hw foreign', 'police state', 'boarding school', 'sex scenes', 'woody allen classic', 'social control', 'chicago', 'lost classic', 'setting:philadelphia', 'life in general', 'setting:chicago', 'setting:diner', 'childhood flashback', 'hillarious comedy', 'cult film', 'classic horror', 'classic animated tale', 'political unrest', 'criterion', 'martial arts', 'political drama', 'unintentional comedy', 'giant monster', '

In [2]:
import csv 



# Step 5: Apply Clustering Algorithm
clusters = list(nx.algorithms.community.greedy_modularity_communities(G))

# Step 6: Store clusters in a CSV file
output_file = 'clusters.csv'
with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Cluster', 'Tags'])
    for i, cluster in enumerate(clusters):
        writer.writerow([f'Cluster {i+1}', ', '.join(cluster)])

print(f'Clusters saved to {output_file}')


Clusters saved to clusters.csv
