In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import helper

In [2]:
# Import the Movies dataset
data = pd.read_csv('movies.csv')
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Filter movies with Comedy and Fantasy genres
comedy_fantasy_movies = data[data['genres'].str.contains('Comedy') & data['genres'].str.contains('Fantasy')]

In [4]:
# Extract the movie titles
movie_titles = comedy_fantasy_movies['title'].tolist()

In [5]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_titles)


In [6]:
# Perform K-means clustering
k = 3  # You can change the number of clusters as needed
kmeans = KMeans(n_clusters=k)
kmeans.fit(tfidf_matrix)



In [7]:
# Add cluster labels to the dataset
comedy_fantasy_movies['cluster'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comedy_fantasy_movies['cluster'] = kmeans.labels_


In [8]:
# Print movies in each cluster
for cluster_num in range(k):
    cluster_movies = comedy_fantasy_movies[comedy_fantasy_movies['cluster'] == cluster_num]
    print(f"Cluster {cluster_num + 1}:")
    print(cluster_movies['title'].tolist())

Cluster 1:
['Toy Story (1995)', 'Reckless (1995)', 'Gordy (1995)', "Kid in King Arthur's Court, A (1995)", 'Santa Clause, The (1994)', 'Flintstones, The (1994)', 'Mask, The (1994)', 'Addams Family Values (1993)', 'Last Action Hero (1993)', 'Super Mario Bros. (1993)', 'Ghost (1990)', 'Space Jam (1996)', 'Visitors, The (Visiteurs, Les) (1993)', 'Nutty Professor, The (1996)', 'Kazaam (1996)', "Joe's Apartment (1996)", 'Matilda (1996)', 'Herbie Rides Again (1974)', 'Mary Poppins (1964)', 'Aladdin and the King of Thieves (1996)', 'Willy Wonka & the Chocolate Factory (1971)', 'Drop Dead Fred (1991)', 'Monty Python and the Holy Grail (1975)', 'Lesson Faust (1994)', 'Princess Bride, The (1987)', 'Army of Darkness (1993)', 'Dead Alive (Braindead) (1992)', 'Evil Dead II (Dead by Dawn) (1987)', 'Groundhog Day (1993)', 'Young Frankenstein (1974)', 'Michael (1996)', 'Flubber (1997)', 'Borrowers, The (1997)', 'Goonies, The (1985)', 'Freaky Friday (1977)', 'Absent-Minded Professor, The (1961)', 'Devi