In [None]:
import ast

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [None]:
# Dataset: IMDB top movies, downloaded from Kaggle
# (https://www.kaggle.com/datasets/saincoder404/imdb-top-movies-dataset?resource=download).
# NOTE(review): the absolute /content/ path looks like a Colab upload location —
# confirm or adjust it before running in another environment.
data = pd.read_csv(filepath_or_buffer="/content/top_rated_movies.csv")

In [None]:
# Summarize the raw table: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8560 entries, 0 to 8559
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         8560 non-null   int64  
 1   adult              8560 non-null   bool   
 2   backdrop_path      8553 non-null   object 
 3   genre_ids          8560 non-null   object 
 4   id                 8560 non-null   int64  
 5   original_language  8560 non-null   object 
 6   original_title     8560 non-null   object 
 7   overview           8560 non-null   object 
 8   popularity         8560 non-null   float64
 9   poster_path        8557 non-null   object 
 10  release_date       8560 non-null   object 
 11  title              8560 non-null   object 
 12  video              8560 non-null   bool   
 13  vote_average       8560 non-null   float64
 14  vote_count         8560 non-null   int64  
dtypes: bool(2), float64(2), int64(3), object(8)
memory usage: 886.2+ KB


In [None]:
# Drop the columns that are not used later in the notebook, leaving genre_ids,
# id, overview, popularity, title, vote_average and vote_count.
# errors="ignore" keeps this cell safe to re-run: `data` is reassigned here, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
data = data.drop(
    columns=["Unnamed: 0", "adult", "backdrop_path", "original_language",
             "original_title", "poster_path", "release_date", "video"],
    errors="ignore",
)

In [None]:
# Preview the first rows of the table after the unused columns were dropped
data.head()

Unnamed: 0,genre_ids,id,overview,popularity,title,vote_average,vote_count
0,"[18, 80]",278,Imprisoned in the 1940s for the double murder ...,123.586,The Shawshank Redemption,8.705,26207
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",148.891,The Godfather,8.695,19870
2,"[18, 80]",240,In the continuing saga of the Corleone crime f...,152.722,The Godfather Part II,8.6,11994
3,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,76.924,Schindler's List,8.6,15402
4,[18],389,The defense and the prosecution have rested an...,72.1,12 Angry Men,8.541,8272


In [None]:
# Number of distinct genre_ids values (each value is a whole genre combination,
# not a single genre); dropna=False counts a missing value too, like unique() does
data["genre_ids"].nunique(dropna=False)

1922

In [None]:
# genre_ids is read from the CSV as the text of a Python list (e.g. "[18, 80]"),
# so each cell must be parsed back into a list before it can be binarized.
# ast.literal_eval replaces the previous eval(): it accepts only Python
# literals, so a crafted cell in the downloaded CSV cannot run arbitrary code.
genre_lists = data['genre_ids'].apply(ast.literal_eval)

# One-hot encode the genres with MultiLabelBinarizer: one 0/1 column per genre id
mlb = MultiLabelBinarizer()
genre_one_hot = mlb.fit_transform(genre_lists)

# Wrap the encoded matrix in a DataFrame whose column labels are the genre ids
data_genres = pd.DataFrame(genre_one_hot, columns=mlb.classes_)

In [None]:
# (rows, columns) of the one-hot genre table
data_genres.shape

(8560, 18)

In [None]:
# Numeric columns that will be standardized
numerical_features = ['popularity', 'vote_average', 'vote_count']

# Fit a StandardScaler on those columns, then store the scaled values in a
# DataFrame that reuses the same column names
scaler = StandardScaler().fit(data[numerical_features])
data_numerical = pd.DataFrame(
    data=scaler.transform(data[numerical_features]),
    columns=numerical_features,
)

In [None]:
# (rows, columns) of the scaled numeric feature table
data_numerical.shape

(8560, 3)

In [None]:
# Place the genre indicator columns and the scaled numeric columns side by side,
# then convert every column label to a string (the genre columns are labelled
# by genre id, the numeric ones by name).
data_features = (
    pd.concat([data_genres, data_numerical], axis="columns")
    .rename(columns=str)
)

In [None]:
# Preview the combined feature table (genre indicators + scaled numeric columns)
data_features.head()

Unnamed: 0,12,14,16,18,27,28,35,36,37,53,...,878,9648,10402,10749,10751,10752,10770,popularity,vote_average,vote_count
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.948442,2.979946,7.512878
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.233774,2.964442,5.540094
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.276971,2.817154,3.0882
3,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0.422294,2.817154,4.149152
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.3679,2.72568,1.929497


In [None]:
# Create the nearest-neighbour model: 5 neighbours per query, with the search
# algorithm left for scikit-learn to choose ('auto')
model = NearestNeighbors(algorithm='auto', n_neighbors=5)

# Fit the model on the combined feature table
model.fit(X=data_features)

In [None]:
# Movie whose neighbours we want to look up, e.g. the movie with id 278
target_movie_id = 278

# Locate the row by POSITION rather than by index label: data_features is
# addressed with .iloc below, so a label taken from data.index is only correct
# while `data` still has its default 0..n-1 index.
matching_positions = (data['id'] == target_movie_id).to_numpy().nonzero()[0]
if len(matching_positions) == 0:
    # Fail with an explicit message instead of an opaque IndexError
    raise ValueError(f"No movie with id {target_movie_id} found in data")
target_movie_index = matching_positions[0]

# Feature row of the target movie, kept as a one-row DataFrame
# (double brackets) so the column names are passed along to the model
target_movie_features = data_features.iloc[[target_movie_index]]

# Ask the fitted model for the nearest neighbours of that row
distances, indices = model.kneighbors(target_movie_features)

# Show id and title of the neighbours. The target movie was part of the
# fitted data, so it normally appears in its own result list.
closest_movies = data.iloc[indices[0]]
print(closest_movies[['id', 'title']])

         id                     title
0       278  The Shawshank Redemption
254  106646   The Wolf of Wall Street
139   68718          Django Unchained
13       13              Forrest Gump
1       238             The Godfather
