# This Program is made by Kasra Tookallo

# In the year 2026

# Installation Requirements

In [1]:
!pip install kaggle



# Set up the Kaggle API credentials (Linux shell commands)

In [2]:
# Kaggle CLI setup: create the config directory, copy the API token into it,
# and restrict the token file to owner read/write (mode 600).
# Assumes kaggle.json is present in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install the RAPIDS libraries (cuDF, cuML) on Google Colab

In [3]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 629, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 629 (delta 149), reused 86 (delta 85), pack-reused 434 (from 3)[K
Receiving objects: 100% (629/629), 207.69 KiB | 7.69 MiB/s, done.
Resolving deltas: 100% (323/323), done.
Installing RAPIDS remaining 25.10 libraries
Using Python 3.12.12 environment at: /usr
Resolved 174 packages in 3.14s
Prepared 37 packages in 41.23s
Uninstalled 31 packages in 675ms
Installed 37 packages in 424ms
 - bokeh==3.8.2
 + bokeh==3.6.3
 - cucim-cu12==26.2.0 (from https://pypi.nvidia.com/cucim-cu12/cucim_cu12-26.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl)
 + cucim-cu12==25.10.0
 - cudf-cu12==26.2.1
 + cudf-cu12==25.10.0
 + cugraph-cu12==25.10.1
 - cuml-cu12==26.2.0
 + cuml-cu12==25.10.0
 + cuxfilter-cu12==25.10.0
 - dask==2026.1.1
 + dask==2025.9.1
 - dask-cuda==26.2.0
 + dask-cuda==25.10.0
 -

# Download and unzip the MovieLens 20M dataset from Kaggle

In [4]:
# Download the MovieLens 20M archive with the Kaggle CLI (-d names the dataset to fetch).
!kaggle datasets download -d grouplens/movielens-20m-dataset

Dataset URL: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset
License(s): unknown
Downloading movielens-20m-dataset.zip to /content
 89% 174M/195M [00:00<00:00, 508MB/s] 
100% 195M/195M [00:00<00:00, 515MB/s]


In [5]:
!unzip movielens-20m-dataset.zip

Archive:  movielens-20m-dataset.zip
  inflating: genome_scores.csv       
  inflating: genome_tags.csv         
  inflating: link.csv                
  inflating: movie.csv               
  inflating: rating.csv              
  inflating: tag.csv                 


# Import Libraries

In [6]:
import cudf
import numpy as np
from cuml.feature_extraction.text import TfidfVectorizer,CountVectorizer
from cuml.neighbors import NearestNeighbors


## Data Extraction

In [7]:
# Read the ratings CSV into a cuDF DataFrame (i.e. load the dataset onto the GPU).
ratings = cudf.read_csv("/content/rating.csv")


In [8]:
movies  = cudf.read_csv("/content/movie.csv")

In [9]:
tags = cudf.read_csv("/content/tag.csv")

In [10]:
tags.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 465426 entries, 0 to 465425
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     465426 non-null  int64
 1   movieId    465426 non-null  int64
 2   tag        465426 non-null  object
 3   timestamp  465426 non-null  object
dtypes: int64(2), object(2)
memory usage: 24.0+ MB


## Fixing data types & dropping unnecessary columns

In [11]:
# Store the join key as an unsigned 32-bit integer, then keep only the
# columns that are used later on (movieId and tag).
tags["movieId"] = tags["movieId"].astype("uint32")
tags = tags.drop(columns=["timestamp", "userId"])

In [12]:
# Downcast the id and rating columns to 32-bit types.
for column_name, target_dtype in (
    ("userId", np.uint32),
    ("movieId", np.uint32),
    ("rating", "float32"),
):
    ratings[column_name] = ratings[column_name].astype(target_dtype)

# The timestamp was submitted by the user and is not required here.
ratings = ratings.drop(columns=["timestamp"])

In [13]:
movies['movieId'] = movies['movieId'].astype('uint32')

### Casting tags to strings and sorting by movieId

In [14]:
# Make sure every tag is stored as a string, then order the rows by movieId
# ahead of the per-movie grouping.
# (The former `tags_list` host copy and `tags_max` groupby were never used
# anywhere in the notebook and have been removed.)
tags['tag'] = tags['tag'].astype(str)
tags = tags.sort_values('movieId')

### Converting cuDF to pandas, then combining each movie's tags into one string column

In [15]:
# Move the two relevant columns from the GPU frame to pandas.
tags_pd = tags[['movieId', 'tag']].to_pandas()

# Join all tags of a movie into one space-separated string,
# one row per movieId, stored in the `combined_tags` column.
tags_grouped = (
    tags_pd
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
    .rename(columns={'tag': 'combined_tags'})
)

tags_grouped

Unnamed: 0,movieId,combined_tags
0,1,Watched computer animation Disney animated fea...
1,2,time travel adapted from:book board game child...
2,3,old people that is actually funny sequel fever...
3,4,chick flick revenge characters chick flick cha...
4,5,Diane Keaton family sequel Steve Martin weddin...
...,...,...
19539,131054,dinosaurs
19540,131082,documentary Yoshitomo Nara
19541,131164,Vietnam War
19542,131170,alternate reality


In [16]:
tags = tags_grouped
tags

Unnamed: 0,movieId,combined_tags
0,1,Watched computer animation Disney animated fea...
1,2,time travel adapted from:book board game child...
2,3,old people that is actually funny sequel fever...
3,4,chick flick revenge characters chick flick cha...
4,5,Diane Keaton family sequel Steve Martin weddin...
...,...,...
19539,131054,dinosaurs
19540,131082,documentary Yoshitomo Nara
19541,131164,Vietnam War
19542,131170,alternate reality


In [17]:
ratings.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   uint32
 1   movieId  uint32
 2   rating   float32
dtypes: float32(1), uint32(2)
memory usage: 228.9 MB


In [18]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


#### The ratings DataFrame has more than 20 million rows and three columns.
#### Each movieId has many ratings from different userIds, which are collapsed into a single value per movie.
#### The highest rating submitted for each movie is the one that is kept.

In [19]:
# Keep one row per movie holding the highest rating it received.
# A preceding drop_duplicates on (movieId, rating) is unnecessary: the per-group
# maximum is the same with or without duplicate rows.
ratings = ratings.groupby('movieId')['rating'].max().reset_index()
ratings


Unnamed: 0,movieId,rating
0,69075,5.0
1,3589,5.0
2,1122,5.0
3,2042,5.0
4,5513,5.0
...,...,...
26739,4959,5.0
26740,4530,5.0
26741,109351,3.0
26742,86911,5.0


In [20]:
movies.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   movieId  27278 non-null  uint32
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: object(2), uint32(1)
memory usage: 1.4+ MB


In [21]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
tags.info()
print("-"*15)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19544 entries, 0 to 19543
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   movieId        19544 non-null  uint32
 1   combined_tags  19544 non-null  object
dtypes: object(1), uint32(1)
memory usage: 229.2+ KB
None
---------------
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 26744 entries, 0 to 26743
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   movieId  26744 non-null  uint32
 1   rating   26744 non-null  float32
dtypes: float32(1), uint32(1)
memory usage: 208.9 KB
None


#### First, convert the grouped tags from pandas back to cuDF so that the merges run on the GPU.
#### Then merge the first two datasets (ratings and tags) on movieId.
#### Finally, merge the result with the movies dataset into one general DataFrame.

In [22]:
# Bring the grouped tags (pandas) back to a cuDF frame.
tags = cudf.DataFrame.from_pandas(tags_grouped)

# Outer join on movieId: keep movies that have only a rating or only tags.
merged_ra_ta = ratings.merge(tags, on=['movieId'], how='outer')
merged_ra_ta

Unnamed: 0,movieId,rating,combined_tags
0,115240,4.0,Iceland road trip
1,105015,2.0,nude black women nudity (topless)
2,6688,5.0,Vladimír Michálek 01/11 01/12 02/11 03/11 04/1...
3,39231,5.0,Bad atmospheric music life relaxing funeral Ki...
4,56837,4.5,playwright:Shakespeare based on a play royalty...
...,...,...,...
27273,128886,,Michel Deville
27274,129201,,California San Francisco
27275,129443,,Matt Muir
27276,129820,,high school robots


In [23]:
# Left join: attach title and genres from `movies` to every rating/tag row.
merged_final = merged_ra_ta.merge(movies, on=['movieId'], how='left')
merged_final

Unnamed: 0,movieId,rating,combined_tags,title,genres
0,79879,5.0,death/fatality 3D version blood Christopher Ll...,Piranha (Piranha 3D) (2010),Action|Horror|Thriller
1,4180,5.0,girls in prison spoof sybil danning wendy o wi...,Reform School Girls (1986),Action|Drama
2,80748,5.0,adapted from:book author:Lewis Carroll Norman ...,Alice in Wonderland (1933),Adventure|Children|Fantasy
3,6272,5.0,less than 300 ratings 1.5 crime poignant psych...,Eye of God (1997),Drama
4,104849,4.0,prank calls serial killer teenagers,Lisa (1990),Drama|Thriller
...,...,...,...,...,...
27273,123481,3.0,,Storm Warning (1951),Crime|Drama
27274,27779,4.0,,Moon Child (2003),Action|Drama|Sci-Fi
27275,106674,3.0,,Crackerjack (1994),Action
27276,110372,3.0,,"Boeing, Boeing (1965)",Comedy


##### Finding columns with missing (null) values

In [24]:
merged_final.isna().sum()

movieId             0
rating            534
combined_tags    7734
title               0
genres              0
dtype: int64

In [25]:
merged_final.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   movieId        27278 non-null  uint32
 1   rating         26744 non-null  float32
 2   combined_tags  19544 non-null  object
 3   title          27278 non-null  object
 4   genres         27278 non-null  object
dtypes: float32(1), object(3), uint32(1)
memory usage: 7.0+ MB


##### Filling missing values.
##### <li>Missing ratings are filled with the constant 2.5 (the code does not compute ratings.mean()).
##### <li>Missing tag strings are filled with "Unknown".

In [26]:
# Fill missing ratings with the numeric value 2.5 (a float, matching the column's
# float32 dtype) instead of the string "2.5".
merged_final['rating'] = merged_final['rating'].fillna(2.5)
# Movies without any tags get the placeholder text "Unknown".
merged_final['combined_tags'] = merged_final['combined_tags'].fillna("Unknown")


In [27]:
merged_final.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   movieId        27278 non-null  uint32
 1   rating         27278 non-null  float32
 2   combined_tags  27278 non-null  object
 3   title          27278 non-null  object
 4   genres         27278 non-null  object
dtypes: float32(1), object(3), uint32(1)
memory usage: 7.0+ MB


In [28]:

merged_df = merged_final
merged_df

Unnamed: 0,movieId,rating,combined_tags,title,genres
0,79879,5.0,death/fatality 3D version blood Christopher Ll...,Piranha (Piranha 3D) (2010),Action|Horror|Thriller
1,4180,5.0,girls in prison spoof sybil danning wendy o wi...,Reform School Girls (1986),Action|Drama
2,80748,5.0,adapted from:book author:Lewis Carroll Norman ...,Alice in Wonderland (1933),Adventure|Children|Fantasy
3,6272,5.0,less than 300 ratings 1.5 crime poignant psych...,Eye of God (1997),Drama
4,104849,4.0,prank calls serial killer teenagers,Lisa (1990),Drama|Thriller
...,...,...,...,...,...
27273,123481,3.0,Unknown,Storm Warning (1951),Crime|Drama
27274,27779,4.0,Unknown,Moon Child (2003),Action|Drama|Sci-Fi
27275,106674,3.0,Unknown,Crackerjack (1994),Action
27276,110372,3.0,Unknown,"Boeing, Boeing (1965)",Comedy


In [29]:
merged_df.info()


<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   movieId        27278 non-null  uint32
 1   rating         27278 non-null  float32
 2   combined_tags  27278 non-null  object
 3   title          27278 non-null  object
 4   genres         27278 non-null  object
dtypes: float32(1), object(3), uint32(1)
memory usage: 7.0+ MB


In [30]:
merged_df.columns

Index(['movieId', 'rating', 'combined_tags', 'title', 'genres'], dtype='object')

#### Combining the rating, combined_tags, and genres features into a single text column for the following steps.

In [31]:
# Build one text field per movie: "<rating> <combined_tags> <genres>".
rating_text = merged_df["rating"].astype(str)
tags_text = merged_df["combined_tags"].astype(str)
genres_text = merged_df["genres"].astype(str)

merged_df["combined_features"] = rating_text + " " + tags_text + " " + genres_text
merged_df

Unnamed: 0,movieId,rating,combined_tags,title,genres,combined_features
0,79879,5.0,death/fatality 3D version blood Christopher Ll...,Piranha (Piranha 3D) (2010),Action|Horror|Thriller,5.0 death/fatality 3D version blood Christophe...
1,4180,5.0,girls in prison spoof sybil danning wendy o wi...,Reform School Girls (1986),Action|Drama,5.0 girls in prison spoof sybil danning wendy ...
2,80748,5.0,adapted from:book author:Lewis Carroll Norman ...,Alice in Wonderland (1933),Adventure|Children|Fantasy,5.0 adapted from:book author:Lewis Carroll Nor...
3,6272,5.0,less than 300 ratings 1.5 crime poignant psych...,Eye of God (1997),Drama,5.0 less than 300 ratings 1.5 crime poignant p...
4,104849,4.0,prank calls serial killer teenagers,Lisa (1990),Drama|Thriller,4.0 prank calls serial killer teenagers Drama|...
...,...,...,...,...,...,...
27273,123481,3.0,Unknown,Storm Warning (1951),Crime|Drama,3.0 Unknown Crime|Drama
27274,27779,4.0,Unknown,Moon Child (2003),Action|Drama|Sci-Fi,4.0 Unknown Action|Drama|Sci-Fi
27275,106674,3.0,Unknown,Crackerjack (1994),Action,3.0 Unknown Action
27276,110372,3.0,Unknown,"Boeing, Boeing (1965)",Comedy,3.0 Unknown Comedy


In [32]:
# Keep only the columns needed by the recommender and renumber the rows from 0.
final_df = merged_df[["movieId", "title", "combined_features"]].reset_index(drop=True)
final_df.head(12)

Unnamed: 0,movieId,title,combined_features
0,79879,Piranha (Piranha 3D) (2010),5.0 death/fatality 3D version blood Christophe...
1,4180,Reform School Girls (1986),5.0 girls in prison spoof sybil danning wendy ...
2,80748,Alice in Wonderland (1933),5.0 adapted from:book author:Lewis Carroll Nor...
3,6272,Eye of God (1997),5.0 less than 300 ratings 1.5 crime poignant p...
4,104849,Lisa (1990),4.0 prank calls serial killer teenagers Drama|...
5,8835,Paparazzi (2004),4.5 Patrick Stewart thrilling Gfei own it Eric...
6,94539,Viva Riva! (2010),3.0 Unknown Drama
7,88042,Cornered (1945),4.5 Unknown Film-Noir|Thriller
8,1723,Twisted (1996),5.0 Unknown Comedy|Drama
9,125399,The Story of Alexander Graham Bell (1939),3.0 Unknown Drama


#### Assessing reliability of the final DataFrame (movieId feature) before applying the recommendation_system.

In [33]:
# The row count and the number of distinct movieIds should be equal.
print(f"Total rows: {len(final_df)}")
print(f"Unique movieId: {final_df['movieId'].nunique()}")

final_df['movieId'].value_counts().head(10)

Total rows: 27278
Unique movieId: 27278


movieId
60137    1
3510     1
27857    1
71973    1
3311     1
77344    1
89635    1
7772     1
47634    1
82167    1
Name: count, dtype: int64

In [34]:
# Collect every row whose movieId occurs more than once (all occurrences are kept).
is_repeated = final_df.duplicated(subset='movieId', keep=False)
duplicates = final_df[is_repeated]
duplicates.sort_values('movieId').head(20)

Unnamed: 0,movieId,title,combined_features


### The recommendation system starts here

In [35]:
# TF-IDF settings: drop English stop words and cap the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vocabulary and vectorize the combined text in one step (cuML, on the GPU).
tfidf_matrix = vectorizer.fit_transform(final_df['combined_features'])

print(f"TF-IDF shape: {tfidf_matrix.shape}")

TF-IDF shape: (27278, 5000)


In [36]:
# cuML nearest-neighbour search with the cosine metric.
# 11 neighbours per movie = the movie itself + 10 similar titles.
nn_model = NearestNeighbors(n_neighbors=11, metric='cosine')
nn_model.fit(tfidf_matrix)

# Query every movie against the fitted model: neighbour rows and their distances.
distances, indices = nn_model.kneighbors(tfidf_matrix)

print(f"Distances shape: {distances.shape}")
print(f"Indices shape: {indices.shape}")

Distances shape: (27278, 11)
Indices shape: (27278, 11)


In [37]:
movie_idx = 0  # row position of the query movie (first row)

# Drop the first neighbour, which is expected to be the query movie itself.
# NOTE(review): movies with identical feature text tie at distance 0, so the
# query is not guaranteed to be returned first - confirm for the chosen row.
similar_movies_idx = indices[movie_idx][1:]
similar_movies = final_df.iloc[similar_movies_idx]

# Select the column first so that .iloc yields the title string itself
# instead of a one-element Series (which prints with index and dtype lines).
print(" Chosen Movie :", final_df['title'].iloc[movie_idx])
print("10 most likely:")
print(similar_movies[['movieId', 'title']])

 Chosen Movie : 0    Piranha (Piranha 3D) (2010)
Name: title, dtype: object
10 most likely:
       movieId                                           title
14630    53435                          Hostel: Part II (2007)
12420    31359                                 Moolaadé (2004)
4337     92058  Human Centipede II (Full Sequence), The (2011)
4662     96448          Piranha 3DD (a.k.a. Piranha DD) (2012)
5937     26356                              Supervixens (1975)
13491    96239                               Chillerama (2011)
506      82366                               Hatchet II (2010)
1092    103299                            American Mary (2012)
20308     6940                In My Skin (Dans ma Peau) (2002)
25560    90888                                Immortals (2011)


### Function for looking up the movies most similar to a given title.
### <li> Input: title of the movie & k: number of most similar recommendations to return
### <li> Returns: cuDF DataFrame of the similar movies


In [38]:
def get_top_k_similar_with_score(title, final_df, indices, distances, k=10):
    """Return the ``k`` movies most similar to ``title`` with a similarity score.

    Parameters
    ----------
    title : str
        Exact value of the ``title`` column identifying the query movie.
        When several rows share the title, the first one is used.
    final_df : DataFrame
        Frame with at least ``movieId`` and ``title`` columns. Its row order
        must match the rows of ``indices`` and ``distances``.
    indices : 2-D array
        For each row of ``final_df``, the row positions of its nearest neighbours.
    distances : 2-D array
        Distances that correspond element-wise to ``indices``.
    k : int, default 10
        Maximum number of similar movies to return. Fewer rows are returned
        when a row of ``indices`` holds fewer than ``k`` other movies.

    Returns
    -------
    DataFrame
        Columns ``selected_movie``, ``similar_movie``, ``movieId`` and
        ``similarity_percent``, sorted by ``similarity_percent`` descending.
        The score is ``(1 - distance) * 100``.

    Raises
    ------
    ValueError
        If ``k`` is negative or no row has the requested title.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}.")

    # Locate the query movie by row *position* (not index label), because the
    # neighbour arrays are addressed positionally.
    is_match = (final_df['title'] == title).fillna(False).to_numpy()
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        raise ValueError(f"The Movie with Title: '{title}' ------>> Not Found.")

    movie_idx = int(positions[0])  # the chosen movie

    neighbor_idx = indices[movie_idx]
    neighbor_dist = distances[movie_idx]

    # Exclude the chosen movie explicitly instead of assuming it is the first
    # neighbour: movies with identical features tie at distance 0 and may be
    # returned ahead of it.
    not_self = neighbor_idx != movie_idx
    similar_idx = neighbor_idx[not_self][:k]
    similar_dist = neighbor_dist[not_self][:k]

    # Convert distance to a similarity percentage
    similarity_percent = (1 - similar_dist) * 100

    # Fetch the rows of the most similar movies
    similar_movies = final_df.iloc[similar_idx].copy()
    similar_movies['selected_movie'] = title
    similar_movies = similar_movies.rename(columns={'title': 'similar_movie'})

    # Attach the percentage column
    similar_movies['similarity_percent'] = similarity_percent

    # Most similar first
    similar_movies = similar_movies.sort_values(by='similarity_percent', ascending=False)

    # Keep only the readable result columns, in display order
    similar_movies = similar_movies[['selected_movie', 'similar_movie', 'movieId', 'similarity_percent']]

    return similar_movies

##### Example

In [39]:
movie_name = "Ice Age (2002)"
top_similar = get_top_k_similar_with_score(movie_name, final_df, indices, distances, k=10)
top_similar


Unnamed: 0,selected_movie,similar_movie,movieId,similarity_percent
23630,Ice Age (2002),"Bug's Life, A (1998)",2355,84.732956
12273,Ice Age (2002),"Monsters, Inc. (2001)",4886,82.506271
4531,Ice Age (2002),Toy Story (1995),1,79.342049
21157,Ice Age (2002),Finding Nemo (2003),6377,79.212662
15908,Ice Age (2002),Toy Story 2 (1999),3114,75.176491
2937,Ice Age (2002),Ratatouille (2007),50872,74.805878
5111,Ice Age (2002),"Incredibles, The (2004)",8961,71.993233
22487,Ice Age (2002),Up (2009),68954,68.828461
15871,Ice Age (2002),The Legend of Mor'du (2012),120470,67.467079
26042,Ice Age (2002),Tin Toy (1988),95446,67.467079


In [40]:
top_similar.to_csv("10_Recommended_Movies.csv", index=False)