In [2]:
import pandas as pd


In [3]:

# Read the .dat file
read_file = pd.read_csv('movies.dat', sep='::', engine='python', encoding='ISO-8859-1', header=None)
movies_df = read_file.columns = ['movieId', 'title', 'genres']


In [4]:
read_file.to_csv('movies.csv', index=False)


In [5]:
ratings_df1 = pd.read_csv(
    'ratings.dat',
    sep='::',
    engine='python',
    header=None,
    encoding='ISO-8859-1'
)
ratings_df1_df = ratings_df1.columns = ['userId', 'movieId', 'rating', 'timestamp']


In [6]:
ratings_df1.to_csv('rating.csv', index=False)


In [7]:
df1 = pd.read_csv("movies.csv")

In [8]:
df2= pd.read_csv("rating.csv")

In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [11]:
from datetime import datetime

# Convert Unix timestamp to datetime
df2['timestamp'] = pd.to_datetime(df2['timestamp'], unit='s')

print(df2.head())

   userId  movieId  rating           timestamp
0       1     1193       5 2000-12-31 22:12:40
1       1      661       3 2000-12-31 22:35:09
2       1      914       3 2000-12-31 22:32:48
3       1     3408       4 2000-12-31 22:04:35
4       1     2355       5 2001-01-06 23:38:11


In [12]:
df = pd.merge(df1, df2, on='movieId')


In [13]:
df.to_csv('ni.csv',index=False)

In [14]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,2001-01-06 23:37:48
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,2000-12-31 04:30:08
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,2000-12-31 03:31:36
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,2000-12-31 01:25:52
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,2000-12-31 01:34:34


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   movieId    1000209 non-null  int64         
 1   title      1000209 non-null  object        
 2   genres     1000209 non-null  object        
 3   userId     1000209 non-null  int64         
 4   rating     1000209 non-null  int64         
 5   timestamp  1000209 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 45.8+ MB


In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [17]:

# Drop duplicate movies (keep only one row per unique title)
df_movies = df[['movieId', 'title', 'genres']].drop_duplicates(subset='title').reset_index(drop=True)


In [18]:

# Clean the genres column (replace '|' with space for TF-IDF)
df_movies['genres'] = df_movies['genres'].str.replace('|', ' ', regex=False)


In [19]:

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['genres'])


In [20]:

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [21]:

# Map movie titles to DataFrame index
indices = pd.Series(df_movies.index, index=df_movies['title'].str.strip())


In [24]:

# Recommendation function
def content_recommend(title, cosine_sim=cosine_sim):
    title = title.strip()
    if title not in indices:
        return f"Title '{title}' not found in the dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:100]
    movie_indices = [i[0] for i in sim_scores]
    return df_movies['title'].iloc[movie_indices].tolist()


In [27]:

# Example usage
print(content_recommend('X-Men (2000)'))


['Highlander III: The Sorcerer (1994)', 'Demolition Man (1993)', 'No Escape (1994)', 'Barb Wire (1996)', 'Adrenalin: Fear the Rush (1996)', 'Fifth Element, The (1997)', 'Godzilla (1998)', 'Godzilla (Gojira) (1954)', 'Godzilla (Gojira) (1984)', 'King Kong vs. Godzilla (Kingukongu tai Gojira) (1962)', 'Star Trek: Insurrection (1998)', 'Planet of the Apes (1968)', 'Beneath the Planet of the Apes (1970)', 'Battle for the Planet of the Apes (1973)', 'Conquest of the Planet of the Apes (1972)', 'Escape from the Planet of the Apes (1971)', 'Wing Commander (1999)', 'Universal Soldier: The Return (1999)', 'Universal Soldier (1992)', 'Pitch Black (2000)', 'Battlefield Earth (2000)', 'Mad Max (1979)', 'Mad Max 2 (a.k.a. The Road Warrior) (1981)', 'Mad Max Beyond Thunderdome (1985)', 'X-Men (2000)', 'Freejack (1992)', 'Space Cowboys (2000)', 'Johnny Mnemonic (1995)', 'Nemesis 2: Nebula (1995)', 'Terminator 2: Judgment Day (1991)', 'Solo (1996)', 'Arrival, The (1996)', 'Lawnmower Man, The (1992)', 