In [None]:
import pandas as pd

# Load the raw movie catalogue and report its schema.
data = pd.read_csv('movies.csv')
print("Data Info:")
# DataFrame.info() prints the summary itself and returns None; wrapping it in
# print() emitted a stray "None" line after the summary.
data.info()

# Clean and enrich in one chain so the cell never mutates a frame in place:
#   1. drop rows that lack a title or a genre list,
#   2. trim and lower-case the title,
#   3. pull the first parenthesised four-digit number out of the title as the
#      release year (NaN when the title carries none).
data = (
    data
    .dropna(subset=['title', 'genres'])
    .assign(title=lambda frame: frame['title'].str.strip().str.lower())
    .assign(
        year=lambda frame: pd.to_numeric(
            frame['title'].str.extract(r'\((\d{4})\)', expand=False),
            errors='coerce',
        )
    )
)

# One 0/1 indicator column per genre; the genre list is '|'-separated.
genres = data['genres'].str.get_dummies('|')
data = pd.concat([data, genres], axis=1).set_index('movieId')

# movieId is the index, so it is written as the first CSV column.
data.to_csv('preprocessed_movies.csv')


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None


In [None]:
import pandas as pd

# Load the raw ratings and report schema and missing-value counts.
ratings_data = pd.read_csv('ratings.csv')
print("Data Info:")
# DataFrame.info() prints its own report and returns None; print() around it
# only added a stray "None" line.
ratings_data.info()

missing_values = ratings_data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Keep only ratings inside the closed interval [0.5, 5.0].
valid_rating_range = (0.5, 5.0)
rating_in_range = ratings_data['rating'].between(*valid_rating_range)
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
ratings_data = ratings_data[rating_in_range].copy()

# The raw timestamp column holds integer seconds (unit='s'); convert to datetime64.
ratings_data['timestamp'] = pd.to_datetime(ratings_data['timestamp'], unit='s')

# Reproducible 80/20 split.
# NOTE(review): train_data / test_data are not used or saved by any cell
# visible here -- confirm they are consumed elsewhere.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings_data, test_size=0.2, random_state=42)

# The default integer index is written too (appears as an unnamed first column).
ratings_data.to_csv('preprocessed_ratings.csv')


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7568238 entries, 0 to 7568237
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 231.0 MB
None
Missing Values:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [None]:
# Read only the columns the collaborative-filtering cells need, using compact
# dtypes. Each mapping doubles as the column selection for its file.
movie_dtypes = {'movieId': 'int32', 'title': 'str'}
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

movies_df = pd.read_csv(
    'preprocessed_movies.csv',
    usecols=list(movie_dtypes),
    dtype=movie_dtypes,
)
rating_df = pd.read_csv(
    'preprocessed_ratings.csv',
    usecols=list(rating_dtypes),
    dtype=rating_dtypes,
)

In [None]:
# Preview the first five movie rows (movieId, title).
movies_df.head(5)

Unnamed: 0,movieId,title
0,1,toy story (1995)
1,2,jumanji (1995)
2,3,grumpier old men (1995)
3,4,waiting to exhale (1995)
4,5,father of the bride part ii (1995)


In [None]:
# Preview the first five rating rows (userId, movieId, rating).
rating_df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [None]:
# Attach each rating's movie title by joining on the shared movieId key.
# The default inner join drops ratings whose movieId has no row in movies_df.
df = rating_df.merge(movies_df, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,296,5.0,pulp fiction (1994)
1,3,296,5.0,pulp fiction (1994)
2,4,296,4.0,pulp fiction (1994)
3,5,296,4.0,pulp fiction (1994)
4,7,296,4.0,pulp fiction (1994)


In [None]:
# Ratings without a title cannot be grouped, so drop them first.
combine_movie_rating = df.dropna(subset=['title'])

# Number of non-null ratings per title, as a two-column frame
# (title, totalRatingCount).
movie_ratingCount = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movie_ratingCount.head()


Unnamed: 0,title,totalRatingCount
0,"""great performances"" cats (1998)",56
1,#1 cheerleader camp (2010),4
2,#female pleasure (2018),1
3,#followme (2019),1
4,#horror (2015),5


In [None]:
# Broadcast each title's total rating count onto every one of its rating rows.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_ratingCount,
    how='left',
    on='title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,296,5.0,pulp fiction (1994),24038
1,3,296,5.0,pulp fiction (1994),24038
2,4,296,4.0,pulp fiction (1994),24038
3,5,296,4.0,pulp fiction (1994),24038
4,7,296,4.0,pulp fiction (1994),24038


In [None]:
# Render floats with three decimals from here on (a notebook-wide display setting).
pd.options.display.float_format = lambda value: '%.3f' % value

# Distribution of per-title rating counts; used to choose a popularity cut-off.
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   43700.000
mean      173.186
std       865.036
min         1.000
25%         1.000
50%         4.000
75%        27.000
max     24686.000
Name: totalRatingCount, dtype: float64


In [None]:
# Keep only ratings of titles that have at least this many ratings in total.
popularity_threshold = 50
rating_popular_movie = rating_with_totalRatingCount.loc[
    lambda frame: frame['totalRatingCount'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,296,5.0,pulp fiction (1994),24038
1,3,296,5.0,pulp fiction (1994),24038
2,4,296,4.0,pulp fiction (1994),24038
3,5,296,4.0,pulp fiction (1994),24038
4,7,296,4.0,pulp fiction (1994),24038


In [None]:
# (rows, columns) of the rating table left after the popularity filter.
rating_popular_movie.shape

(7334860, 5)

In [None]:
# Build the item-by-user rating matrix: one row per title, one column per
# userId. pivot_table averages ratings that share a (title, userId) cell
# (its default aggregation); missing pairs are filled with 0.
movie_features_df = pd.pivot_table(
    rating_popular_movie,
    index='title',
    columns='userId',
    values='rating',
).fillna(0)
movie_features_df.head()

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Compress the zero-filled rating matrix into CSR form before fitting.
movie_features_df_matrix = csr_matrix(movie_features_df.to_numpy())

# Brute-force nearest-neighbour search over title rows using cosine distance.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)

In [None]:
# (number of titles, number of users) in the pivoted rating matrix.
movie_features_df.shape

(1838, 2848)

In [None]:
# numpy is not imported by any earlier cell shown here, so without this import
# the cell raises NameError on a fresh kernel.
import numpy as np

# Draw a random row position from the title-by-user matrix and print it.
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)

# The random draw is then overridden with a fixed position, so every later cell
# queries row 2 and the printed value above is informational only.
query_index = 2

140


In [None]:
# Select the query title's rating row as a (1, n_users) array and ask for the
# 6 nearest rows; the recommendation cell treats the first hit as the query itself.
query_vector = movie_features_df.iloc[[query_index]].to_numpy()
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

In [None]:
# Preview the first five title rows of the rating matrix.
movie_features_df.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,2839,2840,2841,2842,2843,2844,2845,2846,2847,2848
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Flatten the (1, k) kneighbors results into parallel 1-D sequences.
neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

# Header names the query title; it is printed only when there is a result.
if len(neighbor_distances) > 0:
    print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# Position 0 is skipped (treated as the query itself); the rest are ranked from 1.
for rank, (position, distance) in enumerate(
    zip(neighbor_positions[1:], neighbor_distances[1:]), start=1
):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[position], distance))

Recommendations for 10 Cloverfield Lane (2016):

1: Get Out (2017), with distance of 0.5031648874282837:
2: Arrival (2016), with distance of 0.5208242535591125:
3: Nightcrawler (2014), with distance of 0.5881109237670898:
4: The Revenant (2015), with distance of 0.6026796102523804:
5: Split (2017), with distance of 0.6057538986206055:


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Load the movie metadata for the content-based recommender.
# NOTE(review): this reuses the names `df` and (below) `indices` from the
# collaborative-filtering cells, so those cells cannot be re-run after this one
# without re-running their inputs.
# NOTE(review): the info() output recorded for 'movies.csv' earlier in this
# notebook lists only movieId/title/genres; this cell needs an 'overview'
# column -- confirm which file is meant.
df = pd.read_csv('movies.csv')

# TF-IDF over the plot overviews, ignoring English stop words. Missing
# overviews become empty strings so the vectorizer accepts every row.
tfidf = TfidfVectorizer(stop_words='english')
df['overview'] = df['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Pairwise similarity between all overviews as a dense n x n matrix.
# TfidfVectorizer L2-normalises rows by default, so the linear kernel equals
# cosine similarity here.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse lookup: title -> row label in df.
# Series.drop_duplicates() compares the *values* (row labels), so it left
# repeated titles in the index. Deduplicate on the index instead, keeping the
# first row for each title, so a lookup returns one scalar.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the ten movies most similar to ``title``.

    Parameters
    ----------
    title : str
        A title present in the index of the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the module-level ``cosine_sim`` as it
        was when this function was defined.

    Returns
    -------
    pandas.Series
        Up to ten entries of ``df['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices``.

    Notes
    -----
    Assumes the values in ``indices`` are positional row numbers of ``df``
    (true for a default RangeIndex) -- TODO confirm if ``df`` is re-indexed.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # use its first occurrence so the matrix is indexed with a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the query movie by position rather than assuming it sorts first:
    # an all-zero TF-IDF row has self-similarity 0, and ties can push another
    # movie ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:10]]

    return df['title'].iloc[movie_indices]


# Example query; swap in any title that appears in df['title'].
query_title = 'The Dark Knight'
recommended_movies = get_recommendations(query_title)
print(recommended_movies)


3                         The Dark Knight Rises
428                              Batman Returns
3854    Batman: The Dark Knight Returns, Part 2
299                              Batman Forever
1359                                     Batman
119                               Batman Begins
1181                                        JFK
9            Batman v Superman: Dawn of Justice
2507                                  Slow Burn
210                              Batman & Robin
Name: title, dtype: object
