In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, KFold

movies = pd.read_csv(r'E:\3rd Year 1st semi\IRWA\Movie_Recomendation_system\recommendation-system\DataSet\movies.csv')
ratings = pd.read_csv(r'E:\3rd Year 1st semi\IRWA\Movie_Recomendation_system\recommendation-system\DataSet\ratings.csv')





In [69]:
# Data overview
print("Movies Dataset Info:")
print(movies.info())
print("\nRatings Dataset Info:")
print(ratings.info())

Movies Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9125 non-null   int64 
 1   title    9125 non-null   object
 2   genres   9125 non-null   object
dtypes: int64(1), object(2)
memory usage: 214.0+ KB
None

Ratings Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


In [70]:
# Check for missing values
print("\nMissing Values in Movies:")
print(movies.isnull().sum())
print("\nMissing Values in Ratings:")
print(ratings.isnull().sum())


Missing Values in Movies:
movieId    0
title      0
genres     0
dtype: int64

Missing Values in Ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [71]:
# Merge datasets on movie ID
merged_data = pd.merge(ratings, movies, on='movieId')

In [72]:
# Create user-item matrix
user_item_matrix = merged_data.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

# Convert to sparse matrix
from scipy.sparse import csr_matrix
sparse_user_item_matrix = csr_matrix(user_item_matrix)



n_components = 10  # Adjust as necessary
svd = TruncatedSVD(n_components=n_components)
reduced_matrix = svd.fit_transform(sparse_user_item_matrix)

### Implement Recommendation Algorithms

##### Collaborative Filtering (User-Based)

In [78]:
# Fit the Nearest Neighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
model_knn.fit(reduced_matrix)

# Function to get user-based recommendations
def get_user_based_recommendations(user_id, n_recommendations=5):
    user_index = user_item_matrix.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(reduced_matrix[user_index].reshape(1, -1), n_neighbors=n_recommendations + 1)
    
    # Get the recommended movie IDs
    recommended_movie_indices = indices.flatten()[1:]
    return user_item_matrix.columns[recommended_movie_indices].tolist()

# Example usage
user_id_example = 1
print("\nUser-based recommendations for User 1:", get_user_based_recommendations(user_id_example, n_recommendations=5))


User-based recommendations for User 1: [61, 259, 680, 344, 36]


#### Item-Based Collaborative Filtering

##### Content-Based Filtering

In [79]:
# TF-IDF Vectorization for movie titles and genres
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['title'] + ' ' + movies['genres'].fillna(''))

# Compute cosine similarity for content-based filtering
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get content-based recommendations
def get_content_based_recommendations(movie_title, n_recommendations=5):
    idx = movies[movies['title'] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top n recommendations
    recommended_indices = [i[0] for i in sim_scores[1:n_recommendations + 1]]
    return movies['title'].iloc[recommended_indices].tolist()

# Example usage
movie_title_example = "Toy Story (1995)"
print("\nContent-based recommendations for 'Toy Story (1995)':", get_content_based_recommendations(movie_title_example, n_recommendations=5))


Content-based recommendations for 'Toy Story (1995)': ['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story of Terror (2013)', "We're Back! A Dinosaur's Story (1993)", 'Balto (1995)']


##### Hybrid Model

In [80]:
# Function to get hybrid recommendations
def get_hybrid_recommendations(user_id, movie_title, n_recommendations=5):
    user_based_recommendations = get_user_based_recommendations(user_id, n_recommendations)
    content_based_recommendations = get_content_based_recommendations(movie_title, n_recommendations)
    
    # Combine both recommendations
    combined_recommendations = list(set(user_based_recommendations) | set(content_based_recommendations))
    return combined_recommendations[:n_recommendations]

# Example usage
print("\nHybrid recommendations for User 1 and 'Toy Story (1995)':", get_hybrid_recommendations(user_id_example, movie_title_example, n_recommendations=5))



Hybrid recommendations for User 1 and 'Toy Story (1995)': ['Toy Story 3 (2010)', 259, 36, 'Toy Story of Terror (2013)', 680]


#### Model Evaluation

In [81]:
# Create a test set
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

# Generate predictions using user-based collaborative filtering
def predict_rating(user_id, movie_id):
    if movie_id in user_item_matrix.columns:
        user_index = user_item_matrix.index.get_loc(user_id)
        movie_index = user_item_matrix.columns.get_loc(movie_id)
        return user_item_matrix.iloc[user_index, movie_index]
    else:
        return 0  # Or some other default value if the movie was not rated by the user

# Create actual and predicted ratings
actual_ratings = test_data[['userId', 'movieId', 'rating']]
predicted_ratings = actual_ratings.copy()
predicted_ratings['predicted_rating'] = predicted_ratings.apply(lambda x: predict_rating(x['userId'], x['movieId']), axis=1)

# Calculate evaluation metrics
precision = precision_score(actual_ratings['rating'].round(), predicted_ratings['predicted_rating'].round(), average='weighted', zero_division=0)
recall = recall_score(actual_ratings['rating'].round(), predicted_ratings['predicted_rating'].round(), average='weighted', zero_division=0)
f1 = f1_score(actual_ratings['rating'].round(), predicted_ratings['predicted_rating'].round(), average='weighted', zero_division=0)
rmse = np.sqrt(mean_squared_error(actual_ratings['rating'], predicted_ratings['predicted_rating']))
mae = mean_absolute_error(actual_ratings['rating'], predicted_ratings['predicted_rating'])

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")


Precision: 1.00
Recall: 1.00
F1 Score: 1.00
RMSE: 0.00
MAE: 0.00
