In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the movie metadata table from tmdb_5000_movies.csv
# (relative path, resolved against the current working directory)
movies = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
# Pull the `id` column out of the movies frame as a NumPy array
movie_ids = movies['id'].to_numpy()

In [5]:
# Pull the `vote_count` column out of the movies frame as a NumPy array
vote_counts = movies['vote_count'].to_numpy()

In [6]:
# Rescale the vote counts so they sum to 1; the result is used as the
# per-movie sampling probability (more votes -> drawn more often)
popularity_weights = vote_counts / vote_counts.sum()

In [7]:
# Total number of synthetic ratings to generate
num_ratings = 20_000

In [8]:
# Draw one raw user ID per rating from a heavy-tailed Pareto distribution.
# The legacy global NumPy RNG is seeded here, before the first random draw,
# so that Restart Kernel -> Run All regenerates the same data set.
np.random.seed(42)

alpha = 2.0  # shape parameter for Pareto distribution
# np.random.pareto samples are >= 0, so after the +1 shift and the x100 scaling
# the raw values start at 100: IDs below 100 are never produced.
user_ids = (np.random.pareto(alpha, num_ratings) + 1) * 100
user_ids = user_ids.astype(int)  # truncate to whole-number IDs
# Cap the IDs at 1000; every draw beyond that collapses onto ID 1000.
# The lower bound of 1 never binds because the values already start at 100.
user_ids = np.clip(user_ids, 1, 1000)

In [9]:
# Tally how often each user ID was drawn. bincount is padded to 1001 slots and
# slot 0 is dropped, leaving exactly one count per ID in 1..1000.
user_frequencies = np.bincount(user_ids, minlength=1001)[1:]
# Convert the tallies into a probability distribution over the 1000 IDs,
# so frequently drawn IDs act as frequent raters and rare ones as rare raters
user_weights = user_frequencies / user_frequencies.sum()

In [10]:
# Initialize a set to track unique (user_id, movie_id) pairs;
# it is consulted during sampling to reject pairs that were already generated
unique_pairs = set()

In [11]:
# Generate `num_ratings` distinct (user_id, movie_id) pairs. Users are drawn with
# probability `user_weights`, movies with probability `popularity_weights`;
# a pair that has already been produced is discarded and drawn again.

# Start from an empty set every time this cell runs. If the set still held the
# pairs of a previous run, the loop below would be skipped and both result
# lists would stay empty.
unique_pairs = set()
# NOTE: `user_ids` is rebound here, from the raw Pareto draws to the sampled IDs.
user_ids = []
movie_ids_sampled = []

candidate_user_ids = np.arange(1, 1001)  # built once instead of on every draw

# Only IDs with a non-zero probability can ever be drawn, so the number of
# distinct pairs is bounded. Fail fast instead of looping forever.
max_unique_pairs = (np.count_nonzero(user_weights)
                    * np.unique(movie_ids[popularity_weights > 0]).size)
if num_ratings > max_unique_pairs:
    raise ValueError(
        f"num_ratings={num_ratings} exceeds the {max_unique_pairs} distinct "
        "(user_id, movie_id) pairs that can be sampled"
    )

while len(unique_pairs) < num_ratings:
    # Draw everything that is still missing in one vectorised call per column;
    # duplicates are filtered below and topped up on the next pass.
    remaining = num_ratings - len(unique_pairs)
    drawn_users = np.random.choice(candidate_user_ids, size=remaining, p=user_weights)
    drawn_movies = np.random.choice(movie_ids, size=remaining, p=popularity_weights)
    for user_id, movie_id in zip(drawn_users, drawn_movies):
        if (user_id, movie_id) not in unique_pairs:
            unique_pairs.add((user_id, movie_id))
            user_ids.append(user_id)
            movie_ids_sampled.append(movie_id)

In [12]:
# Draw `num_ratings` values from a normal distribution and pull anything that
# falls outside the 0.0-5.0 rating scale back onto the nearest bound.
mean_rating = 3.0     # centre of the rating distribution
std_dev_rating = 1.5  # spread of the rating distribution
ratings = np.random.normal(
    loc=mean_rating, scale=std_dev_rating, size=num_ratings
).clip(0.0, 5.0)

In [13]:
# Assemble the ratings table: one row per sampled (user, movie) pair,
# with columns movie_id, user_id and rating (in that order)
ratings_df = pd.DataFrame(dict(
    movie_id=movie_ids_sampled,
    user_id=user_ids,
    rating=ratings,
))

In [14]:
# Rating counts of the 10 most-rated movies, largest first.
# Only movies with more than 5 ratings are considered; the result lists the
# counts themselves, not the movie IDs.
sorted(
    ratings_df.groupby('movie_id')['rating'].count().loc[lambda counts: counts > 5],
    reverse=True,
)[:10]

[65, 63, 60, 60, 59, 56, 55, 52, 51, 49]

In [15]:
# top 10 raters
# Rating counts of the 10 most active users, largest first.
# Only users with more than 50 ratings are considered; the result lists the
# counts themselves, not the user IDs.
sorted(
    ratings_df.groupby('user_id')['rating'].count().loc[lambda counts: counts > 50],
    reverse=True,
)[:10]

[363, 347, 344, 341, 329, 322, 315, 309, 307, 285]

In [16]:
# Save to CSV: writes ratings.csv relative to the current working directory;
# index=False leaves the DataFrame's row index out of the file
ratings_df.to_csv('ratings.csv', index = False)