In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the movie metadata table from tmdb_5000_movies.csv (path is relative to the working directory)
movies = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')

In [4]:
# Print a summary of the movies frame: number of entries, column names, non-null counts and dtypes
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [5]:
# Extract the `id` column of the movies table as a plain array of movie IDs
id_column = movies['id']
movie_ids = id_column.values

In [6]:
# Extract the `vote_count` column of the movies table as a plain array
vote_count_column = movies['vote_count']
vote_counts = vote_count_column.values

In [7]:
# Turn the raw vote counts into sampling probabilities: each movie's weight is
# its share of the total number of votes
total_votes = vote_counts.sum()
popularity_weights = vote_counts / total_votes

In [8]:
# Number of synthetic ratings to generate
num_ratings = 20000

# Seed NumPy's global random generator so that the random draws made by the
# np.random.* calls in the following cells (and therefore the generated ratings)
# are identical on every Restart & Run All
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [9]:
# Draw one user ID per rating from a heavy-tailed Pareto distribution
alpha = 2.0  # Pareto shape parameter
# np.random.pareto returns non-negative samples, so shifting by 1 and scaling
# by 100 produces values of at least 100 before truncation to integers
raw_user_ids = 100 * (1 + np.random.pareto(alpha, size=num_ratings))
# Bound the IDs to [1, 1000]; because the draws start at 100, in practice only
# the upper bound takes effect and every larger draw is mapped to user 1000
user_ids = np.clip(raw_user_ids.astype(int), 1, 1000)

In [10]:
# Count how many times each user ID was drawn; minlength=1001 guarantees a slot
# for every ID from 0 to 1000, and slot 0 is dropped because IDs start at 1
id_counts = np.bincount(user_ids, minlength=1001)
user_frequencies = id_counts[1:]
# Normalise the counts into a probability distribution over user IDs 1..1000,
# so frequently drawn users act as frequent raters and rarely drawn ones as rare raters
user_weights = user_frequencies / user_frequencies.sum()

In [11]:
# Empty registry for the (user_id, movie_id) combinations generated so far;
# a set gives fast duplicate detection
unique_pairs: set = set()

In [12]:
# Generate unique (user_id, movie_id) pairs with weighted sampling:
# users are drawn according to user_weights, movies according to popularity_weights.
candidate_user_ids = np.arange(1, 1001)

# A pair can only be drawn when both of its weights are non-zero, so this is the
# largest number of distinct pairs the sampler can ever produce. Without this
# check an oversized num_ratings would make the loop below spin forever.
max_unique_pairs = int(np.count_nonzero(user_weights)) * int(np.count_nonzero(popularity_weights))
if num_ratings > max_unique_pairs:
    raise ValueError(
        f"Cannot generate {num_ratings} unique (user_id, movie_id) pairs: "
        f"only {max_unique_pairs} distinct pairs have non-zero sampling weight"
    )

# Reset all accumulators here so that re-running this cell regenerates the
# sample instead of silently producing empty lists from an already-full set
unique_pairs = set()
user_ids = []
movie_ids_sampled = []
while len(unique_pairs) < num_ratings:
    # Draw exactly as many candidates as are still missing in one vectorised
    # call; duplicates are rejected below and topped up on the next pass
    remaining = num_ratings - len(unique_pairs)
    drawn_users = np.random.choice(candidate_user_ids, size=remaining, p=user_weights)
    drawn_movies = np.random.choice(movie_ids, size=remaining, p=popularity_weights)
    for user_id, movie_id in zip(drawn_users.tolist(), drawn_movies.tolist()):
        if (user_id, movie_id) not in unique_pairs:
            unique_pairs.add((user_id, movie_id))
            user_ids.append(user_id)
            movie_ids_sampled.append(movie_id)

In [13]:
# Draw one rating per generated pair from a normal distribution
mean_rating = 3.0     # centre of the rating distribution
std_dev_rating = 1.5  # spread of the rating distribution
raw_ratings = np.random.normal(loc=mean_rating, scale=std_dev_rating, size=num_ratings)

# Draws that fall outside the 0.0 to 5.0 rating scale are pinned to the nearest bound
ratings = raw_ratings.clip(0.0, 5.0)

In [14]:
# Assemble the ratings table: one row per sampled (user, movie) pair with its rating
rating_columns = {
    'movie_id': movie_ids_sampled,
    'user_id': user_ids,
    'rating': ratings,
}
ratings_df = pd.DataFrame(rating_columns)

In [18]:
# Number of ratings each movie received; computed once on the rating column and
# reused for both the filter and the values instead of grouping the frame twice
ratings_per_movie = ratings_df.groupby('movie_id')['rating'].count()

# Rating counts of the movies that were rated more than 5 times, largest first
sorted(ratings_per_movie[ratings_per_movie > 5], reverse=True)

[70,
 66,
 62,
 58,
 58,
 55,
 55,
 54,
 54,
 54,
 50,
 49,
 49,
 48,
 47,
 47,
 46,
 45,
 44,
 43,
 43,
 42,
 41,
 41,
 41,
 40,
 40,
 40,
 39,
 39,
 39,
 39,
 39,
 38,
 38,
 38,
 38,
 38,
 37,
 37,
 37,
 37,
 36,
 36,
 36,
 36,
 36,
 36,
 35,
 35,
 35,
 35,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 32,
 32,
 32,
 32,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 31,
 30,
 30,
 30,
 30,
 30,
 30,
 30,
 30,
 30,
 29,
 29,
 29,
 29,
 29,
 28,
 28,
 28,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,


In [21]:
# Number of ratings each user gave; computed once on the rating column and
# reused for both the filter and the values instead of grouping the frame twice
ratings_per_user = ratings_df.groupby('user_id')['rating'].count()

# Rating counts of the users who rated more than 50 movies, largest first
sorted(ratings_per_user[ratings_per_user > 50], reverse=True)

[368,
 350,
 347,
 345,
 320,
 317,
 314,
 302,
 300,
 294,
 272,
 270,
 268,
 267,
 263,
 254,
 251,
 241,
 240,
 236,
 235,
 206,
 205,
 204,
 203,
 202,
 202,
 191,
 188,
 185,
 184,
 175,
 174,
 166,
 166,
 163,
 157,
 155,
 152,
 148,
 145,
 139,
 139,
 139,
 138,
 137,
 136,
 136,
 135,
 125,
 117,
 113,
 113,
 107,
 107,
 105,
 102,
 101,
 99,
 98,
 97,
 96,
 96,
 95,
 95,
 90,
 89,
 87,
 86,
 86,
 85,
 83,
 82,
 82,
 81,
 78,
 78,
 77,
 75,
 72,
 69,
 68,
 68,
 67,
 67,
 66,
 66,
 65,
 65,
 63,
 61,
 60,
 58,
 57,
 57,
 56,
 56,
 54,
 54,
 53,
 51,
 51]

In [20]:
# Write the generated ratings to ratings.csv; index=False keeps the row index out of the file
ratings_df.to_csv(path_or_buf='ratings.csv', index=False)