In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the ratings table and drop its "timestamp" column.
ratings_df = pd.read_csv("dataBig/ratings.csv").drop(columns="timestamp")

In [3]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
ratings_df = (
    ratings_df
    .sample(frac=1, random_state=420)
    .reset_index(drop=True)
)
ratings_df

Unnamed: 0,userId,movieId,rating
0,254775,6870,3.5
1,123364,2571,4.5
2,311036,165699,2.5
3,14961,509,3.0
4,73223,1200,4.0
...,...,...,...
33832157,18606,457,4.0
33832158,25653,96020,5.0
33832159,56721,1682,5.0
33832160,181122,592,3.0


In [4]:
# Distinct user count (printed) followed by distinct movie count (cell output).
# dropna=False keeps the count identical to len(.unique()).
print(ratings_df["userId"].nunique(dropna=False))
ratings_df["movieId"].nunique(dropna=False)

330975


83239

In [5]:
# Hold out 10% of the rows (fixed seed) as the test set.
test_ratings_df = ratings_df.sample(n=int(len(ratings_df) * 0.1), random_state=420)

# Test inputs are the (userId, movieId) pairs; the targets are their ratings.
test_pairs = test_ratings_df[["userId", "movieId"]]
actual_ratings = test_ratings_df["rating"]

# Everything that was not sampled stays available for training.
remaining_df = ratings_df.drop(index=test_ratings_df.index)

# The full frame is no longer needed; release it.
del ratings_df

In [6]:
# Remove rarely rated movies: keep only movies with at least 5 ratings
# in the training rows.
movie_counts = remaining_df["movieId"].value_counts()
frequent_movies = movie_counts.index[movie_counts >= 5]
remaining_df = remaining_df.loc[remaining_df["movieId"].isin(frequent_movies)]
del frequent_movies

In [7]:
# Keep only training rows rated 4 or higher.
# NOTE(review): later cells average `rating` from this filtered frame, so every
# such average is >= 4, while the held-out ratings are unfiltered — confirm
# this is intended.
remaining_df = remaining_df.loc[remaining_df["rating"].ge(4)]

In [8]:
# remaining_df = remaining_df.iloc[:1000]
# Distinct user and movie ids, in order of first appearance in remaining_df.
users = pd.unique(remaining_df["userId"])
movies = pd.unique(remaining_df["movieId"])

In [9]:
from sklearn.cluster import MiniBatchKMeans, KMeans
# import warnings
# # warnings.filterwarnings('ignore')

In [None]:
# Users are processed in this many chunks so that only one chunk's dense
# user x movie matrix is built at a time.
number_of_batches = 49

kmeans = MiniBatchKMeans(random_state=420, n_clusters=11, n_init="auto")

for iteration, users_batch in enumerate(np.array_split(users, number_of_batches)):
    print(f"Iteration {iteration}")

    # Training rows belonging to the users of this chunk.
    ratings = remaining_df.loc[remaining_df["userId"].isin(users_batch)]

    # Boolean user x movie matrix: True where the pivoted rating is present
    # and non-zero. The column set is aligned to `movies`, so every chunk has
    # the same feature layout.
    X = (
        ratings.pivot_table(index=["userId"], columns="movieId")
        .fillna(False)
        .astype(bool)
        .droplevel(0, axis=1)
        .reindex(columns=movies, fill_value=False)
    )

    kmeans.partial_fit(X)
    

Iteration 0
Iteration 1
Iteration 2


In [None]:
# One row per training user. 99 is a placeholder outside the 0-10 label range
# of the 11-cluster model; it is overwritten below for every user that
# appears in a batch.
user_clusters = pd.DataFrame({"userId": users, "cluster": 99}).set_index("userId")

for iteration, users_batch in enumerate(np.array_split(users, number_of_batches)):
    print(f"Iteration {iteration}")

    # Rebuild the same boolean user x movie matrix that was used for fitting.
    ratings = remaining_df.loc[remaining_df["userId"].isin(users_batch)]
    X = pd.pivot_table(ratings, index="userId", columns="movieId").fillna(False).astype(bool)
    X.columns = X.columns.get_level_values(1)
    X = X.reindex(columns=movies, fill_value=False)

    batch_clusters = kmeans.predict(X)

    # pivot_table sorts its index, so the rows of X (and the predicted labels)
    # are in ascending userId order, NOT in the order of `users_batch`.
    # Writing labels by X.index keeps each label attached to the right user,
    # and does it in one vectorised assignment instead of a per-user loop.
    user_clusters.loc[X.index, "cluster"] = batch_clusters

sns.countplot(data=user_clusters, x="cluster")

In [None]:
# Bar chart of the number of users assigned to each cluster.
sns.countplot(x="cluster", data=user_clusters)

In [None]:
from sklearn.metrics import mean_squared_error

# Counters for the fallback paths taken by predict().
users_not_in_training = 0
movies_not_watched_in_cluster = 0
movie_not_in_training = 0

# Last-resort prediction: mean rating over all training rows.
average_ranking = remaining_df["rating"].mean()

# Lookup tables built once from the training rows, so predict() does constant
# time dictionary lookups instead of scanning the whole ratings frame for
# every (user, movie) pair. Re-run this cell if `user_clusters` or
# `remaining_df` change, since the tables are snapshots.
_user_to_cluster = user_clusters["cluster"].to_dict()
_movie_mean = remaining_df.groupby("movieId")["rating"].mean().to_dict()
_rating_cluster = remaining_df["userId"].map(user_clusters["cluster"]).rename("cluster")
_cluster_movie_mean = (
    remaining_df.groupby([_rating_cluster, "movieId"])["rating"].mean().to_dict()
)


def predict(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`.

    Resolution order:
      1. mean training rating of the movie among users in the same cluster;
      2. mean training rating of the movie over all users
         (increments `movies_not_watched_in_cluster`);
      3. `average_ranking` when the movie has no training rating
         (increments `movie_not_in_training`).
    A user without a cluster assignment gets `average_ranking` immediately
    (increments `users_not_in_training`).

    Parameters:
        user_id: id looked up in `user_clusters`.
        movie_id: id looked up in the training ratings.

    Returns:
        The predicted rating as a float.
    """
    global users_not_in_training, movies_not_watched_in_cluster, movie_not_in_training

    cluster = _user_to_cluster.get(user_id)
    if cluster is None:
        users_not_in_training += 1
        return average_ranking

    rating = _cluster_movie_mean.get((int(cluster), movie_id))
    if rating is not None:
        return rating

    rating = _movie_mean.get(movie_id)
    if rating is None:
        movie_not_in_training += 1
        return average_ranking

    movies_not_watched_in_cluster += 1
    return rating
    
# Score every held-out (userId, movieId) pair.
y_predicted = np.array(
    [
        predict(user_id, movie_id)
        for user_id, movie_id in test_pairs[["userId", "movieId"]].itertuples(index=False, name=None)
    ]
)
total = len(y_predicted)

# Fraction of test pairs that took each fallback path inside predict():
# unknown user, movie unseen in the user's cluster, movie unseen anywhere.
print(users_not_in_training / total)
print(movies_not_watched_in_cluster / total)
print(movie_not_in_training / total)

# Mean squared error of the predictions against the held-out ratings
# (arguments given as y_true, y_pred; the value is symmetric in the two).
mean_squared_error(actual_ratings, y_predicted)


In [None]:
# 330019 * 38705 cells at one byte each, expressed in GiB (1024**3 bytes).
# NOTE(review): the two factors look like user and movie counts — confirm.
330019 * 38705 / 1024 ** 3