In [1]:
import pandas as pd
import time

start_time = time.time()


pd.set_option('display.max_colwidth', None)

movies_df = pd.read_csv('./the-movies-dataset/movies_metadata.csv',usecols=("genres","id" ,"title","tagline", "overview","production_companies"),
                          dtype={'genres':"string","id":"string","title": "string", "tagline": "string","overview":"string",
                                    "production_companies" :"string"})[["genres","id" ,"title","tagline", "overview","production_companies"]]
movies_df.dropna(inplace = True)
movies_lst = [row for row in movies_df.values.tolist() if not (row[0][len(row[0])  - 2:] == "[]" or row[5][len(row[5]) - 2:] == "[]")]
movies_df = pd.DataFrame(movies_lst, columns = ("genres","id" ,"title","tagline", "overview","production_companies"), dtype = str)



ratings_df = pd.read_csv('./the-movies-dataset/ratings.csv', usecols = ("userId", "movieId", "rating"),
                       dtype={"userId": "string","movieId": "string","rating": "string"})[["userId", "movieId", "rating"]]
ratings_df.rename(columns={"movieId": "id"}, inplace = True)
ratings_df.dropna(inplace = True)


# Question: What if the removal of duplicate movie ids per user was processed here instead of the cell below???
# Answer: The duplicate removal function can be ran here,...
# but the complete_list in the cell below can also be iterated over with relative complexity in order to remove duplicates.
# The iteration in the next cell also populates the gap list...
# which is critical to be ran directly before the function that determines bounds for users rated movies.
# So, omitting the no duplicate function in this cell and making it run in the next cell avoids redundant iteration.


# Question: What if the test and train ratings bounds was enforced here instead of the cell below???
# Answer: The merge functions below needs to be executed before determining test and train users, because merge will remove rows and ratings from users...
# before enforcing the users to be in a certain bounds for the number of their ratings. 
# The current timing of this function will ensure that the final users are within the set train or test bounds.


keywords_df = pd.read_csv('./the-movies-dataset/keywords.csv', usecols = ("id", "keywords"), dtype={"id": "string","keywords":"string"})[["id", "keywords"]]
keywords_df.dropna(inplace = True)
keywords_lst = [row for row in keywords_df.values.tolist() if not (row[1][len(row[1])  - 2:] == "[]")]
keywords_df = pd.DataFrame(keywords_lst, columns = ("id", "keywords"), dtype = str)


credits_df = pd.read_csv("./the-movies-dataset/credits.csv", usecols = ("cast", "id"), dtype={"cast": "string", "id": "string"})[["cast", "id"]]
credits_df.dropna(inplace = True)
credits_lst = [row for row in credits_df.values.tolist() if (not row[0][len(row[0])  - 2:] == "[]")]
credits_df = pd.DataFrame(credits_lst, columns = ("cast", "id"), dtype = str)


# Default merge is inner: This only keeps movies that have the id existing in both dataframes.
complete_df =  pd.merge(movies_df, ratings_df, on ="id")
complete_df =  pd.merge(complete_df,keywords_df, on ="id")
complete_df  = pd.merge(complete_df,credits_df, on ="id")


complete_df.sort_values(by = 'userId', inplace = True)


# Master dataframe: For each (user id, movie id) row combination there is the combined movie data from movies_df, ratings_df, keywords_df, and credits_df for the movie id in question.
# The columns are reordered.
complete_df  = complete_df.loc[:,['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview" ]]

# For testing:
print("Minutes taken:", (time.time()-start_time)/60)
print(complete_df.head())



# Tested on personal machine:
# Old run with dataframe iteration (old code): 1 minute and 5.7 seconds
# New run with list conversion before iteration (current code): 37.1 seconds

Minutes taken: 0.6288645029067993
        userId    id rating               title  \
6566765      1  1246    5.0        Rocky Balboa   
6880303      1  2959    4.0      License to Wed   
2083077      1  2762    4.5  Young and Innocent   
1492304      1  1968    4.0       Fools Rush In   
2638962      1   147    4.5       The 400 Blows   

                                                                                                genres  \
6566765                                                                  [{'id': 18, 'name': 'Drama'}]   
6880303                                                                 [{'id': 35, 'name': 'Comedy'}]   
2083077                                     [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]   
1492304  [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]   
2638962                                                                  [{'id': 18, 'name': 'Drama'}]   

                      

In [2]:
import ast
import random
import time

start_time = time.time()

SEED_INT = 42
# Seed for consistent results across runtimes:
random.seed(SEED_INT)


def populate_names(item):
    """Extract names from the syntax of certain data entries:"""
    string  = item[1:-1]
    jsons = string.split("}, ")   
    names = ""
    index = 0
    for item in jsons:
        if(index == len(jsons)-1):
            temp_dict = ast.literal_eval(item)
            names+=str(temp_dict["name"])
        else:
            temp_dict = ast.literal_eval(item+"}")
            names+=str(str(temp_dict["name"])+" ")
        index += 1
    return names


def provide_data(row):
    """Extract data from row of complete_list:"""
    movie_data = []
    movie_data.append(int(row[0]))
    movie_data.append(int(row[1]))
    movie_data.append(float(row[2]))
    movie_data.append(row[3])  

    movie_data.append(populate_names(row[4]))
    movie_data.append(populate_names(row[5]))
    movie_data.append(populate_names(row[6]))
    movie_data.append(populate_names(row[7]))

    movie_data.append(str(row[8]))
    movie_data.append(str(row[9]))
    return movie_data
    


# The list of rows with users id, the users rating for the movie, and raw data for the movie:
# Note: It is sorted by user_id.
complete_list = complete_df.values.tolist()

print("Complete number of users:", len(list(complete_df["userId"].unique()))) # 260788

# The complete list of user rows without ratings of the same movie more than once for a given user:
complete_list_no_dups = []

# Distinquish the user the row belongs to:
last_id = complete_list[0][0]

# The set of movies that a user has rated:
# It is used to omit later ratings of a movie that the user has already rated.
movie_set = set()

# The number of rows of movie data a single user takes up for each user:
gaps = []

# Appended to gaps when all of a users rows of movie data have been counted:
gap_len = 0


# Populates gaps and complete_list_no_dups by omitting movies that already have a rating in respect to each user:
# Note: This code is faster than using dataframe methods.
# Example: Filter data by user and then remove duplicate movie ids for each user.
# This avoids slow dataframe iteration, but the filter method is also slow.
for row in complete_list:
    if last_id != row[0]:
        movie_set= set()
        complete_list_no_dups.append(row)
        movie_set.add(row[1])
        gaps.append(gap_len)
        gap_len = 1
    else:
        if row[1] not in movie_set:
            complete_list_no_dups.append(row)
            gap_len+=1
            movie_set.add(row[1])
    last_id = row[0]

# Add the last gap_len:
gaps.append(gap_len)



full_index = 0 
bounds = [] 

for user_index in range(len(gaps)):
    bounds.append([full_index, full_index+gaps[user_index]])
    full_index+=gaps[user_index]    
 


#LOOK: rundown of process
#LOOK: these are the types of user categories
#users that are there only to predict the svd for train and test users
#train users
#test users

#test and train users should have the same range of ratings
#svd users should have a different rating range

#there are 2 features to train the final model...
#against the target ratings of the train users
#feature 1: svd prediction from train users
#feature 2: average rating for the train users

#there are 2 features to test the final model...
#against the target ratings of the test users
#feature 1: svd prediction from test users
#feature 2: average rating for the test users


#These set the rating requirements for test and train users.
    

SVD_USER_RATING_LB = 20
SVD_USER_RATING_UB = 30
USER_RATING_LB = 5
USER_RATING_UB = 10




random.shuffle(bounds)
# no_svd_users = 1000
# train_users = 800
# test_users = 800
no_svd_users = 10000
train_users = 10000
test_users = 10000


last_index = -1
bounds_svd_users = []
bounds_train_users = []
bounds_test_users = []


index = 0
for item in bounds:
    if item[1]-item[0] >=SVD_USER_RATING_LB and item[1]-item[0] <=SVD_USER_RATING_UB:
        bounds_svd_users.append(item)
        if len(bounds_svd_users) == no_svd_users:
            last_index = index
            print("met")
            break
    index+=1



index+=1
for item in bounds[last_index:]:
    if item[1]-item[0] >=USER_RATING_LB and item[1]-item[0] <=USER_RATING_UB:
        bounds_train_users.append(item)
        if len(bounds_train_users) == train_users:
            last_index = index
            print("met")
            break
    index+=1

index+=1
for item in bounds[last_index:]:
    if item[1]-item[0] >=USER_RATING_LB and item[1]-item[0] <=USER_RATING_UB:
        bounds_test_users.append(item)
        if len(bounds_test_users) == test_users:
            print("met")
            break



# Transformed data of the selected train users and test users (in that order):
sampled_data = []


for bound in bounds_svd_users:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        sampled_data.append(movie_data)


for bound in bounds_train_users:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        sampled_data.append(movie_data)



for bound in bounds_test_users:
    for movie in complete_list_no_dups[bound[0]:bound[1]]:
        movie_data = provide_data(movie)
        sampled_data.append(movie_data)



print("Minutes taken:", (time.time()-start_time)/60)




Complete number of users: 260788
met
met
met
Minutes taken: 6.324570655822754


In [3]:
import csv
import os

current_directory = os.getcwd()
final_directory = os.path.join(current_directory, 'constructed_data')
if not os.path.exists(final_directory):
   os.makedirs(final_directory)

with open("constructed_data/constructed_data_3.csv", "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['userId','id','rating',"title", "genres","production_companies","keywords", "cast", "tagline", "overview"])
    writer.writerows(sampled_data)

In [1]:
import csv

data_list =[]

with open("constructed_data/constructed_data_3.csv", 'r', encoding="utf-8") as f:
    csv_reader = csv.reader(f)
    data_list = list(csv_reader)

data_list = data_list[1:]


In [2]:
# nof_svd_users = 1000
# nof_train_users = 800
nof_svd_users = 10000
nof_train_users = 10000

user_to_data_svd = []
user_to_data_train= []
user_to_data_test = []

user_id = data_list[0][0]
ratings = []
user_index = 0



for row in data_list:
    if (row[0]!=user_id):
        if(user_index<nof_svd_users and user_index>=0):
            user_to_data_svd.append(ratings)
        elif(user_index<nof_svd_users+nof_train_users and user_index>=nof_svd_users):
            user_to_data_train.append(ratings)
        else:
            user_to_data_test.append(ratings)         
        user_id = row[0]
        ratings = [row]
        user_index+=1
    else:
        ratings.append(row)



user_to_data_test.append(ratings)

In [12]:
import random
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from numba import njit
import copy
from skopt import Optimizer
import pandas as pd

from dask.distributed import Client, LocalCluster
import dask.array as da

cluster = LocalCluster()
client = Client(cluster)

#test stock model to compare:
from surprise import SVD,Dataset,Reader


#LOOK: this cell has been developed to test a configuration


SEED_INT = 56
random.seed(SEED_INT)
np.random.seed(SEED_INT)
da.random.RandomState(SEED_INT)
da.random.default_rng(SEED_INT)
da.random.seed(SEED_INT)


def funct(block):

    SEED_INT = 56
    random.seed(SEED_INT)
    np.random.seed(SEED_INT)
    da.random.RandomState(SEED_INT)
    da.random.default_rng(SEED_INT)
    da.random.seed(SEED_INT)

    total_sum = 0 

    for row in block:
        user_to_data_svd_temp, user_to_data_train_temp, nof_latent_features, epochs, rt, lr = row


        old_to_new_svd  = dict()
        last_index_svd = 0
        svd_cnt = 0

        for user in user_to_data_svd_temp:
            for row in user: 
                if(row[1] in old_to_new_svd.keys()):
                    row[1] = old_to_new_svd[row[1]]
                else:
                    old_to_new_svd[row[1]] = last_index_svd
                    row[1] = last_index_svd
                    last_index_svd+=1      
                row[0] = svd_cnt
            svd_cnt+=1


        old_to_new_train = copy.deepcopy(old_to_new_svd)
        last_index_train = last_index_svd
        train_cnt = svd_cnt

        for user in user_to_data_train_temp:
            for row in user: 
                if(row[1] in old_to_new_train.keys()):
                    row[1] = old_to_new_train[row[1]]
                else:
                    old_to_new_train[row[1]] = last_index_train
                    row[1] = last_index_train
                    last_index_train+=1      
                row[0] = train_cnt
            train_cnt+=1


        target_rating_train = []
        train_list = []


        movies_order_svd = set()
        overall_average_svd = 0 
        cnt_svd = 0


        for user in user_to_data_svd_temp:
            for movie in user:
                movies_order_svd.add(movie[1])
                train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
                overall_average_svd+=float(movie[2])
                cnt_svd += 1



        movies_order_train = copy.deepcopy(movies_order_svd)
        overall_average_train = overall_average_svd 
        cnt_train = cnt_svd
        train_rating_to_predict = []

        for user in user_to_data_train_temp:
            rand_num  = random.randint(0, len(user)-1)
            index = 0
            for movie in user:
                movies_order_train.add(movie[1])
                if(index == rand_num):
                    train_rating_to_predict.append([int(movie[0]), int(movie[1])])
                    target_rating_train.append(float(movie[2]))
                else:
                    overall_average_train+=float(movie[2])
                    cnt_train += 1
                    train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
                index+=1


        overall_average_train = overall_average_train/cnt_train


        random.shuffle(train_list)


        @njit
        def epoch(list, b1, b2, p, q, overall_average, lr, rt):
            for row in list:
                u = int(row[0])
                i = int(row[1])
                r = row[2]

                pred = overall_average+b1[u]+b2[i]+np.dot(p[u],q[i])
                error = r-pred
                b1[u] += lr*(error- rt*b1[u])
                b2[i] += lr*(error- rt*b2[i])
                temp = lr*(error*q[i] -rt*p[u])
                q[i] += lr*(error*p[u] -rt*q[i])
                p[u] += temp





        def svd_iterative(list, n, epochs, rt, lr, overall_average, nof_users, nof_movies):
            q = np.random.normal(0, .1, (nof_movies, n))
            p = np.random.normal(0, .1, (nof_users, n))

            b1 = np.zeros(nof_users)
            b2 = np.zeros(nof_movies)

            np_array = np.array(list)

            for _ in range(epochs):
                epoch(np_array, b1, b2, p, q, overall_average, lr, rt)

            return b1, b2, p, q


        b1, b2, p, q = svd_iterative(train_list, nof_latent_features, epochs, rt, lr,
                                    overall_average_train, len(user_to_data_svd_temp)+len(user_to_data_train_temp), len(movies_order_train))

        feature_3_train = [overall_average_train + b1[pair[0]]+b2[pair[1]]
                                    +np.dot(p[pair[0]],q[pair[1]]) for pair in train_rating_to_predict]
    
        total_sum+=mean_squared_error(target_rating_train, feature_3_train, squared = False)

    return (np.array([[total_sum]], dtype="float32"))





mse_sum =0 
runs = 80



nof_svd_users, nof_train_users, nof_latent_features, epochs, rt, lr = (355, 123, 196, 204, 0.03407177995884917, 0.03668743198104899)


#LOOK: this makes this cell slightly inconsistent with the best call in the cell below
user_to_data_svd_copy = copy.deepcopy(user_to_data_svd)
user_to_data_train_copy = copy.deepcopy(user_to_data_train)


# random.shuffle(user_to_data_svd_copy)
# random.shuffle(user_to_data_train_copy)


# user_to_data_svd_list = [user_to_data_svd_copy[i*nof_svd_users : (i+1)*nof_svd_users] for i in range(runs)]
# user_to_data_train_list = [user_to_data_train_copy[i*nof_train_users : (i+1)*nof_train_users] for i in range(runs)]



parameters_list = []

for _ in range(runs):
    parameters_list.append([copy.deepcopy(random.sample(user_to_data_svd_copy, nof_svd_users)),
                            copy.deepcopy(random.sample(user_to_data_train_copy, nof_train_users)),
                            nof_latent_features, epochs, rt, lr])



test = np.array(parameters_list, dtype="object")
dask_array = da.from_array(test, chunks=(10,6))
results = dask_array.map_blocks(funct, chunks = (1,1), dtype="float32").compute()


for row in results:
    mse_sum+= row[0]


print("fixed parameters rmse score:",mse_sum/runs)


client.close()
cluster.close()


#LOOK: it is possible that random number generation is not needed
#After all, the true goal is to make results consistent no matter the seed used


# fixed parameters rmse score: 1.053252637386322
# fixed parameters rmse score: 1.0423293828964233
# inconsistent

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


fixed parameters rmse score: 1.0423293828964233


In [6]:
import random
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from numba import njit
import copy
from skopt import Optimizer
import pandas as pd
import time

#Dask for parralel computing
from dask.distributed import Client, LocalCluster
import dask.array as da

cluster = LocalCluster()
client = Client(cluster)



start = time.time()


SEED_INT = 15
random.seed(SEED_INT)
np.random.seed(SEED_INT)



def funct(block):

    #LOOK: need to perform operations on the block size which is ten (need a loop)
    #user_to_data_svd_temp, user_to_data_train_temp become lists
    total_sum = 0 

    for row in block:
        user_to_data_svd_temp, user_to_data_train_temp, nof_latent_features, epochs, rt, lr = row


        old_to_new_svd  = dict()
        last_index_svd = 0
        svd_cnt = 0

        for user in user_to_data_svd_temp:
            for row in user: 
                if(row[1] in old_to_new_svd.keys()):
                    row[1] = old_to_new_svd[row[1]]
                else:
                    old_to_new_svd[row[1]] = last_index_svd
                    row[1] = last_index_svd
                    last_index_svd+=1      
                row[0] = svd_cnt
            svd_cnt+=1


        old_to_new_train = copy.deepcopy(old_to_new_svd)
        last_index_train = last_index_svd
        train_cnt = svd_cnt

        for user in user_to_data_train_temp:
            for row in user: 
                if(row[1] in old_to_new_train.keys()):
                    row[1] = old_to_new_train[row[1]]
                else:
                    old_to_new_train[row[1]] = last_index_train
                    row[1] = last_index_train
                    last_index_train+=1      
                row[0] = train_cnt
            train_cnt+=1

        # for ___ in range(5):    
        #     print(user_to_data_svd_temp[0][___])
        # for ___ in range(5):  
        #     print(user_to_data_train_temp[0][___])


        target_rating_train = []
        train_list = []


        movies_order_svd = set()
        overall_average_svd = 0 
        cnt_svd = 0


        for user in user_to_data_svd_temp:
            for movie in user:
                movies_order_svd.add(movie[1])
                train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
                overall_average_svd+=float(movie[2])
                cnt_svd += 1



        movies_order_train = copy.deepcopy(movies_order_svd)
        overall_average_train = overall_average_svd 
        cnt_train = cnt_svd
        train_rating_to_predict = []

        for user in user_to_data_train_temp:
            #passed
            rand_num  = random.randint(0, len(user)-1)
            index = 0
            for movie in user:
                movies_order_train.add(movie[1])
                if(index == rand_num):
                    train_rating_to_predict.append([int(movie[0]), int(movie[1])])
                    target_rating_train.append(float(movie[2]))
                else:
                    overall_average_train+=float(movie[2])
                    cnt_train += 1
                    train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
                index+=1


        #failed
        #note: the user is consistent but the movie isn't
                
        # for ___ in range(5):  
        #     print(train_rating_to_predict[___])

        # for ___ in range(5):
        #     print(train_list[___])


        overall_average_train = overall_average_train/cnt_train


        #passed
        random.shuffle(train_list)



        @njit
        def epoch(list, b1, b2, p, q, overall_average, lr, rt):
            for row in list:
                #conversions needed because numpy array converts to decimal
                u = int(row[0])
                i = int(row[1])
                r = row[2]

                pred = overall_average+b1[u]+b2[i]+np.dot(p[u],q[i])
                error = r-pred
                b1[u] += lr*(error- rt*b1[u])
                b2[i] += lr*(error- rt*b2[i])
                temp = lr*(error*q[i] -rt*p[u])
                q[i] += lr*(error*p[u] -rt*q[i])
                p[u] += temp





        def svd_iterative(list, n, epochs, rt, lr, overall_average, nof_users, nof_movies):
            #passed
            q = np.random.normal(0, .1, (nof_movies, n))
            p = np.random.normal(0, .1, (nof_users, n))

            b1 = np.zeros(nof_users)
            b2 = np.zeros(nof_movies)

            np_array = np.array(list)

            for _ in range(epochs):
                epoch(np_array, b1, b2, p, q, overall_average, lr, rt)

            return b1, b2, p, q


        #LOOK: potential problem: is len(user_to_data_svd_temp)+len(user_to_data_train_temp) the right legnht of all users
        #Answer: yes becasue there no users with only test rating (they at least have 5 ratings total).

        b1, b2, p, q = svd_iterative(train_list, nof_latent_features, epochs, rt, lr,
                                    overall_average_train, len(user_to_data_svd_temp)+len(user_to_data_train_temp), len(movies_order_train))

        #failed:
        # print(b1[0:5])
        # print(b2[0:5])
        # print(q[0][0:5])
        # print(p[0][0:5])
        # print(train_rating_to_predict[0:5])

        #passed: 
        # print(target_rating_train[0:5])

        feature_3_train = [overall_average_train + b1[pair[0]]+b2[pair[1]]
                                    +np.dot(p[pair[0]],q[pair[1]]) for pair in train_rating_to_predict]
        
        #failed:
        # print(feature_3_train[0:5])

        total_sum+=mean_squared_error(target_rating_train, feature_3_train, squared = False)

    return (np.array([[total_sum]], dtype="float32"))





#external parameters that are not part of the optimization process:
#user_to_data_svd
#user_to_data_train


#very large objective function:
def objective_function(vars):
    
    nof_svd_users, nof_train_users,nof_latent_features, epochs, rt, lr = vars

    # rounding here is a mistake:
    # nof_svd_users = round(nof_svd_users)
    # nof_train_users = round(nof_train_users)
    # nof_latent_features = round(nof_latent_features)
    # epochs = round(epochs)



    #LOOK: How does this integrate into the final model combining multiple features???
    # If I were to include the other features this function would take much longer
    # current plan: onyl use this fucntion to find teh right parameters 
    # then test with these parameter for all the features again using....
    # random selected users from the overall pool


    average_score = 0
    runs = 80

    #LOOK: At this level parallelzation can be applied since each
    #set of variables is temporary besides user_to_data_svd and user_to_data_train

    #LOOK: instead of random sampling
    #the 10000 train and svd users can be seperated into 20 parts
    #so no user gets selected twice
    #20 runs should be enough
    #may need to shuffle user_to_data_svd and user_to_data_train....
    #probably not since the bounds are shuffled in cell 2


    user_to_data_svd_copy = copy.deepcopy(user_to_data_svd)
    user_to_data_train_copy = copy.deepcopy(user_to_data_train)


    # random.shuffle(user_to_data_svd_copy)
    # random.shuffle(user_to_data_train_copy)

    # user_to_data_svd_list = [user_to_data_svd_copy[i*nof_svd_users : (i+1)*nof_svd_users]for i in range(runs)]
    # user_to_data_train_list = [user_to_data_train_copy[i*nof_train_users : (i+1)*nof_train_users] for i in range(runs)]


    #LOOK: problem: if the runs is more than 20 then it is possible for some sub ararys in user_to_data_svd_list to not have enough users
    #Instead need to try randomly sampling from user_to_data_svd_copy as there will always be enough 


    #With dask: After using deepcopy, sample from the copies 80 times
    #build a dask array with length of 80 (for each sample) (not sure what to do with chunks)
    #chunks is used to parallelize the task
    #call map_blocks off the array and compute
    #extract the results and average them
    #need to convert the iteration step into a function


    #parameters needed for the fucntion:
    #(svd sample, train sample), nof_latent_features, epochs, rt, lr


    parameters_list = []

    for _ in range(runs):
        parameters_list.append([random.sample(user_to_data_svd_copy, nof_svd_users),
                                random.sample(user_to_data_train_copy, nof_train_users),
                                nof_latent_features, epochs, rt, lr])



    test = np.array(parameters_list, dtype="object")
    dask_array = da.from_array(test, chunks=(10,6))
    results = dask_array.map_blocks(funct, chunks = (1,1), dtype="float32").compute()


    sum = 0 
    for item in results:
        sum += item[0]

    return sum/runs



    # for __ in range(runs):


    #     # user_to_data_svd_temp = user_to_data_svd_list[__]
    #     # user_to_data_train_temp = user_to_data_train_list[__]


    #     user_to_data_svd_temp  = random.sample(user_to_data_svd_copy, nof_svd_users)
    #     user_to_data_train_temp  = random.sample(user_to_data_train_copy, nof_train_users)    


    #     old_to_new_svd  = dict()
    #     last_index_svd = 0
    #     svd_cnt = 0

    #     for user in user_to_data_svd_temp:
    #         for row in user: 
    #             if(row[1] in old_to_new_svd.keys()):
    #                 row[1] = old_to_new_svd[row[1]]
    #             else:
    #                 old_to_new_svd[row[1]] = last_index_svd
    #                 row[1] = last_index_svd
    #                 last_index_svd+=1      
    #             row[0] = svd_cnt
    #         svd_cnt+=1


    #     old_to_new_train = copy.deepcopy(old_to_new_svd)
    #     last_index_train = last_index_svd
    #     train_cnt = svd_cnt

    #     for user in user_to_data_train_temp:
    #         for row in user: 
    #             if(row[1] in old_to_new_train.keys()):
    #                 row[1] = old_to_new_train[row[1]]
    #             else:
    #                 old_to_new_train[row[1]] = last_index_train
    #                 row[1] = last_index_train
    #                 last_index_train+=1      
    #             row[0] = train_cnt
    #         train_cnt+=1

    #     # for ___ in range(5):    
    #     #     print(user_to_data_svd_temp[0][___])
    #     # for ___ in range(5):  
    #     #     print(user_to_data_train_temp[0][___])


    #     target_rating_train = []
    #     train_list = []


    #     movies_order_svd = set()
    #     overall_average_svd = 0 
    #     cnt_svd = 0


    #     for user in user_to_data_svd_temp:
    #         for movie in user:
    #             movies_order_svd.add(movie[1])
    #             train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
    #             overall_average_svd+=float(movie[2])
    #             cnt_svd += 1



    #     movies_order_train = copy.deepcopy(movies_order_svd)
    #     overall_average_train = overall_average_svd 
    #     cnt_train = cnt_svd
    #     train_rating_to_predict = []

    #     for user in user_to_data_train_temp:
    #         #passed
    #         rand_num  = random.randint(0, len(user)-1)
    #         index = 0
    #         for movie in user:
    #             movies_order_train.add(movie[1])
    #             if(index == rand_num):
    #                 train_rating_to_predict.append([int(movie[0]), int(movie[1])])
    #                 target_rating_train.append(float(movie[2]))
    #             else:
    #                 overall_average_train+=float(movie[2])
    #                 cnt_train += 1
    #                 train_list.append([int(movie[0]), int(movie[1]), float(movie[2])])
    #             index+=1


    #     #failed
    #     #note: the user is consistent but the movie isn't
                
    #     # for ___ in range(5):  
    #     #     print(train_rating_to_predict[___])

    #     # for ___ in range(5):
    #     #     print(train_list[___])


    #     overall_average_train = overall_average_train/cnt_train


    #     #passed
    #     random.shuffle(train_list)



    #     @njit
    #     def epoch(list, b1, b2, p, q, overall_average, lr, rt):
    #         for row in list:
    #             #conversions needed because numpy array converts to decimal
    #             u = int(row[0])
    #             i = int(row[1])
    #             r = row[2]

    #             pred = overall_average+b1[u]+b2[i]+np.dot(p[u],q[i])
    #             error = r-pred
    #             b1[u] += lr*(error- rt*b1[u])
    #             b2[i] += lr*(error- rt*b2[i])
    #             temp = lr*(error*q[i] -rt*p[u])
    #             q[i] += lr*(error*p[u] -rt*q[i])
    #             p[u] += temp




    
    #     def svd_iterative(list, n, epochs, rt, lr, overall_average, nof_users, nof_movies):
    #         #passed
    #         q = np.random.normal(0, .1, (nof_movies, n))
    #         p = np.random.normal(0, .1, (nof_users, n))

    #         b1 = np.zeros(nof_users)
    #         b2 = np.zeros(nof_movies)

    #         np_array = np.array(list)

    #         for _ in range(epochs):
    #             epoch(np_array, b1, b2, p, q, overall_average, lr, rt)

    #         return b1, b2, p, q


    #     #LOOK: potential problem: is len(user_to_data_svd_temp)+len(user_to_data_train_temp) the right legnht of all users
    #     #Answer: yes becasue there no users with only test rating (they at least have 5 ratings total).

    #     b1, b2, p, q = svd_iterative(train_list, nof_latent_features, epochs, rt, lr,
    #                                 overall_average_train, len(user_to_data_svd_temp)+len(user_to_data_train_temp), len(movies_order_train))

    #     #failed:
    #     # print(b1[0:5])
    #     # print(b2[0:5])
    #     # print(q[0][0:5])
    #     # print(p[0][0:5])
    #     # print(train_rating_to_predict[0:5])

    #     #passed: 
    #     # print(target_rating_train[0:5])

    #     feature_3_train = [overall_average_train + b1[pair[0]]+b2[pair[1]]
    #                                 +np.dot(p[pair[0]],q[pair[1]]) for pair in train_rating_to_predict]
        
    #     #failed:
    #     # print(feature_3_train[0:5])

    #     average_score += mean_squared_error(target_rating_train, feature_3_train, squared = False)







def bayesian_optimization(bounds, iterations):

    #presets
    #LOOK: what is the number of jobs???
    #try modifying n_initial_points and n_jobs....
    #LOOK closely at the optimize function

    optimizer = Optimizer(
        dimensions=bounds,
        base_estimator="gp",
        # try setting higher values here...
        # 10 already tested
        n_initial_points = 10,
        acq_func="EI",
        acq_optimizer="sampling",
        random_state=SEED_INT
    )


    #reproducable: 
    # X_init =[np.random.uniform(item[0], item[1]) for item in bounds]
    X_init = [(item[0] + item[1])/2.0 for item in bounds]


    for i in range(4):
        X_init[i] = round(X_init[i])

    #Not reproducable:
    Y_init = objective_function(X_init)


    optimizer.tell(X_init, Y_init)

    for _ in range(iterations):
        x_next = optimizer.ask()
        for i in range(4):
            x_next[i] = round(x_next[i])
        y_next = objective_function(x_next)
        optimizer.tell(x_next, y_next)

    #LOOK: does the optimizer shoot for the min or the max?
    #gpt: the goal is to minimize the obejective function...

    sorted_pairs = sorted(zip(optimizer.Xi, optimizer.yi), key = lambda pair : pair[1], reverse=False)


    X = sorted_pairs[0][0]
    y = sorted_pairs[0][1]

 
    for i in range(5):
        print(sorted_pairs[i][0])
        print(sorted_pairs[i][1])

    return X, y


#of_svd_users, nof_train_users,
# nof_latent_features, epochs, rt, lr

from skopt import gp_minimize
from skopt.space import Real
from skopt.space import Integer

# bounds = [(200.0, 700.0),(100.0, 600.0),(10.0,200.0),(10.0,300.0),(.01, .05),(.001, .05)]
# nof_svd_users, nof_train_users,nof_latent_features, epochs, rt, lr

mid_points = [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]

mid_points = [(lambda pair : int((pair[0]+pair[1])/2) if((pair[0]+pair[1])/2==int((pair[0]+pair[1])/2)) else (pair[0]+pair[1])/2)(item) for item in mid_points]


bounds = [Integer(300, 500, name = 'nof_svd_users'),Integer(100, 200, name = 'nof_train_users'),
          Integer(100,300, name = 'nof_latent_features'),Integer(100,400, name = 'epochs'),
          Real(.01, .075, name = 'rt'),Real(.001, .05, name = 'lr')]


# LOOK: This is the custom made baysian opt
# X, y = bayesian_optimization(bounds, 5)


#LOOK: is ther another way???
#LOOK: numpy version downgraded to verison 1.23.5 to support usage of np.int in built in functions...
# https://github.com/WongKinYiu/yolov7/issues/1280
#avoids having ot use (np.int = int)


# https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
# https://scikit-optimize.github.io/stable/modules/generated/skopt.gp_minimize.html#skopt.gp_minimize

# This is currently not typical baysian optimization:
# LOOK: try to use baysian optmization as applied in the above link

#LOOK: Why does n_calls need to be equal to or larger than n_initial points
res = gp_minimize(objective_function,                 
                  bounds,      
                  n_calls=10, 
                  n_initial_points = 9,    
                  # n_initial_points = 10, 
                  x0 = mid_points,    
                  random_state= SEED_INT,
                  n_points = 10000,
                  )


print("Solution: x", res.x)
print("Result: y", res.fun)
print("time taken:", (time.time()-start)/60)


client.close()
cluster.close()



#LOOK: need to try starting at the midpoint value for bounds instead of randomly selecting the starting point

#LOOK: need to compare to stock svd suprise algorithm

#LOOK: what is the number of jobs???
#try modifying n_initial_points and n_jobs....


#LOOK: Question: What is the issue with keeping the seeds that perfrom the best to use with the actual model
#just because they perform well with a certain seed doesn mean it will perfrom well with completely new random data
#the point of the model is to generalize to new data, not learn the data it has access to.
#LOOK: Potential solution: Need to test the trained model on more chunks of data before evaluation
#LOOK: In training, the initial model params partly determine by random chance (the seed)
#need to test multple initial conditions with the same hyperparamters that need to be tuned 


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 5
# calls = 20
# runs = 20
# no shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [490, 114, 265, 100, 0.075, 0.003306356733754837]
# Result: y 1.008934000606866
# time taken: 14.760872900485992

# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 5
# calls = 20
# runs = 20
# shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [359, 105, 159, 322, 0.019187838615362548, 0.02148636162678124]
# Result: y 1.0268730779416964
# time taken: 15.03641388018926


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 5
# calls = 20
# runs = 20
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [491, 107, 100, 398, 0.06854948616872057, 0.02732078900650637]
# Result: y 1.0097142989582657
# time taken: 22.572298232714335

#****
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 5
# calls = 20
# runs = 40
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 10
# calls = 20
# runs = 20
# no shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [408, 113, 183, 334, 0.047953588767488946, 0.009948940378798735]
# Result: y 0.998127063254195
# time taken: 15.871690555413563


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 10
# calls = 20
# runs = 20
# shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [419, 144, 171, 276, 0.019715636904085565, 0.009390691312187987]
# Result: y 1.0273296521578534
# time taken: 16.87793436050415

# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 10
# calls = 20
# runs = 20
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [442, 127, 153, 145, 0.05444819790848023, 0.04101348992581268]
# Result: y 1.0108166741898796
# time taken: 18.037524509429932

#****
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 10
# calls = 20
# runs = 40
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]

# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 20
# runs = 20
# no shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [463, 159, 122, 346, 0.07023178357221199, 0.0022226985841246414]
# Result: y 1.013064110433222
# time taken: 17.710401757558188


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 20
# runs = 20
# shuffle
# random init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [410, 198, 184, 350, 0.05588458182689485, 0.03895764265158545]
# Result: y 1.0295537903652323
# time taken: 21.355472441514333


# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 20
# runs = 20
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [387, 183, 259, 380, 0.05996516467807905, 0.013017812396446948]
# Result: y 1.0126191620694736
# time taken: 27.12620999018351

#****
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 20
# runs = 40
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [437, 100, 169, 189, 0.016322398571097588, 0.024221960056937827]
# Result: y 1.0389433334175275
# time taken: 37.2435434738795


#********
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 10
# runs = 80
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [355, 123, 196, 204, 0.03407177995884917, 0.03668743198104899]
# Result: y 1.084261420345655
# time taken: 37.630781781673434

#(2 blocks)
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 10
# runs = 80
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [355, 123, 196, 204, 0.03407177995884917, 0.03668743198104899]
# Result: y 1.0563209533691407
# time taken: 23.191765848795573

#(8 blocks) (inconsistent)
# upperseed = 42
# svd bounds: 20-30
# trainbounds: 5-10
# 10000 svd pool, 10000 train pool
# seed = 15
# calls = 10
# runs = 80
# shuffle
# middle init
# [(300, 500),(100, 200),(100,300),(100,400),(.01, .075),(.001, .05)]
# Solution: x [323, 136, 265, 178, 0.015681081867037996, 0.0481870063690547]
# Result: y 1.0332631111145019
# time taken: 15.885440341631572

# Solution: x [323, 136, 265, 178, 0.015681081867037996, 0.0481870063690547]
# Result: y 1.0419336438179017
# time taken: 16.073787983258566



This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Solution: x [323, 136, 265, 178, 0.015681081867037996, 0.0481870063690547]
Result: y 1.0419336438179017
time taken: 16.073787983258566
