In [1]:
import numpy as np
import pandas as pd
import json
import sklearn as skl
from sklearn.model_selection import GroupShuffleSplit

In [3]:
movies = pd.read_csv("./ml-latest-small/movies.csv")
genre = {}
for l in movies['genres'].str.split('|'):
    for g in l:
        if g not in genre:
            genre[g] = 1
        else:
            genre[g] += 1
            
genretoidx = {}
idxtogenre = []
for i, g in enumerate(genre):
    # print(g, i)
    genretoidx[g] = i
    idxtogenre.append(g)
    
def toVec(genres):
    vec = [0] * 20
    # print(genres)
    for g in genres.split('|'):
        # print(g)
        vec[genretoidx[g]] += 1
    return vec
    
genredict = {m[0] : toVec(m[1]) for m in movies[['movieId','genres']].to_numpy()}
with open('data.json', 'w') as fp:
    json.dump(genredict, fp)

In [2]:
def fx(rating, movieId):
    # print(genredict[movieId], rating)
    return (rating * np.array(genredict[movieId])).astype(object)

def gx(genrating):
    return np.rint((10 / (1 + np.exp(-genrating.astype(float)*0.2)))).astype(object)


In [98]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
train, test = skl.model_selection.train_test_split(ratings, test_size=0.2)
train['rating'] = train.groupby('userId')['rating'].transform(lambda x: (x - x.mean()) / x.std())
train['genrating'] = np.vectorize(fx)(train['rating'], train['movieId'])
train["sigmoid"] = np.vectorize(gx)(train['genrating'])
train.sort_values(by=['userId'])
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,genrating,sigmoid
70584,450,3173,-0.031442,974705331,"[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.031442...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
8623,59,10,-1.39899,953609378,"[-1.3989898003317212, -0.0, -0.0, -0.0, -0.0, ...","[4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, ..."
8048,57,95,-0.358619,969754664,"[-0.35861947061399607, -0.0, -0.0, -0.0, -0.0,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
70639,451,1059,0.301816,854090644,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.30181636371957526,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
74811,474,7493,0.732881,1083589553,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7328810949992...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


In [101]:
scores = train.groupby('userId')['genrating'].sum()
users = (train.groupby(['userId']).size()).to_frame(name="rated")
users["scores"] = scores
users["sigmoid"] = np.vectorize(gx)(users['scores'])
users.head()

Unnamed: 0_level_0,rated,scores,sigmoid
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,186,"[0.16723733960625142, 8.696341659523318, 7.065...","[5.0, 9.0, 8.0, 2.0, 4.0, 4.0, 9.0, 2.0, 4.0, ..."
2,27,"[-0.5035640516566127, 0.0, 0.0, 0.801124627635...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ..."
3,37,"[1.9533216295594686, -2.9203125352819765, -3.8...","[6.0, 4.0, 3.0, 3.0, 6.0, 3.0, 1.0, 8.0, 4.0, ..."
4,169,"[0.8598899218054157, 1.8683472122997313, 1.823...","[5.0, 6.0, 6.0, 3.0, 5.0, 2.0, 2.0, 4.0, 7.0, ..."
5,33,"[-3.1512357524463424, 3.858054986639916, 3.269...","[3.0, 7.0, 7.0, 5.0, 7.0, 3.0, 6.0, 3.0, 5.0, ..."


In [102]:
train_mod = train[['userId', 'movieId']]
# train_mod.merge(users, on="userId")
cols = ["userId"]
train_mod = train_mod.merge(users, left_on='userId', right_on='userId')
train_mod["genre"] = train_mod['movieId'].apply(lambda x: np.random.choice(np.nonzero(np.array(genredict[x]) == 1)[0])
)
train_mod["reward"] = 1
train_mod.drop(columns=["movieId", "rated", "scores"], inplace=True)
train_mod.head()

Unnamed: 0,userId,sigmoid,genre,reward
0,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",6,1
1,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",8,1
2,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",3,1
3,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",6,1
4,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",12,1


In [103]:
train_false = train_mod.copy()
train_false['reward'] = 0
train_false['genre'] = np.random.randint(1, 20, train_false.shape[0])
train_false.head()

Unnamed: 0,userId,sigmoid,genre,reward
0,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",18,0
1,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",6,0
2,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",1,0
3,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",19,0
4,450,"[6.0, 5.0, 5.0, 4.0, 5.0, 5.0, 7.0, 7.0, 6.0, ...",11,0


In [104]:
train_final = pd.concat([train_mod, train_false], ignore_index=True)
train_final = train_final.iloc[:,[2,3,0,1]]
train_final[list(genre.keys())] = pd.DataFrame(train_final.sigmoid.tolist(), index= train_final.index)
train_final.drop(columns=["userId", "sigmoid"], inplace=True)
train_final = train_final.sample(frac=1)
train_final.head()

Unnamed: 0,genre,reward,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
161261,9,0,5.0,4.0,4.0,3.0,4.0,4.0,6.0,6.0,...,5.0,5.0,6.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0
110330,3,0,2.0,6.0,3.0,8.0,5.0,7.0,9.0,3.0,...,5.0,4.0,3.0,5.0,5.0,5.0,3.0,4.0,5.0,5.0
109636,8,0,6.0,9.0,2.0,0.0,3.0,8.0,10.0,1.0,...,1.0,3.0,0.0,10.0,9.0,5.0,7.0,9.0,8.0,5.0
46888,4,1,5.0,6.0,6.0,4.0,6.0,3.0,6.0,4.0,...,5.0,6.0,4.0,6.0,5.0,5.0,6.0,5.0,5.0,5.0
891,8,1,5.0,8.0,0.0,0.0,0.0,10.0,10.0,0.0,...,0.0,8.0,8.0,5.0,6.0,10.0,9.0,2.0,10.0,5.0


In [105]:
train_final.dropna(inplace=True)

In [106]:
np.savetxt("./smalltraindataset.txt", train_final.values, fmt='%d')

In [113]:
test_mod = test[['userId', 'movieId']]
test_mod = test_mod.merge(users, left_on='userId', right_on='userId')
test_mod["genre"] = test_mod['movieId'].apply(lambda x: np.random.choice(np.nonzero(np.array(genredict[x]) == 1)[0])
)
test_mod["reward"] = 1
test_mod.drop(columns=["movieId", "rated", "scores"], inplace=True)
test_false = test_mod.copy()
test_false['reward'] = 0
test_false['genre'] = np.random.randint(1, 20, test_false.shape[0])
test_final = pd.concat([test_mod, test_false], ignore_index=True)
test_final = test_final.iloc[:,[2,3,0,1]]
test_final[list(genre.keys())] = pd.DataFrame(test_final.sigmoid.tolist(), index= test_final.index)
test_final.drop(columns=["userId", "sigmoid"], inplace=True)
test_final = test_final.sample(frac=1)
test_final.dropna(inplace=True)
test_final.head()
np.savetxt("./smalltestdataset.txt", train_final.values, fmt='%d')

In [112]:
ratings['userId'].value_counts()[414]
train['userId'].value_counts()[414]
# test['userId'].value_counts()[414]
test['userId'].unique().shape

(608,)