In [1]:
import numpy as np
import pandas as pd
import json
import sklearn as skl
from sklearn.model_selection import GroupShuffleSplit

In [3]:
movies = pd.read_csv("./ml-latest-small/movies.csv")
# movies = pd.read_csv("../ml-25m/movies.csv")

# Count how many movies list each genre; dict insertion order (first
# appearance in the file) is what fixes each genre's index below.
genre = {}
for genre_list in movies['genres'].str.split('|'):
    for name in genre_list:
        genre[name] = genre.get(name, 0) + 1

# Two-way lookup between a genre name and its integer position.
idxtogenre = list(genre)
genretoidx = {name: position for position, name in enumerate(idxtogenre)}
    
def toVec(genres, genre_index=None):
    """Encode a '|'-separated genre string as a per-genre count vector.

    Parameters
    ----------
    genres : str
        Genre names joined with '|', e.g. "Action|Comedy".
    genre_index : dict, optional
        Mapping of genre name -> position in the vector. Defaults to the
        module-level ``genretoidx``.

    Returns
    -------
    list of int
        One slot per known genre; each slot holds how many times that genre
        occurs in ``genres``.

    Raises
    ------
    KeyError
        If ``genres`` contains a name missing from the index.
    """
    if genre_index is None:
        genre_index = genretoidx
    # Size the vector from the index instead of a hard-coded 20 so it always
    # matches the number of genres actually found in the data.
    vec = [0] * len(genre_index)
    for g in genres.split('|'):
        vec[genre_index[g]] += 1
    return vec
    
# Map every movieId to its genre count vector, then persist the mapping.
genredict = {}
for movie_id, genre_string in movies[['movieId', 'genres']].to_numpy():
    genredict[movie_id] = toVec(genre_string)

with open('data.json', 'w') as fp:
    json.dump(genredict, fp)

In [4]:
def fx(rating, movieId):
    """Scale the genre vector of ``movieId`` by ``rating``.

    Looks the movie up in the module-level ``genredict`` and returns the
    element-wise product as an object-dtype NumPy array.
    """
    genre_vector = np.array(genredict[movieId])
    weighted = rating * genre_vector
    return weighted.astype(object)

def gx(genrating):
    """Squash per-genre scores through a logistic curve onto a 0-10 scale.

    Each value ``x`` becomes ``rint(10 / (1 + exp(-0.2 * x)))``; the result
    is returned as an object-dtype NumPy array of rounded floats.
    """
    scores = genrating.astype(float)
    squashed = 10 / (1 + np.exp(-scores * 0.2))
    return np.rint(squashed).astype(object)


In [5]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
# ratings = pd.read_csv("../ml-25m/ratings.csv")

# Standardise each user's ratings to a z-score (subtract the user's mean,
# divide by the user's standard deviation). The result is NaN/inf when a
# user's std is NaN or 0.
ratings['rating'] = ratings.groupby('userId')['rating'].transform(lambda x: (x - x.mean()) / x.std())

# Per-rating genre contribution: normalised rating times the movie's genre vector.
ratings['genrating'] = np.vectorize(fx)(ratings['rating'], ratings['movieId'])
# The same contribution squashed onto the 0-10 scale by gx.
ratings["sigmoid"] = np.vectorize(gx)(ratings['genrating'])

# sort_values returns a new frame; the original call discarded it, so the
# sort never took effect. A stable sort keeps each user's rows in file order.
ratings = ratings.sort_values(by=['userId'], kind='stable')
ratings.head()

In [5]:
# Element-wise sum of each user's per-rating genre vectors.
scores = ratings.groupby('userId')['genrating'].sum()

# One row per user: number of ratings plus the summed genre scores.
users = (
    ratings.groupby(['userId'])
    .size()
    .to_frame(name="rated")
    .assign(scores=scores)
)

# Squash the summed scores onto the 0-10 scale used as the user's profile.
users["sigmoid"] = np.vectorize(gx)(users['scores'])
users.head()

Unnamed: 0_level_0,rated,scores,sigmoid
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,232,"[2.3220588165700233, 11.71804623211082, 9.5145...","[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ..."
2,29,"[0.8132579362354637, 0.0, 0.0, 0.4494320173932...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, ..."
3,39,"[1.5330834557146171, -3.7039296290065127, -4.6...","[6.0, 3.0, 3.0, 2.0, 6.0, 3.0, 1.0, 8.0, 4.0, ..."
4,216,"[2.1982045396984713, 2.0291118827985866, 1.860...","[6.0, 6.0, 6.0, 3.0, 6.0, 2.0, 2.0, 3.0, 7.0, ..."
5,44,"[-3.1207416126885827, 4.222179828931615, 4.313...","[3.0, 7.0, 7.0, 4.0, 7.0, 2.0, 7.0, 3.0, 6.0, ..."


In [6]:
def _random_listed_genre(movie_id):
    """Return the index of one genre flagged 1 for ``movie_id``, chosen at random."""
    listed = np.nonzero(np.array(genredict[movie_id]) == 1)[0]
    return np.random.choice(listed)


# Positive examples: every (user, movie) rating, joined with the user's profile.
cols = ["userId"]
train_mod = ratings[['userId', 'movieId']].merge(users, left_on='userId', right_on='userId')
train_mod["genre"] = train_mod['movieId'].apply(_random_listed_genre)
train_mod["reward"] = 1
train_mod = train_mod.drop(columns=["movieId", "rated", "scores"])
train_mod.head()

Unnamed: 0,userId,sigmoid,genre,reward
0,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",0,1
1,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",5,1
2,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",7,1
3,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",11,1
4,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",9,1


In [7]:
# Negative examples: same users/profiles as the positives, but with a genre
# drawn uniformly at random and reward 0.
train_false = train_mod.copy()
train_false['reward'] = 0
# randint's lower bound is inclusive and its upper bound exclusive. Starting at
# 1 meant genre index 0 was never sampled as a negative; draw from the full
# range of genre indices instead.
train_false['genre'] = np.random.randint(0, len(genre), train_false.shape[0])
train_false.head()

Unnamed: 0,userId,sigmoid,genre,reward
0,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",18,0
1,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",11,0
2,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",15,0
3,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",11,0
4,1,"[6.0, 9.0, 9.0, 1.0, 3.0, 4.0, 9.0, 3.0, 5.0, ...",2,0


In [8]:
# Stack positives on top of negatives and move genre/reward to the front.
train_final = pd.concat([train_mod, train_false], ignore_index=True)
train_final = train_final.iloc[:, [2, 3, 0, 1]]

# Expand each user's sigmoid profile vector into one column per genre.
genre_columns = list(genre.keys())
sigmoid_matrix = pd.DataFrame(train_final.sigmoid.tolist(), index=train_final.index)
train_final[genre_columns] = sigmoid_matrix

# Drop the columns that were only needed for the expansion, then shuffle rows.
train_final = train_final.drop(columns=["userId", "sigmoid"]).sample(frac=1)
train_final.head()

Unnamed: 0,genre,reward,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
97291,3,1,8.0,2.0,5.0,7.0,1.0,8.0,2.0,6.0,...,2.0,6.0,6.0,3.0,3.0,5.0,8.0,5.0,5.0,5.0
111264,7,0,10.0,9.0,0.0,0.0,9.0,10.0,1.0,10.0,...,0.0,1.0,2.0,8.0,1.0,10.0,10.0,7.0,6.0,5.0
10734,6,1,10.0,9.0,0.0,0.0,9.0,10.0,1.0,10.0,...,0.0,1.0,2.0,8.0,1.0,10.0,10.0,7.0,6.0,5.0
142503,15,0,8.0,8.0,8.0,0.0,8.0,4.0,10.0,2.0,...,3.0,4.0,8.0,6.0,6.0,3.0,6.0,6.0,5.0,5.0
9144,10,1,0.0,1.0,1.0,0.0,0.0,0.0,10.0,0.0,...,4.0,8.0,0.0,6.0,2.0,5.0,4.0,6.0,5.0,6.0


In [12]:
# Discard every row that still contains a missing value
# (presumably so the integer-formatted export that follows never sees NaN).
train_final = train_final.dropna()

In [13]:
# Write the training matrix as whitespace-separated integers, one row per line.
np.savetxt("./traindataset.txt", train_final.to_numpy(), fmt='%d')

In [11]:
# Build the held-out dataset with the same steps used for the training set.
# NOTE(review): `test` is not defined by any cell shown in this notebook (the
# saved run failed with NameError); a train/test split cell must run first.

# Positive examples: each rated movie contributes one of its own genres.
test_mod = test[['userId', 'movieId']]
test_mod = test_mod.merge(users, left_on='userId', right_on='userId')
test_mod["genre"] = test_mod['movieId'].apply(
    lambda x: np.random.choice(np.nonzero(np.array(genredict[x]) == 1)[0])
)
test_mod["reward"] = 1
test_mod.drop(columns=["movieId", "rated", "scores"], inplace=True)

# Negative examples: same rows with a uniformly random genre and reward 0.
test_false = test_mod.copy()
test_false['reward'] = 0
# Lower bound 0 (inclusive) so that genre index 0 can be drawn as well;
# the previous randint(1, 20) silently excluded it.
test_false['genre'] = np.random.randint(0, len(genre), test_false.shape[0])

# Stack, reorder to (genre, reward, ...), expand the profile into genre columns.
test_final = pd.concat([test_mod, test_false], ignore_index=True)
test_final = test_final.iloc[:,[2,3,0,1]]
test_final[list(genre.keys())] = pd.DataFrame(test_final.sigmoid.tolist(), index= test_final.index)
test_final.drop(columns=["userId", "sigmoid"], inplace=True)
test_final = test_final.sample(frac=1)
test_final.dropna(inplace=True)

# Save the *test* frame; the original wrote train_final to this file.
np.savetxt("./smalltestdataset.txt", test_final.values, fmt='%d')
# Kept as the last expression so the preview is actually displayed.
test_final.head()

NameError: name 'test' is not defined

In [None]:
# Scratch inspection of the split. `train` and `test` are not defined in any
# cell shown in this notebook, so this cell needs a split cell to have run.
# The next two lookups fetch how many rows user 414 has in each frame, but
# their values are discarded: a notebook only displays the last expression.
ratings['userId'].value_counts()[414]
train['userId'].value_counts()[414]
# test['userId'].value_counts()[414]
# Displayed result: a 1-tuple holding the number of distinct users in `test`.
test['userId'].unique().shape

(608,)