In [2]:
import numpy as np
import pandas as pd
import json
import sklearn as skl
from sklearn.model_selection import GroupShuffleSplit

In [3]:
movies = pd.read_csv("./ml-latest-small/movies.csv")

# Tally how many movies carry each genre label. Keys are inserted in order of
# first appearance, and that order is what the genre index below is built from.
genre = {}
for genre_list in movies['genres'].str.split('|'):
    for name in genre_list:
        genre[name] = genre.get(name, 0) + 1
            
# Two-way lookup between a genre label and its vector position, numbered in
# the iteration order of `genre`.
idxtogenre = list(genre)
genretoidx = {name: position for position, name in enumerate(idxtogenre)}
    
def toVec(genres, genre_index=None):
    """Encode a '|'-separated genre string as a per-genre count vector.

    Parameters
    ----------
    genres : str
        Genre labels joined by '|', e.g. ``"Comedy|Drama"``.
    genre_index : dict, optional
        Mapping from genre label to vector position (expected to cover
        positions ``0 .. len(genre_index) - 1``). Defaults to the
        module-level ``genretoidx``.

    Returns
    -------
    list of int
        One slot per entry in the mapping; each slot holds how many times
        that genre occurs in ``genres``.

    Raises
    ------
    KeyError
        If a label in ``genres`` is missing from the mapping.
    """
    if genre_index is None:
        genre_index = genretoidx
    # Size the vector from the mapping instead of a hard-coded 20, so the
    # encoding stays correct when the dataset has a different genre count.
    vec = [0] * len(genre_index)
    for g in genres.split('|'):
        vec[genre_index[g]] += 1
    return vec
    
# Map every movieId to its genre count vector, then persist the mapping as JSON.
genredict = {
    movie_id: toVec(genre_str)
    for movie_id, genre_str in movies[['movieId', 'genres']].to_numpy()
}
with open('data.json', 'w') as out_file:
    json.dump(genredict, out_file)

In [4]:
def fx(rating, movieId):
    """Scale a movie's genre vector by a rating.

    Looks up ``genredict[movieId]``, multiplies every entry by ``rating``
    and returns the product as an object-dtype NumPy array.
    """
    genre_vector = np.array(genredict[movieId])
    weighted = rating * genre_vector
    # NOTE(review): the object cast presumably lets np.vectorize store each
    # vector as a single element -- confirm before changing the dtype.
    return weighted.astype(object)

def gx(genrating):
    """Squash scores onto a rounded 0-10 scale with a logistic curve.

    Parameters
    ----------
    genrating : numpy.ndarray
        Array of scores; it is cast to float before use.

    Returns
    -------
    numpy.ndarray
        Object-dtype array holding ``rint(10 / (1 + exp(-0.2 * score)))``
        for every input element (a score of 0 maps to 5.0).
    """
    negated = -genrating.astype(float) * 0.2
    squashed = 10 / (1 + np.exp(negated))
    return np.rint(squashed).astype(object)


In [5]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")

# Z-score each user's ratings against that user's own mean and standard
# deviation. A user whose std is NaN or 0 (a single rating, or identical
# ratings) ends up with NaN here.
ratings['rating'] = ratings.groupby('userId')['rating'].transform(lambda x: (x - x.mean()) / x.std())

# Per-rating genre vector scaled by the normalised rating, plus its squashed
# 0-10 counterpart.
ratings['genrating'] = np.vectorize(fx)(ratings['rating'], ratings['movieId'])
ratings["sigmoid"] = np.vectorize(gx)(ratings['genrating'])

# sort_values returns a new frame; the result was previously discarded, so
# the sort never took effect. A stable sort keeps the within-user row order.
ratings = ratings.sort_values(by=['userId'], kind='mergesort')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genrating,sigmoid
0,1,1,-0.457947,964982703,"[-0.45794663435835353, -0.45794663435835353, -...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
1,1,3,-0.457947,964981247,"[-0.0, -0.0, -0.0, -0.45794663435835353, -0.0,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,1,6,-0.457947,964982224,"[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0....","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
3,1,47,0.791978,964983815,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
4,1,50,0.791978,964982931,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.791...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


In [6]:
# Sum of the (per-user normalised) ratings each movie received.
movieScores = ratings['rating'].groupby(ratings['movieId']).sum()
movieScores.head()

movieId
1    73.611387
2    -6.467026
3   -12.407988
4    -8.316450
5   -30.711621
Name: rating, dtype: float64

In [7]:
# Vector positions of the four genres used to select the "fair" subset.
fairIdx = [genretoidx[name] for name in ('War', 'Musical', 'Western', 'Film-Noir')]
fairGenreIdx = []
print(fairIdx)
def hx(movieId):
    """Tell whether a movie carries at least one of the ``fairIdx`` genres.

    Picks the ``fairIdx`` positions out of ``genredict[movieId]`` and returns
    True when any of them is nonzero.
    """
    fair_counts = np.array(genredict[movieId])[fairIdx]
    return fair_counts.any()

[13, 14, 17, 18]


In [8]:
# Keep only ratings of movies that carry at least one of the fairIdx genres.
# A boolean mask avoids adding a temporary column and then dropping it in
# place on a filtered slice (which can raise SettingWithCopyWarning), and
# Series.map also copes with an empty frame, unlike np.vectorize.
fair_mask = ratings['movieId'].map(hx).astype(bool)
ratings = ratings.loc[fair_mask].copy()
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genrating,sigmoid
7,1,110,-0.457947,964982176,"[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.457946...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
8,1,151,0.791978,964984041,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.7919782970668002, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
9,1,157,0.791978,964984100,"[0.0, 0.0, 0.0, 0.7919782970668002, 0.0, 0.0, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
10,1,163,0.791978,964983650,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.7919782970668002, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
20,1,356,-0.457947,964980962,"[-0.0, -0.0, -0.0, -0.45794663435835353, -0.0,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


In [9]:
# Per-user aggregates over the current `ratings` frame:
#   rated   - number of rating rows for the user
#   scores  - element-wise sum of the user's rating-weighted genre vectors
#   sigmoid - `scores` squashed to the rounded 0-10 scale by gx
per_user = ratings.groupby('userId')
scores = per_user['genrating'].sum()
users = per_user.size().to_frame(name="rated")
users["scores"] = scores
users["sigmoid"] = np.vectorize(gx)(users['scores'])
users.head()

Unnamed: 0_level_0,rated,scores,sigmoid
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ..."
2,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1284091478266...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
3,6,"[-0.9259824072516282, -0.0, -0.0, -1.851964814...","[5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 3.0, 4.0, 5.0, ..."
4,37,"[3.4663994664475855, 2.0291118827985866, 3.466...","[7.0, 6.0, 7.0, 6.0, 7.0, 4.0, 7.0, 5.0, 5.0, ..."
5,9,"[1.10143821624303, 3.8550337568506046, 3.85503...","[6.0, 7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 5.0, 5.0, ..."


In [10]:
# Positive examples: one row per (user, movie) rating, joined with the
# per-user aggregates and labelled with one of the movie's fairIdx genres.
GENRE_SEED = 0
genre_rng = np.random.default_rng(GENRE_SEED)  # seeded so re-runs give the same dataset

train_mod = ratings[['userId', 'movieId']]
cols = ["userId"]  # not used in this cell; retained in case other cells reference it
train_mod = train_mod.merge(users, left_on='userId', right_on='userId')

# A movie may belong to several fairIdx genres; pick one of them at random.
# Membership is "count is nonzero" (the same rule hx applies) rather than
# "count == 1", so a repeated genre label cannot leave the candidate set empty.
train_mod["genre"] = train_mod['movieId'].apply(
    lambda x: fairIdx[genre_rng.choice(np.nonzero(np.array(genredict[x])[fairIdx])[0])]
)
train_mod["reward"] = 1
train_mod.head()

Unnamed: 0,userId,movieId,rated,scores,sigmoid,genre,reward
0,1,110,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",13,1
1,1,151,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",13,1
2,1,157,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",13,1
3,1,163,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",17,1
4,1,356,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",13,1


In [11]:
# Negative examples: the same rows as train_mod with reward 0 and a genre
# drawn uniformly at random from fairIdx.
NEGATIVE_SEED = 1
negative_rng = np.random.default_rng(NEGATIVE_SEED)  # seeded so re-runs give the same dataset

train_false = train_mod.copy()
train_false['reward'] = 0
# NOTE(review): the draw ignores the movie's actual genres, so a negative row
# can carry the same genre as its positive twin in train_mod -- confirm that
# this label overlap is intended.
train_false['genre'] = negative_rng.choice(fairIdx, size=train_false.shape[0])
train_false.head()

Unnamed: 0,userId,movieId,rated,scores,sigmoid,genre,reward
0,1,110,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",13,0
1,1,151,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",17,0
2,1,157,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",17,0
3,1,163,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",17,0
4,1,356,50,"[0.4202333821170796, 5.630049798876235, 7.2140...","[5.0, 8.0, 8.0, 7.0, 7.0, 5.0, 8.0, 7.0, 5.0, ...",17,0


In [12]:
# Stack positives and negatives, keep the label columns plus the per-user
# sigmoid vector, and expand that vector into one column per genre.
SHUFFLE_SEED = 0

train_final = pd.concat([train_mod, train_false], ignore_index=True)
# Select by name rather than by position so the cell does not depend on the
# column order of train_mod; .copy() so the column assignment below works on
# an independent frame instead of a slice.
train_final = train_final[['genre', 'reward', 'sigmoid']].copy()
train_final[list(genre.keys())] = pd.DataFrame(train_final['sigmoid'].tolist(), index=train_final.index)
train_final = train_final.drop(columns=["sigmoid"])
# Shuffle the rows; seeded so re-runs produce the same order.
train_final = train_final.sample(frac=1, random_state=SHUFFLE_SEED)
train_final.head()

Unnamed: 0,genre,reward,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
2200,17,1,7.0,8.0,8.0,7.0,8.0,7.0,9.0,6.0,...,5.0,5.0,5.0,8.0,9.0,5.0,6.0,7.0,5.0,5.0
19896,13,0,4.0,4.0,4.0,5.0,4.0,4.0,7.0,5.0,...,5.0,5.0,5.0,9.0,5.0,8.0,4.0,7.0,7.0,5.0
7198,18,1,5.0,4.0,5.0,4.0,6.0,5.0,5.0,5.0,...,5.0,5.0,5.0,6.0,4.0,5.0,5.0,5.0,7.0,5.0
15570,14,0,4.0,5.0,5.0,4.0,5.0,5.0,7.0,6.0,...,5.0,7.0,5.0,7.0,4.0,5.0,5.0,5.0,8.0,5.0
20655,17,0,6.0,3.0,1.0,4.0,3.0,3.0,7.0,8.0,...,2.0,5.0,5.0,7.0,2.0,5.0,6.0,6.0,6.0,5.0


In [14]:
# Discard rows containing NaN, then write the table as whitespace-separated
# integers ('%d' format) to a text file.
train_final.dropna(inplace=True)
np.savetxt(fname="./smallfairtraindataset.txt", X=train_final.to_numpy(), fmt='%d')