In [1]:
import numpy as np
import pandas as pd
import json
import sklearn as skl
from sklearn.model_selection import GroupShuffleSplit

In [2]:
movies = pd.read_csv("./ml-latest-small/movies.csv")
genre = {}
for l in movies['genres'].str.split('|'):
    for g in l:
        if g not in genre:
            genre[g] = 1
        else:
            genre[g] += 1
            
genretoidx = {}
idxtogenre = []
for i, g in enumerate(genre):
    # print(g, i)
    genretoidx[g] = i
    idxtogenre.append(g)
    
def toVec(genres):
    vec = [0] * 20
    # print(genres)
    for g in genres.split('|'):
        # print(g)
        vec[genretoidx[g]] += 1
    return vec
    
genredict = {m[0] : toVec(m[1]) for m in movies[['movieId','genres']].to_numpy()}
with open('data.json', 'w') as fp:
    json.dump(genredict, fp)

In [3]:
def fx(rating, movieId):
    # print(genredict[movieId], rating)
    return (rating * np.array(genredict[movieId])).astype(object)

def gx(genrating):
    return np.rint((10 / (1 + np.exp(-genrating.astype(float)*0.2)))).astype(object)


In [4]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
ratings['rating'] = ratings.groupby('userId')['rating'].transform(lambda x: (x - x.mean()) / x.std())
ratings['genrating'] = np.vectorize(fx)(ratings['rating'], ratings['movieId'])
ratings["sigmoid"] = np.vectorize(gx)(ratings['genrating'])
ratings.sort_values(by=['userId'])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,genrating,sigmoid
0,1,1,-0.457947,964982703,"[-0.45794663435835353, -0.45794663435835353, -...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
1,1,3,-0.457947,964981247,"[-0.0, -0.0, -0.0, -0.45794663435835353, -0.0,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
2,1,6,-0.457947,964982224,"[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0....","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
3,1,47,0.791978,964983815,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
4,1,50,0.791978,964982931,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.791...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


In [9]:
movieScores = ratings.groupby('movieId')['rating'].sum().to_frame()
movieScores['movieId'] = movieScores.index
movieScores.index.name = None
movieScores.head()

Unnamed: 0,rating,movieId
1,73.611387,1
2,-6.467026,2
3,-12.407988,3
4,-8.31645,4
5,-30.711621,5


In [10]:
movieScores["genrating"] = np.vectorize(fx)(movieScores['rating'], movieScores['movieId'])
movieScores.head()

Unnamed: 0,rating,movieId,genrating
1,73.611387,1,"[73.6113868894724, 73.6113868894724, 73.611386..."
2,-6.467026,2,"[-6.467026031766981, -0.0, -6.467026031766981,..."
3,-12.407988,3,"[-0.0, -0.0, -0.0, -12.407988309376245, -0.0, ..."
4,-8.31645,4,"[-0.0, -0.0, -0.0, -8.316450122643886, -0.0, -..."
5,-30.711621,5,"[-0.0, -0.0, -0.0, -30.71162135413032, -0.0, -..."


In [21]:
df3 = pd.DataFrame(movieScores.genrating.tolist(), index= movieScores.index, columns = list(genre.keys()))
df3=(df3-df3.mean())/df3.std()
df3 = df3.apply(gx)
df3["bestGenre"] = df3.idxmax(axis=1)
df3.head()

TypeError: arg must be a list, tuple, 1-d array, or Series

In [16]:
movieScores[list(genre.keys())] = df3
movieScores.head()

Unnamed: 0,rating,movieId,genrating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
1,73.611387,1,"[73.6113868894724, 73.6113868894724, 73.611386...",9.0,10.0,10.0,9.0,10.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
2,-6.467026,2,"[-6.467026031766981, -0.0, -6.467026031766981,...",4.0,5.0,4.0,5.0,4.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
3,-12.407988,3,"[-0.0, -0.0, -0.0, -12.407988309376245, -0.0, ...",5.0,5.0,5.0,4.0,5.0,3.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
4,-8.31645,4,"[-0.0, -0.0, -0.0, -8.316450122643886, -0.0, -...",5.0,5.0,5.0,4.0,5.0,4.0,4.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
5,-30.711621,5,"[-0.0, -0.0, -0.0, -30.71162135413032, -0.0, -...",5.0,5.0,5.0,3.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
