In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import dgl
import torch
import pandas as pd
import sklearn
import numpy as np
import os
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)


Using backend: pytorch


In [2]:
DATASET_PATH = "../data/ml-100k/"
OUTPUT_PATH = "../data/ml-100k_processed/"

In [3]:
item_cols_names = "Id | Title | Release Date | video release date |\
              IMDb URL | unknown | Action | Adventure | Animation |\
              Children's | Comedy | Crime | Documentary | Drama | Fantasy |\
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |\
              Thriller | War | Western"
item_cols_names = [x.strip() for x in item_cols_names.split('|')]

In [4]:
movies = pd.read_csv(DATASET_PATH + "u.item" , sep = '|' , encoding='latin-1' , header = None , names = item_cols_names)
movies.head(5)

Unnamed: 0,Id,Title,Release Date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [5]:
def clean_movies(df):
    df = df.copy()

    # columns were dropped because 1st isn't relevant and 2nd is completely NaN
    df = df.drop(columns=['IMDb URL', 'video release date'])

    # remove movies with unknown films
    df = df[df["Title"] != "unknown"]

    # parse dates in the dataset
    # using year/month only (day shouldn't be relevant much)
    df.loc[:, 'Release Date'] = pd.to_datetime(df['Release Date'], format='%d-%b-%Y')
    df.insert(1, 'Year', df['Release Date'].dt.year.astype(int))
    df.insert(2, 'Month', df['Release Date'].dt.month.astype(int))
    df = df.drop(columns='Release Date')

    films = df.reset_index(drop = True).sort_index()

    return films



def prepare_movies(df):
    from sklearn.preprocessing import StandardScaler, FunctionTransformer
    from sklearn.compose import ColumnTransformer

    df = df.copy()

    # title isn't relevant
    df = df.drop(columns='Title')

    to_scale = ['Year', 'Month']
    rest = [col for col in df.columns if not col in to_scale]

    transformer = ColumnTransformer(
        [
            ("scaler", StandardScaler(), to_scale),
            ("identity", FunctionTransformer(), rest)
        ]
    )

    movies = transformer.fit_transform(df)
    return pd.DataFrame(movies, columns=to_scale + rest), transformer
    
    
cleaned_movies = clean_movies(movies)
prepared_movies , movies_transformer = prepare_movies(cleaned_movies)
pd.Series(cleaned_movies.index).describe()
np_prepared_movies = prepared_movies.to_numpy()
np.save(os.path.join(OUTPUT_PATH , "movies") , np_prepared_movies)

In [6]:
prepared_movies.head(4)

Unnamed: 0,Year,Month,Id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.39,-0.55,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.39,-0.55,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.39,-0.55,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.39,-0.55,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#prepared_films.describe()

In [8]:
user_cols_names = 'Id | Age | Gender | Occupation | Zip'
user_cols_names = [x.strip() for x in user_cols_names.split('|')]

users = pd.read_csv(DATASET_PATH + "u.user" , sep = '|' , encoding='latin-1' , header = None , names = user_cols_names)
users.head(5)

Unnamed: 0,Id,Age,Gender,Occupation,Zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
def clean_users(df):
    df = df.copy()
        
    occupations = pd.get_dummies(df['Occupation'] , prefix_sep = '')
    
    df[occupations.columns] = occupations
    
    df = df.drop(columns = ['Occupation' , 'Zip']).reset_index(drop = True)
    
    df.loc[: , 'Gender'] = df['Gender'].map({'M' : 0 , 'F' : 1}).astype(int)
    
    return df

def prepare_users(df):
    
    from sklearn.preprocessing import StandardScaler,FunctionTransformer
    from sklearn.compose import ColumnTransformer
    
    df = df.copy()
    
    to_scale = ['Age']
    rest = [col for col in df.columns if not col in to_scale]
    
    transformer = ColumnTransformer(
        [
            ("scaler" , StandardScaler() , to_scale),
            ("identity", FunctionTransformer() , rest) 
        ]
    )
    
    df = transformer.fit_transform(df)
    return pd.DataFrame(df , columns = to_scale + rest) , transformer

    
    

In [10]:
cleaned_users = clean_users(users)
#cleaned_users.info()
prepared_users , users_transformer = prepare_users(cleaned_users)
cleaned_users.head(5)
np_prepared_users = prepared_users.to_numpy()
np.save(os.path.join(OUTPUT_PATH , "users") , np_prepared_users)

In [11]:
suffixes = ["a"]

def create_graph(full_df):
    graphdic= {}
    
    groups = full_df.groupby('Rating')
    for rating , df in groups:
        graphdic[("user" , str(rating) + "u" , "movie")] = (torch.tensor(df['uindex'].values) , torch.tensor(df['vindex'].values))
        graphdic[("movie" , str(rating) + "m" , "user")] = (torch.tensor(df['vindex'].values) , torch.tensor(df['uindex'].values))

        
    G = dgl.heterograph(graphdic)
    return G

def get_graph_df(df):
    
    df = df.drop(columns = ['Timestamp'])

    df = df.merge(cleaned_users.reset_index()[['index' , 'Id']] , left_on = 'User' , right_on = ['Id'], validate = 'many_to_one')
    df = df.drop(columns=['Id']).rename(columns = {'index' : 'uindex'})
    
    df = df.merge(cleaned_movies.reset_index()[['index' , 'Id']] , left_on = 'Movie' , right_on = ['Id'], validate = 'many_to_one')
    df = df.drop(columns=['Id']).rename(columns = {'index' : 'vindex'})
    
    print(df.dtypes)
    
    return df
    
for suff in suffixes:
    
    fname = "u{}.base".format(suff)
    fpath = DATASET_PATH + "u{}.base".format(suff)
    
    df = pd.read_csv(fpath , sep = '\t' , header = None , names = ["User" , "Movie" , "Rating" , "Timestamp"])
    df = get_graph_df(df)
    g = create_graph(df)
    
    dgl.save_graphs(os.path.join(OUTPUT_PATH , "u{}_train.graph").format(suff),[g])
    
    
    
    print(g)
    
print(df['uindex'].unique().shape)
print(df['vindex'].unique().shape)

df.describe()

g.nodes('user')


User      int64
Movie     int64
Rating    int64
uindex    int64
vindex    int64
dtype: object
Graph(num_nodes={'movie': 1681, 'user': 943},
      num_edges={('movie', '1m', 'user'): 5567, ('movie', '2m', 'user'): 10375, ('movie', '3m', 'user'): 24718, ('movie', '4m', 'user'): 30855, ('movie', '5m', 'user'): 19047, ('user', '1u', 'movie'): 5567, ('user', '2u', 'movie'): 10375, ('user', '3u', 'movie'): 24718, ('user', '4u', 'movie'): 30855, ('user', '5u', 'movie'): 19047},
      metagraph=[('movie', 'user', '1m'), ('movie', 'user', '2m'), ('movie', 'user', '3m'), ('movie', 'user', '4m'), ('movie', 'user', '5m'), ('user', 'movie', '1u'), ('user', 'movie', '2u'), ('user', 'movie', '3u'), ('user', 'movie', '4u'), ('user', 'movie', '5u')])
(943,)
(1679,)


tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 1

In [14]:
print(g)

Graph(num_nodes={'movie': 1681, 'user': 943},
      num_edges={('movie', '1m', 'user'): 5567, ('movie', '2m', 'user'): 10375, ('movie', '3m', 'user'): 24718, ('movie', '4m', 'user'): 30855, ('movie', '5m', 'user'): 19047, ('user', '1u', 'movie'): 5567, ('user', '2u', 'movie'): 10375, ('user', '3u', 'movie'): 24718, ('user', '4u', 'movie'): 30855, ('user', '5u', 'movie'): 19047},
      metagraph=[('movie', 'user', '1m'), ('movie', 'user', '2m'), ('movie', 'user', '3m'), ('movie', 'user', '4m'), ('movie', 'user', '5m'), ('user', 'movie', '1u'), ('user', 'movie', '2u'), ('user', 'movie', '3u'), ('user', 'movie', '4u'), ('user', 'movie', '5u')])
