In [1]:
import numpy as np
import pandas as pd
import keras as K
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, merge, Reshape, Dot, Add, Concatenate
from keras.preprocessing import sequence
from keras import optimizers
from keras.utils import np_utils, to_categorical, plot_model
from keras.regularizers import l2
import scipy
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.manifold import TSNE
from collections import Counter
import matplotlib.pyplot as plt
import random
import operator

import gc
import time
import sys

import os

Using TensorFlow backend.


In [2]:
def load_data(ratings_file_name, movies_file_name = None):
    user_dict = {}
    item_dict = {}
    
    with open(ratings_file_name, encoding='ISO-8859-1') as data:
        for line in data:
            line = line.split('::')
            user = line[0]
            item = line[1]
            
            if user in user_dict:
                user_dict[user].append(item)
            else:
                user_dict[user] = [item]
    
    if movies_file_name:
        with open(movies_file_name, encoding='ISO-8859-1') as data:
            for line in data:
                line = line.split('::')
                item_idx = line[0]
                item_name = line[1]
                item_dict[item_idx] = item_name
            
    
    return user_dict, item_dict

In [3]:
def get_genre_data(genres_file_name):
    #Get genre data
    flat_item_list = [item for item_list in filtered_item_lists for item in item_list]
    genres_by_idx = {}
    with open(genres_file_name, encoding='ISO-8859-1') as data:
        for line in data:
            line = line.split('::')
            item_genre = line[2][:-1].split('|')
            item = line[0]
            if item in flat_item_list:
                item_idx = items_to_idx[item]
                genres_by_idx[item_idx] =  item_genre
    return (genres_by_idx)

In [4]:
def count_items(item_lists):
    #Count each item in a given dataset
    #Dataset is assumed to be a list of lists
    item_count = Counter()
    for item_list in item_lists:
            item_count.update(item_list)
    return item_count

In [5]:
def filter_data(item_lists, item_counts, min_item_freq, max_items, min_items):   
    #filter out items that have frequency less than min_item_freq
    filtered_data = []
    for item_list in item_lists:
        item_list = [item for item in item_list if (item_counts[item] >= min_item_freq)]
        
        #Keep only the lists with more than "min_items" items (necessary for further use of skipgrams for Item2Vec model)
        #Filter out users that watched more than 'max_items' movies (not helpful for the model)
        if len(item_list) >= min_items and len(item_list) <= max_items:
            filtered_data.append(item_list)
            
    return filtered_data 

In [6]:
def items_vs_index(data):
    #Map each item to indexes by popularity (counts)
    new_counts = count_items(data)
    counts_sorted = sorted(new_counts.items(), key = operator.itemgetter(1), reverse = True)
    item_as_idx = {}
    idx_as_item = {}
    for j, count in enumerate(counts_sorted):
        item_as_idx[count[0]] = j
        idx_as_item[j] = count[0]
        
    return item_as_idx, idx_as_item

In [7]:
def genre_to_idx(genres_as_df, genre_col_name):
    genres_idx ={}
    for k, g in enumerate(sorted(genres_as_df[genre_col_name].unique())):
        genres_idx[g] = k
    return genres_idx

In [8]:
def train_test(data, test_size):
    trainset = []
    testset = []
    
    random.shuffle(data)
    num_test_items = int(len(data)*test_size)
    testset = data[:num_test_items]
    trainset = data[num_test_items:]
    
    trainset_count = count_items(trainset)   
    for item_list in testset:
        for item in item_list:
            if trainset_count[item] == 0:
                trainset.append(item_list)
                testset.remove(item_list)
                
    return trainset, testset

In [9]:
def loadNetFlix(ratings_data_name, name_data_name):
           netflix=pd.read_csv(df, encoding = 'latin1',header=None)
           #netflix.columns=netflix[['Date','Moviename']]
           return netflix
    

# Please  RUn Movielens or Netlix Cells to get results.You have to chosse only one at a time

 ### Movielens Data Loading

In [14]:
ratings_data_name = 'ml-1m/ratings.dat' # Shows all the items of all the users
name_data_name = 'ml-1m/movies.dat' # Links between item_id and actual name of the item

In [15]:
user_dict, item_names = load_data(ratings_data_name, name_data_name)



# Netflix Data loading

In [10]:
name_data_name='ml-1m/movie_titles.csv'
ratings_data_name = 'ml-1m/combined_data_2.csv' # Shows all the items of all the users



In [11]:
user_dict, item_names = loadNetFlix(ratings_data_name, name_data_name)


NameError: name 'df' is not defined

In [16]:
item_names

{'1': 'Toy Story (1995)',
 '2': 'Jumanji (1995)',
 '3': 'Grumpier Old Men (1995)',
 '4': 'Waiting to Exhale (1995)',
 '5': 'Father of the Bride Part II (1995)',
 '6': 'Heat (1995)',
 '7': 'Sabrina (1995)',
 '8': 'Tom and Huck (1995)',
 '9': 'Sudden Death (1995)',
 '10': 'GoldenEye (1995)',
 '11': 'American President, The (1995)',
 '12': 'Dracula: Dead and Loving It (1995)',
 '13': 'Balto (1995)',
 '14': 'Nixon (1995)',
 '15': 'Cutthroat Island (1995)',
 '16': 'Casino (1995)',
 '17': 'Sense and Sensibility (1995)',
 '18': 'Four Rooms (1995)',
 '19': 'Ace Ventura: When Nature Calls (1995)',
 '20': 'Money Train (1995)',
 '21': 'Get Shorty (1995)',
 '22': 'Copycat (1995)',
 '23': 'Assassins (1995)',
 '24': 'Powder (1995)',
 '25': 'Leaving Las Vegas (1995)',
 '26': 'Othello (1995)',
 '27': 'Now and Then (1995)',
 '28': 'Persuasion (1995)',
 '29': 'City of Lost Children, The (1995)',
 '30': 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 '31': 'Dangerous Minds (1995)',
 '32': 'Twelv

In [17]:
user_dict

{'1': ['1193',
  '661',
  '914',
  '3408',
  '2355',
  '1197',
  '1287',
  '2804',
  '594',
  '919',
  '595',
  '938',
  '2398',
  '2918',
  '1035',
  '2791',
  '2687',
  '2018',
  '3105',
  '2797',
  '2321',
  '720',
  '1270',
  '527',
  '2340',
  '48',
  '1097',
  '1721',
  '1545',
  '745',
  '2294',
  '3186',
  '1566',
  '588',
  '1907',
  '783',
  '1836',
  '1022',
  '2762',
  '150',
  '1',
  '1961',
  '1962',
  '2692',
  '260',
  '1028',
  '1029',
  '1207',
  '2028',
  '531',
  '3114',
  '608',
  '1246'],
 '2': ['1357',
  '3068',
  '1537',
  '647',
  '2194',
  '648',
  '2268',
  '2628',
  '1103',
  '2916',
  '3468',
  '1210',
  '1792',
  '1687',
  '1213',
  '3578',
  '2881',
  '3030',
  '1217',
  '3105',
  '434',
  '2126',
  '3107',
  '3108',
  '3035',
  '1253',
  '1610',
  '292',
  '2236',
  '3071',
  '902',
  '368',
  '1259',
  '3147',
  '1544',
  '1293',
  '1188',
  '3255',
  '3256',
  '3257',
  '110',
  '2278',
  '2490',
  '1834',
  '3471',
  '589',
  '1690',
  '3654',
  '2852

In [28]:
#Convert the data to a list of lists 
all_item_lists = list(user_dict.values())

In [45]:
all_item_lists

[['1193',
  '661',
  '914',
  '3408',
  '2355',
  '1197',
  '1287',
  '2804',
  '594',
  '919',
  '595',
  '938',
  '2398',
  '2918',
  '1035',
  '2791',
  '2687',
  '2018',
  '3105',
  '2797',
  '2321',
  '720',
  '1270',
  '527',
  '2340',
  '48',
  '1097',
  '1721',
  '1545',
  '745',
  '2294',
  '3186',
  '1566',
  '588',
  '1907',
  '783',
  '1836',
  '1022',
  '2762',
  '150',
  '1',
  '1961',
  '1962',
  '2692',
  '260',
  '1028',
  '1029',
  '1207',
  '2028',
  '531',
  '3114',
  '608',
  '1246'],
 ['1357',
  '3068',
  '1537',
  '647',
  '2194',
  '648',
  '2268',
  '2628',
  '1103',
  '2916',
  '3468',
  '1210',
  '1792',
  '1687',
  '1213',
  '3578',
  '2881',
  '3030',
  '1217',
  '3105',
  '434',
  '2126',
  '3107',
  '3108',
  '3035',
  '1253',
  '1610',
  '292',
  '2236',
  '3071',
  '902',
  '368',
  '1259',
  '3147',
  '1544',
  '1293',
  '1188',
  '3255',
  '3256',
  '3257',
  '110',
  '2278',
  '2490',
  '1834',
  '3471',
  '589',
  '1690',
  '3654',
  '2852',
  '1945

In [29]:
def count_items(item_lists):
    #Count each item in a given dataset
    #Dataset is assumed to be a list of lists
    item_count = Counter()
    for item_list in item_lists:
            item_count.update(item_list)
    return item_count

In [30]:
#Get initial counts of the items
item_counts = count_items(all_item_lists)

In [31]:
len(item_counts)

3706

In [32]:
def filter_data(item_lists, item_counts, min_item_freq, max_items, min_items):   
    #filter out items that have frequency less than min_item_freq
    filtered_data = []
    for item_list in item_lists:
        item_list = [item for item in item_list if (item_counts[item] >= min_item_freq)]
        
        #Keep only the lists with more than "min_items" items (necessary for further use of skipgrams for Item2Vec model)
        #Filter out users that watched more than 'max_items' movies (not helpful for the model)
        if len(item_list) >= min_items and len(item_list) <= max_items:
            filtered_data.append(item_list)
            
    return filtered_data

In [33]:
#2.2 Hyperparameters
min_item_frequency = 15 # Keep only the items with more than this frequency in the data
max_items_for_user = 180 # Maximum amount of items to consider a user informative 
min_items_for_user = 2 # Minimum amount of items to consider a user informative

In [34]:
filtered_item_lists = filter_data(all_item_lists, item_counts, 
                                  min_item_freq = min_item_frequency, 
                                  max_items = max_items_for_user, 
                                  min_items = min_items_for_user)

filtered_item_counts = count_items(filtered_item_lists)

In [35]:
len(filtered_item_counts)

3125

In [41]:
filtered_item_counts

Counter({'1193': 880,
         '661': 177,
         '914': 227,
         '3408': 714,
         '2355': 853,
         '1197': 1112,
         '1287': 231,
         '2804': 552,
         '594': 227,
         '919': 718,
         '595': 357,
         '938': 57,
         '2398': 81,
         '2918': 505,
         '1035': 341,
         '2791': 743,
         '2687': 159,
         '2018': 170,
         '3105': 197,
         '2797': 474,
         '2321': 380,
         '720': 160,
         '1270': 1267,
         '527': 1220,
         '2340': 101,
         '48': 107,
         '1097': 1044,
         '1721': 732,
         '1545': 23,
         '745': 305,
         '2294': 185,
         '3186': 179,
         '1566': 152,
         '588': 507,
         '1907': 160,
         '783': 116,
         '1836': 23,
         '1022': 155,
         '2762': 1316,
         '150': 460,
         '1': 950,
         '1961': 556,
         '1962': 161,
         '2692': 479,
         '260': 1618,
         '1028': 341,
    

In [36]:
def items_vs_index(data):
    #Map each item to indexes by popularity (counts)
    new_counts = count_items(data)
    counts_sorted = sorted(new_counts.items(), key = operator.itemgetter(1), reverse = True)
    item_as_idx = {}
    idx_as_item = {}
    for j, count in enumerate(counts_sorted):
        item_as_idx[count[0]] = j
        idx_as_item[j] = count[0]
        
    return item_as_idx, idx_as_item

In [37]:
#Convert each item_id to an index to be able to use with the model
#Also create a reverse link between the IDs and the indexes
items_to_idx, idx_to_items = items_vs_index(filtered_item_lists)
unique_item_size = len(items_to_idx)

#Format and convert the whole data as indexes instead of actual IDs
data_by_idx = [[items_to_idx[item] for item in item_list] for item_list in filtered_item_lists]
counts_by_idx = count_items(data_by_idx)

In [38]:
#Create a new dictionary with the item name according to new indexes
#This will make it easier to do a sanity check for the recommender and get actual names instead of indexes
names_by_idx = {}
for k,v in item_names.items():
    if k in items_to_idx:
        names_by_idx[items_to_idx[k]] = v

In [39]:
def train_test(data, test_size):
    trainset = []
    testset = []
    
    random.shuffle(data)
    num_test_items = int(len(data)*test_size)
    testset = data[:num_test_items]
    trainset = data[num_test_items:]
    
    trainset_count = count_items(trainset)   
    for item_list in testset:
        for item in item_list:
            if trainset_count[item] == 0:
                trainset.append(item_list)
                testset.remove(item_list)
                
    return trainset, testset

In [40]:
#Split the data into train and test sets
trainset, testset = train_test(data_by_idx, test_size = 0.2)
trainset_counts = count_items(trainset)