# Install packages

In [1]:
# Install your required packages here
!pip install pandas numpy matplotlib sklearn

Collecting pandas
  Downloading pandas-1.3.3-cp37-cp37m-win_amd64.whl (10.0 MB)
Collecting matplotlib
  Using cached matplotlib-3.4.3-cp37-cp37m-win_amd64.whl (7.2 MB)
Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting pytz>=2017.3
  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
Collecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-8.4.0-cp37-cp37m-win_amd64.whl (3.2 MB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.2-cp37-cp37m-win_amd64.whl (51 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.0-cp37-cp37m-win_amd64.whl (7.1 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting scipy>=1.1.0
  Using cached scipy-1.7.1-cp37-cp37m-win_amd64.whl (33.6 MB)
Using legacy setup.py install for sklearn, since package 'wheel' is not installed

You should consider upgrading via the 'c:\users\quentindh\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [2]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
import numpy as np
import pandas as pd
import sklearn
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter

In [None]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
  #read gzipped content
  f=gzip.open(filename_gzipped_python_json,'r')
  
  #parse json
  parse_data = []
  for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
    line = line.decode('utf-8')
    line = line.replace('true','True') #difference json/python
    line = line.replace('false','False')
    parsed_result = eval(line) #load python nested datastructure
    parse_data.append(parsed_result)
    if read_max !=-1 and len(parse_data) > read_max:
      print(f'Break reading after {read_max} records')
      break
  print(f"Reading {len(parse_data)} rows.")

  #create dataframe
  df= pd.DataFrame.from_dict(parse_data)
  return df


# Load steam data
Note: For steam dataset, json data, is actually coded with Python (so not actually json), such as using values False instead of false and single quotes   instead of double literals

In [None]:
steam_path = '/content/drive/MyDrive/AI Project/datasets/Steam/'
metadata_games = 'steam_games.json.gz' 
user_items = 'australian_users_items.json.gz'
user_reviews = 'australian_user_reviews.json.gz'
game_bundles = 'bundle_data.json.gz'
steam_reviews= 'steam_reviews.json.gz'

In [None]:
for dataset in [metadata_games, user_items, user_reviews, game_bundles, steam_reviews]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(steam_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  df_metadata = parse_json(steam_path + dataset, read_max=1000000)
  pd.set_option('display.max_colwidth', None)
  display(df_metadata.head(5))
  display(df_metadata.describe(include='all'))

## Example pre-processing / data cleaning

In [None]:
#cleaner games
games = parse_json(steam_path + metadata_games)
games = games[['publisher','app_name', 'genres', 'release_date', 'price']]
games = games.sort_values(by='release_date', ascending=False)
display(games.head(50))

In [None]:
#cleaner interactions
user_items_df = parse_json(steam_path + user_items, read_max=100000)
user_items_df = user_items_df[['user_id', 'items']]
#flatten interactions
user_items_all_dct = {'user_id': [], 'item_id': [], 'title': [], 'playtime_forever': []}
for idx, row in tqdm(user_items_df.iterrows()):
    user_id = row['user_id']
    items = row['items']
    #rule: if never played, do not add to history
    items = [item_dct for item_dct in items if item_dct['playtime_forever'] > 0]  
    user_items_all_dct['user_id'].extend([user_id] * len(items))
    user_items_all_dct['item_id'].extend([item_dct['item_id'] for item_dct in items])
    user_items_all_dct['title'].extend([item_dct['item_name'] for item_dct in items])
    user_items_all_dct['playtime_forever'].extend([item_dct['playtime_forever'] for item_dct in items])
user_items_df = pd.DataFrame.from_dict(user_items_all_dct)
display(user_items_df.head(50))

## Example: Compute popularity and popularity per genre

In [None]:
#Most popular items:
popular_items = user_items_df.groupby(by="title")['user_id'].count().reset_index()
popular_items = popular_items.rename(columns={"user_id": "user_count","title": "app_name"})
popular_items['user_pct'] = popular_items['user_count'] / user_items_df['user_id'].nunique()
popular_items = pd.merge(popular_items,games,how='left',on='app_name')
popular_items = popular_items.sort_values(by='user_count',ascending=False)
display(popular_items.head(10))

In [None]:
#Most popular items per genre
popular_per_genre_dct = {'genre': [], 'app_name': [], 'user_count':[]}
#flatten frame on genre
for idx, row in popular_items.iterrows():
  genres = row['genres']
  app_name = row['app_name']
  user_count = row['user_count']
  if isinstance(genres,list):
    popular_per_genre_dct['genre'].extend(genres)
    popular_per_genre_dct['app_name'].extend([app_name] * len(genres))
    popular_per_genre_dct['user_count'].extend([user_count] * len(genres))  
popular_items_genre = pd.DataFrame.from_dict(popular_per_genre_dct)
#compute rank on user_count per genre
popular_items_genre = popular_items_genre.sort_values(by=['genre', 'user_count'], ascending=['True', 'False'])
popular_items_genre['genre_rank'] = popular_items_genre.groupby(by='genre')['user_count'].rank(ascending=False)
#show top-10 popular games per genre
popular_items_genre = popular_items_genre[popular_items_genre['genre_rank'] <= 10]
display(popular_items_genre.head(50))

## Example: Make recommendations content-based

### Learn user profile consisting of top-3 most liked genres

In [None]:
#Example of content-based recommender
#1)For each user compute top-3 most liked genres
user_profiles_df = pd.merge(user_items_df, games, how='left', left_on='title', right_on='app_name')#merge with games to get genre of items
display(user_profiles_df.head(3))
user_profiles_df2 = user_profiles_df.groupby(by='user_id')['genres'].apply(list).reset_index() #collect list of genres of items in history
user_profiles_df3 = user_profiles_df.groupby(by='user_id')['app_name'].apply(list).reset_index() #collect list of items in history
user_profiles_df = pd.merge(user_profiles_df2,user_profiles_df3,on="user_id") #merge both lists
display(user_profiles_df.head(3))

#i.e. [[Indie, Simulation], [Action], [Action], [Action, Adventure, Indie, RPG],...]
def create_profile(genres_list):
  cnt = Counter()
  for genres in genres_list:
    if isinstance(genres, list):
      for genre in genres:
       cnt[genre]+=1
  return cnt.most_common(3)
  
user_profiles_df['top-3-genres'] = user_profiles_df['genres'].apply(create_profile)
user_profiles_df = user_profiles_df[['user_id','app_name','top-3-genres']]
display(user_profiles_df.head(3))

### Make recommendations based on most popular games matching top-3 most liked genres in history

In [None]:
#2)For prediction suggest top-10 most popular games matching genre profile 
def recommend_popular_in_genre(profile):
  genres = profile['top-3-genres']
  genres = set([genre for genre, count in genres])
  history = set(profile['app_name'])
  selection = popular_items_genre[popular_items_genre['genre'].isin(genres)]
  selection = popular_items_genre[~popular_items_genre['app_name'].isin(history)]
  selection = selection.drop_duplicates(subset='app_name')
  selection = selection.sort_values(by='user_count', ascending=False)
  return selection.values[0:10]

tqdm.pandas() #adds progress_apply to pandas, i.e. apply with progress bar
user_profiles_df['recommendation'] = user_profiles_df.progress_apply(recommend_popular_in_genre,axis=1)
display(user_profiles_df)


# Load food data

In [None]:
food_path = '/content/drive/MyDrive/AI Project/datasets/Food/'
recipes = 'RAW_recipes.csv.zip' 
interactions = 'RAW_interactions.csv.zip'
pp_recipes = 'PP_recipes.csv.zip'
pp_users = 'PP_users.csv.zip'

In [None]:
for dataset in [recipes, interactions, pp_recipes, pp_users]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(food_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  df = pd.read_csv(food_path + dataset)
  pd.set_option('display.max_colwidth', None)
  display(df.head(5))
  display(df.describe(include='all'))

In [None]:
#clean data

# Load Goodreads data

In [None]:
goodreads_path = '/content/drive/MyDrive/AI Project/datasets/Goodreads/'
books = 'goodreads_books_comics_graphic.json.gz'
interactions = 'goodreads_interactions_comics_graphic.json.gz'
reviews = 'goodreads_reviews_comics_graphic.json.gz'

In [None]:
for dataset in [books, interactions, reviews]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(goodreads_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  #df = pd.read_json(goodreads_path + dataset, lines=True, nrows=1000)
  df = parse_json(goodreads_path + dataset, read_max=100000)
  pd.set_option('display.max_colwidth', None)
  display(df.head(5))