# Sample Code

## 基礎建設

In [2]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-21 07:26:20--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-21 07:26:21 (20.0 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-21 07:26:21--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-21 07:26:22 (14.0 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [4]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [5]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [6]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

## 產生推薦

In [7]:
def recommender(training_data, users=[], k=10):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    random-based
    '''
    ratings_trainings = training_data
    recommendations = {user: ratings_trainings['asin'].sample(n=k).tolist() for user in users}
    return recommendations


ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': ['B0009RF9DW',
  'B005ZX8ZWM',
  'B001AJ6YS2',
  'B00OIQ4OFO',
  'B00BMVV3MK',
  'B012GVBHZC',
  'B00021DJ32',
  'B00YQBYO1K',
  'B009AED4EG',
  'B00DT4757A'],
 'A103T1QOGFCSEH': ['B01DP8QD22',
  'B00F550JWE',
  'B005QDYH8M',
  'B0067F28ZW',
  'B01AKP8BUK',
  'B00005JS5C',
  'B000FOI48G',
  'B0117Z2STC',
  'B00JVU3K9I',
  'B013JG7YP0'],
 'A106UKKSJ2KXPF': ['B01B54XW54',
  'B01BQR7TK4',
  'B00005JS5C',
  'B000GLRREU',
  'B017KQGXS4',
  'B00W259T7G',
  'B0006NXAHQ',
  'B00UICO9FC',
  'B00CK4JDRK',
  'B0159ZECGI'],
 'A10A7GV4D5A11V': ['B01CJNZKZK',
  'B00006L9LC',
  'B00FW6JXNM',
  'B01FX13TYO',
  'B01DU3BQ2E',
  'B00021C22G',
  'B000FOI48G',
  'B000FOI48G',
  'B001C9VNIQ',
  'B000WR2HB6'],
 'A1119JJ37ZLB8R': ['B01GU7XYPY',
  'B0195R1FT8',
  'B01BFJ3TFC',
  'B00QU76L4Q',
  'B01BEX4GU6',
  'B000GLRREU',
  'B000AADG60',
  'B00KHYHD36',
  'B0017TZD7S',
  'B000EA8VSE'],
 'A113UOOLBSZN52': ['B001QY8QXM',
  'B0016BFK2U',
  'B00692OX2O',
  'B00XC8OIDC',
  'B001WAKUWK',
  'B001

## 結果評估

In [8]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.015254237288135594