# Sample Code

## 基礎建設

In [2]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file whose lines are each a JSON document.

    Yields:
        The Python object produced by ``json.loads`` for each line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # The context manager closes the handle when the generator is exhausted,
    # closed early, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            yield json.loads(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their position in the file (0, 1, 2, ...), and that
    position becomes the row index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

## 載入資料

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2021-12-26 09:00:12--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2021-12-26 09:00:13 (18.4 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2021-12-26 09:00:13--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2021-12-26 09:00:14 (14.2 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [4]:
# Product information goes into `metadata`; purchase (rating) records go into `ratings`.
# Paths are relative to the working directory, which is where wget saves the downloads,
# so the cell no longer depends on the Colab-specific /content directory.
metadata = getDF('meta_All_Beauty.json.gz')
ratings = pd.read_csv('All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

## 資料整理

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Interpret unixReviewTime as seconds since the Unix epoch and store the result as a datetime column.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

In [7]:
# Number of rated products per reviewer, most active reviewers first.
no_ratings = (
    ratings[['reviewerID', 'asin']]
    .groupby("reviewerID")
    .count()
    .sort_values(by=['asin'], ascending=False)
    .reset_index()
)
no_ratings

Unnamed: 0,reviewerID,asin
0,A2GJX2KCUSR0EI,27
1,A1KSC91G9AIY2Z,23
2,A22CW0ZHY3NJH8,21
3,A1EGCED01USBA9,21
4,AENH50GW3OKDA,20
...,...,...
324033,A2BZ7MYTSNYMEW,1
324034,A2BZ6YMTPARCXV,1
324035,A2BZ6FH245B7AV,1
324036,A2BZ5TE4U5DI01,1


In [8]:
# Product info: keep only the required columns.
# .copy() makes purchase_df an independent frame, so the column assignments in the
# following cells modify it directly and no longer trigger SettingWithCopyWarning.
purchase_df = metadata[['asin', 'brand', 'title', 'price', 'rank']].copy()

In [9]:
# Product info: derive the product category from the 'rank' text.
# The rank string is split on whitespace, tokens 2 through 5 are kept (.loc label slices
# include both ends) and concatenated without a separator; astype(str) stringifies any
# missing token before joining.
# NOTE(review): presumably 'rank' looks like "<number> in <category words> (" — confirm.
purchase_df['categories'] = purchase_df['rank'].str.split(expand = True).loc[:, 2:5].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
# Keep the leading number of the rank string, strip its thousands separators,
# then convert the column to a numeric dtype.
purchase_df['rank'] = pd.to_numeric(
    purchase_df['rank']
    .str.split(expand = True)[0]
    .str.replace(',', '')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [48]:
#purchase_df[purchase_df['price'] == '']

In [49]:
#ratings[ratings['asin'] == '6546546450']

In [36]:
# 調整 DataFrame index 讓兩邊可以用 asin 合併資料
# purchase_df = purchase_df.set_index('asin')

In [11]:
# Preview the first five rows of the product table.
purchase_df.head(5)

Unnamed: 0,asin,brand,title,price,rank,categories
0,6546546450,idea village,Loud 'N Clear&trade; Personal Sound Amplifier,,2938573.0,Beauty&PersonalCare
1,7178680776,,No7 Lift &amp; Luminate Triple Action Serum 50...,$44.99,872854.0,Beauty&PersonalCare
2,7250468162,No7,No7 Stay Perfect Foundation Cool Vanilla by No7,$28.76,956696.0,Beauty&PersonalCare
3,7367905066,,Wella Koleston Perfect Hair Colour 44/44 Mediu...,,1870258.0,Beauty&PersonalCare
4,7414204790,Pirmal Healthcare,Lacto Calamine Skin Balance Oil control 120 ml...,$12.15,67701.0,Beauty&PersonalCare


In [21]:
# (rows, columns) of the ratings table.
ratings.shape

(371345, 5)

In [22]:
# One row per distinct (asin, categories) pair; both selected columns take part in the de-duplication.
purchase_category_df = purchase_df[['asin', 'categories']].drop_duplicates()

In [23]:
# Attach each product's category to its ratings; ratings whose asin has no
# category row are labelled "unknown".
# Only 'categories' is filled, so a missing value in any other column is not
# silently replaced by a string.
new_rating = pd.merge(ratings, purchase_category_df, on="asin", how="left").fillna({"categories": "unknown"})

In [24]:
# Display the merged ratings frame (the commented-out line is the shorter head(5) preview).
#new_rating.head(5)
new_rating

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE,categories
0,0143026860,A1V6B6TNIC10QE,1.0,1424304000,2015-02-19,unknown
1,0143026860,A2F5GHSXFQ0W6J,4.0,1418860800,2014-12-18,unknown
2,0143026860,A1572GUYS7DGSR,4.0,1407628800,2014-08-10,unknown
3,0143026860,A1PSGLFK1NSVO,5.0,1362960000,2013-03-11,unknown
4,0143026860,A6IKXKZMTKGSC,5.0,1324771200,2011-12-25,unknown
...,...,...,...,...,...,...
371340,B01HJEGTYK,A202DCI7TV1022,1.0,1500508800,2017-07-20,Beauty&PersonalCare
371341,B01HJEGTYK,A3FSOR5IJOFIBE,5.0,1489622400,2017-03-16,Beauty&PersonalCare
371342,B01HJEGTYK,A1B5DK6CTP2P24,5.0,1488326400,2017-03-01,Beauty&PersonalCare
371343,B01HJEGTYK,A23OUYS5IRMJS9,2.0,1487635200,2017-02-21,Beauty&PersonalCare


## 資料切分

In [26]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader,SVD,Dataset
from surprise.model_selection import train_test_split

ModuleNotFoundError: ignored

In [27]:
pip install recmetrics

Collecting recmetrics
  Downloading recmetrics-0.1.0-py3-none-any.whl (9.3 kB)
Collecting scikit-learn<0.24.0,>=0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 19.6 MB/s 
[?25hCollecting pytest-cov<3.0.0,>=2.10.1
  Downloading pytest_cov-2.12.1-py2.py3-none-any.whl (20 kB)
Collecting matplotlib<4.0.0,>=3.3.2
  Downloading matplotlib-3.5.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 63.6 MB/s 
[?25hCollecting plotly<5.0.0,>=4.11.0
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
[K     |████████████████████████████████| 13.2 MB 52.0 MB/s 
[?25hCollecting funcsigs<2.0.0,>=1.0.2
  Downloading funcsigs-1.0.2-py2.py3-none-any.whl (17 kB)
Collecting scikit-surprise<2.0.0,>=1.1.1
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 55.8 MB/s 
[?25hCollecting ipython<8.0.0,>=

In [25]:
# Time-based split: everything before 2018-09-01 is training data,
# and all of September 2018 is test data.
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
# The upper bound is exclusive on 2018-10-01 rather than inclusive on 2018-09-30:
# '2018-09-30' compares as midnight, which would drop any review stamped later that day.
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') &
    (ratings['DATE'] < '2018-10-01')
]

In [None]:
import matplotlib.pyplot as plt
# Long-tail plot of how ratings are distributed over products.
# The item column of `ratings` is 'asin'; the frame has no 'movieId' column.
fig=plt.figure(figsize=(15,7))
recmetrics.long_tail_plot(df= ratings,
                         item_id_column='asin',
                         interaction_type='product ratings',
                         percentage=0.5,
                         x_labels=False)

In [None]:
# Map every reviewer in the test period to the list of products they rated,
# e.g. {'A12M4U7WK4ALCR': ['B01DKQAXC0']}.
ratings_testings_by_user = (
    ratings_testings
    .groupby('reviewerID')['asin']
    .agg(list)
    .to_dict()
)
# All reviewers that need recommendations, e.g. 'ATR9J7TYR1KVL'.
users = list(ratings_testings_by_user.keys())

In [92]:
# Inspect the reviewer -> rated-products mapping for the test period.
ratings_testings_by_user

{'A100XQFWKQ30O2': ['B01DKQAXC0'],
 'A103T1QOGFCSEH': ['B01DKQAXC0'],
 'A106UKKSJ2KXPF': ['B01ENS6XDC'],
 'A10A7GV4D5A11V': ['B01BFSNF66'],
 'A1119JJ37ZLB8R': ['B01CJNZKZK'],
 'A113UOOLBSZN52': ['B01EKY0M22'],
 'A12M4U7WK4ALCR': ['B01DKQAXC0'],
 'A12T8YTW6VWT7S': ['B01G53GFKO'],
 'A1364JXGKB46MM': ['B01GLA54SA'],
 'A137DALOQFKBTI': ['B01AWXGD3M'],
 'A13FEZ3WV7S2EY': ['B01BHN3EHE'],
 'A13IV4I1B0RXMG': ['B01DKQAXC0'],
 'A13JU88JAHN72I': ['B017I6B6GK'],
 'A13K55R6VH1OOD': ['B01FNJ9MOW'],
 'A13P7VFU075A': ['B01H3ZQ2NI'],
 'A13SWYE4QLB6NG': ['B01DVLHVPQ'],
 'A13ZTQ0Q4ATA41': ['B01AE1TJV0'],
 'A142EDN04OD62U': ['B01CW24JXC'],
 'A142I22FIC8MZK': ['B01CD7JK9E'],
 'A14834QTII5TLT': ['B01GK8P0CS'],
 'A14A447VPACTBC': ['B01B3R5EFO'],
 'A14AP6MN5XO6LB': ['B01E7UKR38'],
 'A14CLF25IX25US': ['B01DLR9IDI'],
 'A14LYXC3HTBAHI': ['B01FT6TMQM'],
 'A14VUW4KZ34EOE': ['B01DKQAXC0'],
 'A14Y32P26G9YL': ['B018WCT01C'],
 'A157T25PBS7MX4': ['B01CX5KCIE'],
 'A15HZDSERD85C8': ['B01DDWA5II'],
 'A15JJ8J1FGADIX': ['B0

## 產生推薦

In [7]:
def recommender(training_data, users=None, k=10, random_state=None):
    """Random baseline: recommend up to ``k`` randomly drawn products per user.

    Products are drawn (without replacement) from the rows of
    ``training_data['asin']``, so a product with more training rows is more
    likely to be picked and can appear more than once in a user's list.

    Args:
        training_data: DataFrame with an ``asin`` column (the training split,
            i.e. data before 2018-09-01).
        users: Iterable of user ids that need recommendations. ``None`` (the
            default) is treated as no users.
        k: Number of products to recommend per user. When the training data
            has fewer than ``k`` rows, every row is returned instead of raising.
        random_state: Optional integer seed. When given, repeated calls with
            the same arguments return the same recommendations.

    Returns:
        dict mapping each user id to its list of recommended product ids::

            {user_1: [item_1, item_2, ...], user_2: [...], ...}
    """
    if users is None:
        users = []

    candidate_items = training_data['asin']
    # Series.sample(n=...) without replacement raises ValueError when n exceeds
    # the number of rows, so never ask for more than is available.
    n_items = min(k, len(candidate_items))

    # One shared generator keeps the whole run reproducible while still giving
    # each user a different draw; None leaves pandas on its default randomness.
    rng = None if random_state is None else np.random.RandomState(random_state)

    return {
        user: candidate_items.sample(n=n_items, random_state=rng).tolist()
        for user in users
    }

In [8]:
# Produce recommendations (default k) for every reviewer in the test period, then display them.
ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'A100XQFWKQ30O2': ['B00CR6X0IY',
  'B016YF1U3A',
  'B001MW1MK2',
  'B00JB0V0FO',
  'B00OINB5CC',
  'B001EXOO72',
  'B00L0JYAZG',
  'B00H78NZYQ',
  'B000VUXCGI',
  'B00MGL9EW6'],
 'A103T1QOGFCSEH': ['B00C6NZH3E',
  'B01CHS3CHA',
  'B01DYI8GH8',
  'B0000530HU',
  'B00NT0AR7E',
  'B000EE9XYG',
  'B004Z9O8TG',
  'B015S9XXFG',
  'B00GW58YC8',
  'B00D8ARRIC'],
 'A106UKKSJ2KXPF': ['B009PIJV1W',
  'B019Z9L6YU',
  'B000IQVVGI',
  'B001OHV1H4',
  '1620213982',
  'B000VV1YOY',
  'B00HA8JNSU',
  'B00S8JQKEA',
  'B000FOI48G',
  'B01EWW45QG'],
 'A10A7GV4D5A11V': ['B00ODZCSMQ',
  'B01CN6P7T2',
  'B01GQ12URM',
  'B0014BB6WA',
  'B001ARV2MC',
  'B01DKQAXC0',
  'B00CQ1C6RG',
  'B00005JS5C',
  'B005V3AV5U',
  'B00DIP4G4A'],
 'A1119JJ37ZLB8R': ['B00PGLZH9W',
  'B00JRPIF7Y',
  'B00Q5HEZ76',
  'B00VMT6ZHA',
  'B01B18T01Y',
  'B000NOT9GO',
  'B01EGH00J8',
  'B01G7TWQH2',
  'B001QY8QXM',
  'B0007MXYKY'],
 'A113UOOLBSZN52': ['B0017DQ0U8',
  'B00JG8DY0A',
  'B014J0LJ38',
  'B004P5R57M',
  'B011773RYA',
  'B000

In [35]:
# Spot check: show the training-split rows that belong to one specific reviewer.
ratings_trainings[ratings_trainings['reviewerID']=='AEKJTREHL41OE']

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE


## 結果評估

In [34]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    """Score recommendations as the share of test purchases that were recommended.

    Args:
        ratings_testings_by_user: dict mapping user id to the list of products
            actually purchased in the test period (data from 2018-09-01 on).
        ratings_by_user: dict mapping user id to the list of recommended
            products learned from the training data.
        method: str, currently unused.

    Returns:
        float: number of distinct recommended products per user that were also
        purchased, summed over users, divided by the total number of test
        purchases. Returns 0.0 when there are no test purchases.
    """
    if ratings_testings_by_user is None:
        ratings_testings_by_user = {}
    if ratings_by_user is None:
        ratings_by_user = {}

    hits = 0
    purchases = 0
    for user, bought in ratings_testings_by_user.items():
        # The denominator is derived from the argument itself rather than from a
        # notebook-level DataFrame, so the function works on any input passed in.
        purchases += len(bought)
        if user in ratings_by_user:
            hits += len(set(ratings_by_user[user]) & set(bought))

    if purchases == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0
    return hits / purchases

# Score the generated recommendations against the test-period purchases.
evaluate(ratings_testings_by_user, ratings_by_user)

0.011864406779661017