In [1]:
import numpy as np
import pandas as pd

In [2]:
item_df = pd.concat([pd.read_csv('../ecommerce-dataset/item_properties_part1.csv'), pd.read_csv('../ecommerce-dataset/item_properties_part2.csv')], axis=0)
item_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [3]:
# 只使用 888 這個 property
# filter only 888 properties

item_df = item_df.loc[item_df.property=='888']
item_df.shape

(3000398, 4)

In [4]:
# 移除重複的 (itemid, property) pair
# remove all snapshots except for the last one
item_df.drop_duplicates(subset=['itemid','property'], keep='last', inplace=True)
item_df.shape

(417053, 4)

In [5]:
# 得到每一個物品的 888 的值
# get 888 values
features = item_df.value.values
features

array(['599031', '665993',
       '150169 1219716 136963 442519 243135 656611 n96.000 1318853', ...,
       '249879 724834 1172269', '5135 1233825',
       '888666 n10800.000 746840 1318567'], dtype=object)

In [6]:
# 888 中的移除數值
# remove numerical values in strings
import re

def remove_numbers(text):
    return re.sub(r'n[\d.]+\s*','',text)
    
features = [remove_numbers(f) for f in features]
features[:5]

['599031',
 '665993',
 '150169 1219716 136963 442519 243135 656611 1318853',
 '1169506',
 '911581 794767']

In [7]:
# 將 888按照空格切開來
# split each feature
features = [ f.split(' ') for f in features]
features[:5]

[['599031'],
 ['665993'],
 ['150169', '1219716', '136963', '442519', '243135', '656611', '1318853'],
 ['1169506'],
 ['911581', '794767']]

In [8]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer

max_words = 10000
def get_tfidf_matrix(title_overviews):
    '''
    args:
        title_overviews: list(list(str)), list of list of string representing each document
    return:
        tfidf_matrix: np.array, TF-IDF matrix
    '''
    # get count of the matrix
    counter = Counter(np.hstack(title_overviews))

    # create mapping from word to index, get the most frequent 10000 words    
    word2index =  {unique_word: idx for idx, (unique_word, count) in enumerate(sorted(counter.items(), key=lambda x:-x[1])) if idx < max_words}

    #initialize empty count matrix
    count_matrix = np.zeros([len(title_overviews), max_words], dtype=np.int32)
    
    # start counting terms
    for idx, title_overview in enumerate(title_overviews):
        for word in title_overview:
            if word in word2index:
                count_matrix[idx][word2index[word]] += 1
    # tfidf
    transformer = TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(count_matrix)
    
    # convert to numpy array for easier operation
    return tfidf_matrix.toarray()
tfidf_matrix = get_tfidf_matrix(features)
tfidf_matrix.shape

(417053, 10000)

In [9]:
def get_most_similar_items(tfidf_matrix, idx, top_n):
    '''
    args:
        tfidf_matrix: np.array, TF-IDF matrix
        idx: int, index of the movie
        top_n: int, how many recommendation to be returned
    return:
        most_similar_items: np.array, a list of  item index
        
    '''
    # 1d array (max_words, )
    tfidf_matrix[idx]
    
    # 2d array (num_movies, max_words)
    tfidf_matrix
    
    # compute dot product to obtain similarity scores, output shape: (num_movies, )
    scores = np.matmul(tfidf_matrix, tfidf_matrix[idx].reshape(-1,1)).reshape(-1)
    
    # obtain the highest score items, flip because arg
    most_similar_items = np.flip(np.argsort(scores))
    
    # filter out the item itself
    most_similar_items = most_similar_items[most_similar_items != idx][:top_n]
    
    return most_similar_items

In [10]:
# 我們隨便找一個物品來看他的888的值，比如說物品2
# randomly pick an item, let's say item 2. Let's check its '888' value
item_df.iloc[2].value

'150169 1219716 136963 442519 243135 656611 n96.000 1318853'

In [11]:
# 來看看和物品2最相關的物品的888的值
# see most similar items
item_df.iloc[get_most_similar_items(tfidf_matrix, 2, 10)].value.tolist()

['150169 1219716 136963 594002 243135 656611 n120.000 1059040',
 '136963',
 '136963 1330788',
 '136963 874647',
 'n31200.000 136963',
 '518169 n360.000 136963',
 '136963 532712',
 '136963 n3240.000',
 '136963 n38280.000',
 '635911 n747852.000 136963 1154859']

In [12]:
# 隨便再找一個物品，比如說物品123
# pick another item 123
item_df.iloc[123].value

'746623 621959 300207 n24.000 1072114 n64800.000 21263'

In [13]:
# 來看看和物品123最相關的物品的888的值
# see most similar items
item_df.iloc[get_most_similar_items(tfidf_matrix, 123, 10)].value.tolist()

['746623 621959 51776 n12.000 1072114 n64800.000 21263',
 '746623 621959 384019 n24.000 1072114 n64800.000 21263',
 '746623 621959 703975 n12.000 1072114 n64800.000 21263',
 '746623 621959 76474 n6000.000 1187104 n64800.000 21263',
 '746623 621959 547770 n6000.000 1187104 n64800.000 21263',
 '746623',
 '746623',
 '746623 1140809',
 'n108004248.000 n108004248.000 n72.000 1072114 n86400.000 21263',
 '1274453 726994 726994 n12.000 1072114 n70800.000 21263']