In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import os
import tarfile
from six.moves import urllib
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures: save_fig() writes into
# <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>. CHAPTER_ID is empty here, so the
# figures land directly in ./images/.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = ""
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; the figure is written to
        ``IMAGES_PATH/<fig_id>.<fig_extension>``.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Resolution in dots per inch, passed to ``savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    # savefig does not create missing directories, so make sure the target
    # folder exists (os.makedirs(..., exist_ok=True) is avoided because this
    # notebook still supports Python 2).
    target_dir = os.path.dirname(path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [57]:
# Directory holding the input files: the .npy train/test files read by
# load_files() and the .csv matrices read by get_distance_matrix().

path='./birch6_validate/'

def load_files(method):
    """Load the train and test data stored as ``.npy`` files under ``path``.

    Parameters
    ----------
    method : str
        Prefix of the training file; ``<path><method>_train_data.npy`` is read.

    Returns
    -------
    tuple
        ``(train_data, test_data)``: the Python objects unwrapped from the two
        files with ``.item()``.
    """
    # The files are expected to hold a pickled Python object wrapped in a 0-d
    # array (hence .item()). NumPy >= 1.16.3 refuses to unpickle unless
    # allow_pickle=True is passed explicitly.
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    train_data = np.load(path + method + '_train_data.npy', allow_pickle=True).item()
    # NOTE(review): the test file is always the 'user' one, whatever `method`
    # is. Confirm this is intended.
    test_data = np.load(path + 'user_test_data.npy', allow_pickle=True).item()
    return train_data, test_data

In [48]:
def get_distance_matrix(name):
    """Read the precomputed matrix stored in ``<path><name>.csv``.

    The column that pandas labels 'Unnamed: 0' is used as the row index of
    the returned DataFrame.
    """
    csv_file = path + name + '.csv'
    return pd.read_csv(csv_file).set_index('Unnamed: 0')

### Recommender

In [49]:
class UserCF:
    """User-based collaborative filtering on a precomputed user-user matrix.

    Despite its name, ``distance_matrix`` is treated as a similarity matrix:
    larger values mean closer users. Columns are looked up with
    ``str(user_id)`` and rows with ``user_id`` itself.
    """

    def __init__(self, metric='jaccard'):
        # Metric name; only used to pick the matrix file in fit().
        self.metric = metric

    def fit(self, cluster, train_data):
        """Store the training data and load the matrix for ``cluster``.

        Reads ``user_<metric>_matrix_<cluster>`` through
        ``get_distance_matrix`` and returns the loaded DataFrame.
        """
        self.train_data = train_data
        self.distance_matrix = get_distance_matrix('user_' + self.metric + '_matrix_' + str(cluster))
        return self.distance_matrix

    def predict(self, user_id, k=10):
        """Recommend up to ``k`` items for ``user_id``.

        The ``k`` most similar users are weighted by their normalised
        similarity; every item they have that ``user_id`` has not receives the
        weighted sum of the neighbours' values.

        Returns
        -------
        dict
            ``{item: score}`` for the ``k`` best-scoring items, or ``{}`` when
            ``user_id`` has no row in the matrix.
        """
        # Use the matrix loaded by fit() on this instance. Relying on a
        # module-level `distance_matrix` would silently use whichever matrix
        # happened to be loaded last in the notebook.
        if user_id not in self.distance_matrix.index:
            return {}

        neighbours = (self.distance_matrix[str(user_id)]
                      .drop(user_id)                  # a user is not their own neighbour
                      .sort_values(ascending=False)   # most similar first, NaN last
                      .iloc[:k])

        # Missing similarities get the smallest observed one. Series.min()
        # skips NaN and copes with an empty selection, unlike builtin min().
        fill_value = neighbours.min()
        if pd.isnull(fill_value):
            fill_value = 0
        neighbours = neighbours.fillna(fill_value)

        # Normalise the similarities into user weights.
        total_dist = neighbours.sum()
        if total_dist == 0:
            total_dist = 1
        weights = neighbours / total_dist

        seen = self.train_data.get(user_id, {})
        predictions = {}
        for neighbour, weight in weights.items():
            for item, value in self.train_data.get(neighbour, {}).items():
                if item not in seen:  # only items the user has not interacted with
                    predictions[item] = predictions.get(item, 0) + weight * value
        return dict(sorted(predictions.items(), key=lambda e: e[1], reverse=True)[:k])

class ItemCF:
    """Item-based collaborative filtering on a precomputed item-item matrix.

    Despite its name, ``distance_matrix`` is treated as a similarity matrix:
    larger values mean closer items. Columns are looked up with ``str(item)``
    and rows with ``item`` itself.
    """

    def __init__(self, metric='jaccard'):
        # Metric name; only used to pick the matrix file in fit().
        self.metric = metric

    def fit(self, cluster, train_data):
        """Store the training data and load the matrix for ``cluster``.

        Reads ``item_<metric>_matrix_<cluster>`` through
        ``get_distance_matrix`` and returns the loaded DataFrame.
        """
        self.train_data = train_data
        self.distance_matrix = get_distance_matrix('item_' + self.metric + '_matrix_' + str(cluster))
        return self.distance_matrix

    def predict(self, user_id, k=10):
        """Recommend up to ``k`` items for ``user_id``.

        For every item the user already has, its ``k`` most similar items are
        looked up; each of those the user does not have yet receives
        ``similarity * value`` added to its score.

        Returns
        -------
        dict
            ``{item: score}`` for the ``k`` best-scoring candidate items, or
            ``{}`` when ``user_id`` is not in the training data.
        """
        if user_id not in self.train_data.keys():
            return {}
        seen = self.train_data[user_id]

        predictions = {}
        for item, value in seen.items():
            column = str(item)
            # Items without an entry in the matrix have no known neighbours.
            if column not in self.distance_matrix.columns or item not in self.distance_matrix.index:
                continue
            neighbours = (self.distance_matrix[column]
                          .drop(item)                     # an item is not its own neighbour
                          .sort_values(ascending=False)   # most similar first, NaN last
                          .iloc[:k])

            # Missing similarities get the smallest observed one. Series.min()
            # skips NaN and copes with an empty selection, unlike builtin min().
            fill_value = neighbours.min()
            if pd.isnull(fill_value):
                fill_value = 0
            neighbours = neighbours.fillna(fill_value)

            for neighbour, similarity in neighbours.items():
                if neighbour not in seen:
                    # Score the candidate (neighbour), not the item the user
                    # already has; otherwise only known items get recommended.
                    predictions[neighbour] = predictions.get(neighbour, 0) + similarity * value
        return dict(sorted(predictions.items(), key=lambda e: e[1], reverse=True)[:k])

### Evaluation: Precision, Recall, F1-score

In [50]:
def evaluation(test_data, predictions):
    """Compute precision, recall and F1 pooled over all test users.

    Parameters
    ----------
    test_data : dict
        ``{user_id: {item: value}}``; the keys of each inner mapping are the
        relevant items for that user.
    predictions : dict
        ``{user_id: {item: score}}``; the keys of each inner mapping are the
        recommended items. Users absent from it count as having received no
        recommendations.

    Returns
    -------
    tuple
        ``(precision, recall, f1score)``. Any ratio whose denominator is zero
        is reported as 0 instead of raising ``ZeroDivisionError``.
    """
    hits = 0.0
    n_recommended = 0.0
    n_relevant = 0.0
    for user_id, items in test_data.items():
        recommended = set(predictions.get(user_id, {}))
        relevant = set(items)
        hits += len(relevant & recommended)
        n_recommended += len(recommended)
        n_relevant += len(relevant)

    precision = hits / n_recommended if n_recommended else 0.0
    recall = hits / n_relevant if n_relevant else 0.0
    # With zero hits both ratios are 0 and the harmonic mean is undefined;
    # report 0 in that case.
    denominator = precision + recall
    f1score = 2 * precision * recall / denominator if denominator else 0.0
    return precision, recall, f1score

## Collaborative filtering

In [36]:
# Load the train/test data used by the cells below (both are indexed by
# cluster number there).
train_data, test_data = load_files('user')

In [37]:
# Try the user-based recommender on a single cluster (number i) first.
# fit() loads the precomputed Jaccard matrix for that cluster and returns it;
# it is kept in a variable so that the next cell can display it.
i=0
recommender = UserCF(metric='jaccard')
distance_matrix = recommender.fit(i, train_data[i])

In [38]:
distance_matrix

Unnamed: 0_level_0,688130,647171,319492,548869,548871,548872,600748,385034,111959,581027,...,441000,548851,535632,2555893,630774,571650,607572,1990650,602111,584362
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
688130,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
647171,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000
319492,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
548869,0.000000,0.000000,0.000000,1.000000,0.000000,0.127660,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
548871,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.379310,0.000000,0.000000,...,0.305556,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
548872,0.000000,0.000000,0.000000,0.127660,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
600748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.475000,0.000000,0.141304,0.000000,0.000000,0.351351,0.000000
385034,0.000000,0.000000,0.000000,0.000000,0.379310,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.464286,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
111959,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.279720,0.000000,0.297872,0.000000,0.000000,0.000000
581027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.375000,0.000000,0.000000


In [25]:
# Recommend 10 items to every test user of cluster i, then score the
# recommendations against that cluster's test data.
predictions = {
    user_id: recommender.predict(user_id, k=10)
    for user_id in test_data[i].keys()
}
precision, recall, f1score = evaluation(test_data[i], predictions)
print('precision:', precision)
print('recall:', recall)
print('f1score', f1score)

precision: 0.0002545027407987471
recall: 0.0002143127977711469
f1score 0.00023268509651956792


In [53]:
def recommend_evaluation(recommender, topN_range, n_clusters=6):
    """Evaluate ``recommender`` on every cluster for every top-N size.

    Reads the notebook-level ``train_data`` and ``test_data`` (indexed by
    cluster number) and scores the predictions with ``evaluation``.

    Parameters
    ----------
    recommender : object
        Must provide ``fit(cluster, train_data)`` and ``predict(user_id, k)``.
    topN_range : iterable of int
        Values of ``k`` to evaluate for each cluster.
    n_clusters : int
        Number of clusters; clusters ``0 .. n_clusters - 1`` are evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, topN) pair with the columns ``cluster``,
        ``topN``, ``precision``, ``recall`` and ``f1score``.
    """
    columns = ['cluster', 'topN', 'precision', 'recall', 'f1score']
    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row is quadratic anyway.
    records = []
    for cluster in range(n_clusters):
        recommender.fit(cluster, train_data[cluster])
        cluster_test = test_data[cluster]
        for k in topN_range:
            predictions = {
                user_id: recommender.predict(user_id, k)
                for user_id in cluster_test.keys()
            }
            precision, recall, f1score = evaluation(cluster_test, predictions)
            records.append({
                'cluster': cluster,
                'topN': k,
                'precision': precision,
                'recall': recall,
                'f1score': f1score,
            })
    return pd.DataFrame(records, columns=columns)

In [58]:
import time

# Evaluate every recommender/metric combination on all clusters and write one
# CSV of results per combination. A timestamp is printed before the first run
# and after each run so the cost of every combination is visible.
OUTPUT_DIR = './data/'
# to_csv does not create directories; make sure the folder exists up front so
# a long evaluation is not lost when the results are finally written.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# NOTE(review): the 'user' data files are used for the item-based runs as
# well. Confirm this is intended.
train_data, test_data = load_files('user')
topN_range = range(5,35,5)

print(time.strftime("%Y-%m-%d %X", time.localtime()))
for tag, recommender_class in (('userCF', UserCF), ('itemCF', ItemCF)):
    for metric in ('jaccard', 'cosine'):
        recommender = recommender_class(metric=metric)
        results = recommend_evaluation(recommender, topN_range)
        results.to_csv(OUTPUT_DIR + 'birch6_' + tag + '_' + metric + '.csv')
        print(time.strftime("%Y-%m-%d %X", time.localtime()))

2017-12-27 20:12:45
2017-12-27 20:18:03
2017-12-27 20:23:12
2017-12-27 23:18:36
2017-12-28 02:13:14
