In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied globally through rcParams
# so every figure drawn after this cell picks them up.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
# CHAPTER_ID is empty here, so IMAGES_PATH resolves to the "images" folder
# directly under PROJECT_ROOT_DIR (os.path.join just appends a trailing separator).
CHAPTER_ID = ""
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    # savefig does not create missing folders; make sure the target exists so a
    # fresh checkout does not fail on the first figure. (isdir + makedirs rather
    # than exist_ok=True because this notebook still supports Python 2.)
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

# Per-student table indexed by id_student; later cells look up its 'cluster'
# column by student id.
student_class = (
    pd.read_csv('./data/student_kmeans.csv')
    .set_index('id_student')
)
student_class.head()

### Generate user-item dict

In [None]:
import csv

def _count_interactions(csv_path, by_user, by_item=None):
    """Accumulate per-cluster interaction counts from one log file.

    Each row is read with ``csv.reader``; column 1 is parsed as the user id and
    column 2 as the item id (zero-based). Rows whose user id is not present in
    ``student_class.index`` are skipped. For the remaining rows the user's
    cluster is looked up in ``student_class`` and ``by_user[cluster][user][item]``
    is incremented; when ``by_item`` is given, ``by_item[cluster][item][user]``
    is incremented as well.

    NOTE(review): every non-empty row is parsed with int(), so the files are
    assumed to have no header row -- confirm against the data.
    """
    # `with` guarantees the handle is closed even if a row fails to parse.
    with open(csv_path) as data_file:
        for line in csv.reader(data_file):
            if not line:
                # csv.reader yields [] for blank lines; nothing to count.
                continue
            user_id = int(line[1])
            item_id = int(line[2])
            if user_id not in student_class.index:
                continue
            cluster = int(student_class.loc[user_id, 'cluster'])
            user_counts = by_user[cluster].setdefault(user_id, {})
            user_counts[item_id] = user_counts.get(item_id, 0) + 1
            if by_item is not None:
                item_counts = by_item[cluster].setdefault(item_id, {})
                item_counts[user_id] = item_counts.get(user_id, 0) + 1


def generate_user_data():
    """Build per-cluster interaction-count dicts and save them as .npy files.

    Reads ``./data/student_train.csv`` and ``./data/student_test.csv`` and
    writes three files under ``./data``:

    * ``user_train_data.npy``  -- {cluster: {user_id: {item_id: count}}}
    * ``item_train_data.npy``  -- {cluster: {item_id: {user_id: count}}}
    * ``user_test_data.npy``   -- {cluster: {user_id: {item_id: count}}}

    Clusters 0..4 are always present as keys, even when empty. A cluster value
    outside that range raises ``KeyError``.
    """
    clusters = range(0, 5)
    user_train_data = {i: {} for i in clusters}
    item_train_data = {i: {} for i in clusters}
    test_data = {i: {} for i in clusters}

    # Training log feeds both the user->item and the item->user view.
    _count_interactions('./data/student_train.csv', user_train_data, item_train_data)
    # Test log only needs the user->item view.
    _count_interactions('./data/student_test.csv', test_data)

    # np.save wraps each dict in a 0-d object array (i.e. it is pickled).
    np.save('./data/user_train_data.npy', user_train_data)
    np.save('./data/item_train_data.npy', item_train_data)
    np.save('./data/user_test_data.npy', test_data)
    print('end of task')

# read files
def load_files(method):
    """Load the saved train/test dictionaries.

    Parameters
    ----------
    method : str
        Prefix of the training file to load; the file read is
        ``./data/<method>_train_data.npy`` (this notebook uses 'user' and 'item').

    Returns
    -------
    (train_data, test_data)
        ``train_data`` comes from the ``<method>`` training file; ``test_data``
        always comes from ``./data/user_test_data.npy`` regardless of ``method``.
    """
    # The files hold Python dicts stored by np.save as pickled 0-d object arrays.
    # np.load refuses those by default since NumPy 1.16.3, so opt in explicitly.
    # NOTE: unpickling can execute arbitrary code -- only load files produced by
    # this notebook, never .npy files from an untrusted source.
    train_data = np.load('./data/' + method + '_train_data.npy', allow_pickle=True).item()
    test_data = np.load('./data/user_test_data.npy', allow_pickle=True).item()
    return train_data, test_data

### Distance matrix function

In [None]:
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_distances # 1-cos similarity
from sklearn.metrics.pairwise import euclidean_distances

def pearson_distances(array1, array2):
    """Pearson distance (1 - r) between two vectors.

    Both inputs are flattened first, so 1-D sequences, row vectors and column
    vectors are all accepted. The result is returned as a 1x1 array so that it
    can be indexed with ``[0][0]`` exactly like the sklearn pairwise functions
    this notebook uses alongside it.

    Returns
    -------
    numpy.ndarray of shape (1, 1)
        ``1 - r``, in the range [0, 2]. NaN when fewer than two points are
        supplied (correlation is undefined) or when ``pearsonr`` itself
        yields NaN (e.g. a constant input).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    x = np.asarray(array1, dtype=float).ravel()
    y = np.asarray(array2, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError('pearson_distances needs inputs of equal length, got %d and %d'
                         % (x.size, y.size))
    if x.size < 2:
        # pearsonr rejects inputs shorter than 2; report "undefined" instead.
        return np.array([[np.nan]])
    # pearsonr returns (r, p-value); only the coefficient is needed.
    r, _ = pearsonr(x, y)
    return np.array([[1.0 - r]])

def generate_distance_matrix(train, metric='euclidean'):
    """Build the symmetric pairwise distance matrix for the entries of ``train``.

    Parameters
    ----------
    train : dict
        Maps each key (row/column label of the result) to the dict that is
        handed to ``distance``.
    metric : str
        Forwarded to ``distance`` for every pair.

    Returns
    -------
    pandas.DataFrame
        float32 frame indexed and labelled by the keys of ``train``. Pairs for
        which ``distance`` returns NaN are filled with the largest distance
        found in the matrix (this appears to be what the earlier
        ``fillna(max(df))`` call was meant to do); if every pair is NaN the
        NaNs are left in place.
    """
    keys = list(train.keys())
    size = len(keys)
    # Fill a plain numpy buffer and wrap it once at the end: much cheaper than
    # element-wise DataFrame writes, and values are cast to float32 on assignment.
    matrix = np.full((size, size), np.nan, dtype=np.float32)
    for row, index1 in enumerate(keys):
        # The matrix is symmetric, so only the upper triangle is computed.
        for col in range(row, size):
            value = distance(train[index1], train[keys[col]], metric=metric)
            matrix[row, col] = value
            matrix[col, row] = value

    missing = np.isnan(matrix)
    if missing.any() and not missing.all():
        matrix[missing] = np.nanmax(matrix)
    return pd.DataFrame(matrix, index=keys, columns=keys)

def distance(dict1, dict2, metric='euclidean'):
    """Distance between two sparse vectors, restricted to their shared keys.

    Parameters
    ----------
    dict1, dict2 : dict
        Map key -> numeric value. Only keys present in both are compared.
    metric : {'euclidean', 'cosine', 'pearson'}
        Which distance function to apply.

    Returns
    -------
    float
        The distance over the shared keys, or NaN when the two dicts have no
        key in common.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    if metric not in ('euclidean', 'cosine', 'pearson'):
        raise ValueError("unknown metric %r; expected 'euclidean', 'cosine' or 'pearson'"
                         % (metric,))

    common = set(dict1.keys()) & set(dict2.keys())
    if len(common) == 0:
        return float('NaN')

    if metric == 'euclidean':
        fn = euclidean_distances
    elif metric == 'cosine':
        fn = cosine_distances
    else:
        fn = pearson_distances

    # The pairwise functions expect (n_samples, n_features). Each dict is ONE
    # sample whose features are the shared keys, hence shape (1, n). With
    # (n, 1) the call would compare n one-feature samples and [0][0] would be
    # the distance between the first pair of elements only.
    array1 = np.array([dict1[key] for key in common]).reshape(1, -1)
    array2 = np.array([dict2[key] for key in common]).reshape(1, -1)
    return fn(array1, array2)[0][0]

def get_distance_matrix(name):
    """Read ``./data/<name>.csv`` into a DataFrame.

    No ``index_col`` is passed to ``read_csv``, so for matrices written with a
    plain ``DataFrame.to_csv(path)`` the saved row labels come back as an
    ordinary first column rather than as the index.
    """
    csv_path = ''.join(('./data/', name, '.csv'))
    return pd.read_csv(csv_path)

In [None]:
def generate_all_files():
    """Write every distance matrix (2 views x 5 clusters x 3 metrics) to ./data.

    For the 'user' view and then the 'item' view, the training dict is loaded
    once and, for each cluster 0..4, a euclidean, cosine and pearson matrix is
    generated and saved as ``./data/<view>_<tag>_matrix_<cluster>.csv``. A
    progress line is printed after each file.
    """
    # (metric passed to generate_distance_matrix, tag used in the file name)
    # NOTE(review): the pearson files are saved with the tag 'pearsonn' (double
    # n) while the progress message says 'pearson'. Looks like a typo, but the
    # name is kept as-is because readers of these files may rely on it.
    metric_tags = (
        ('euclidean', 'euclidean'),
        ('cosine', 'cosine'),
        ('pearson', 'pearsonn'),
    )
    for view in ('user', 'item'):
        train_data, _ = load_files(view)
        for cluster in range(0, 5):
            for metric_name, file_tag in metric_tags:
                matrix = generate_distance_matrix(train_data[cluster], metric=metric_name)
                matrix.to_csv('./data/' + view + '_' + file_tag + '_matrix_' + str(cluster) + '.csv')
                print(view + '_' + metric_name + '_matrix...done')