In [1]:
%matplotlib inline

from pathlib import Path
import heapq
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from scipy.spatial.distance import cosine
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split

# Base directory for input files: the absolute path of the current working directory.
DATA = Path('.').resolve()

In [2]:
# Read the ratings table, normalise the column labels (trim surrounding
# whitespace, spaces -> underscores), and index the rows by the first,
# unnamed CSV column, which pandas labels 'Unnamed: 0' before normalisation.
courserating_df = (
    pd.read_csv(DATA / 'courserating.csv')
    .rename(columns=lambda label: label.strip().replace(' ', '_'))
    .set_index('Unnamed:_0')
)
courserating_df

Unnamed: 0_level_0,SQL,Spatial,PA1,DM_in_R,Python,Forecast,R_Prog,Hadoop,Regression
Unnamed:_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LN,4.0,,,,3.0,2.0,4.0,,2.0
MH,3.0,4.0,,,4.0,,,,
JH,2.0,2.0,,,,,,,
EN,4.0,,,4.0,,,4.0,,3.0
DU,4.0,4.0,,,,,,,
FL,,4.0,,,,,,,
GL,,4.0,,,,,,,
AH,,3.0,,,,,,,
SA,,,4.0,,,,,,
RW,,,2.0,,,,,4.0,


Just reviewing the data visually, without doing much calculation, which user(s) would you consider most similar to E.N.? (2 points) 

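L.N. is most similar to E.N.: both rated SQL, R_Prog, and Regression, and their ratings for those courses are identical or within one point of each other (4 vs. 4, 4 vs. 4, and 2 vs. 3).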

Use Python to compute the cosine similarity between users. (4 points)

In [3]:
def cosine_similarity_NA(data):
    """Pairwise cosine similarity between the rows of ``data``, ignoring NaNs.

    For every pair of rows, the similarity is computed only over the columns
    in which *both* rows hold a non-NaN value. Pairs that share no such
    column are left as NaN in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table whose rows are the vectors to compare; NaN marks a
        missing entry.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric frame whose index and columns are both
        ``data.index``; entry (i, j) is ``1 - cosine(row_i, row_j)``
        restricted to the columns both rows have in common.
    """
    n_rows = data.shape[0]
    similarity = np.full((n_rows, n_rows), np.nan)
    for a in range(n_rows):
        row_a = data.iloc[a]
        rated_a = ~np.isnan(row_a)
        # Only the upper triangle (b >= a) is computed; the value is mirrored.
        for b in range(a, n_rows):
            row_b = data.iloc[b]
            shared = rated_a & ~np.isnan(row_b)
            if not np.any(shared):
                # No column in common: keep the NaN placeholder.
                continue
            # scipy's `cosine` is a distance, so similarity = 1 - distance.
            score = 1 - cosine(row_a[shared], row_b[shared])
            similarity[a, b] = score
            similarity[b, a] = score
    return pd.DataFrame(similarity, columns=data.index, index=data.index)
# Display the student-by-student similarity matrix for the course ratings.
cosine_similarity_NA(courserating_df)

Unnamed:_0,LN,MH,JH,EN,DU,FL,GL,AH,SA,RW,BA,MG,AF,KG,DS
Unnamed:_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LN,1.0,0.96,1.0,0.9891,1.0,,,,,,,1.0,,,1.0
MH,0.96,1.0,0.989949,1.0,0.989949,1.0,1.0,1.0,,,,,,,1.0
JH,1.0,0.989949,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0
EN,0.9891,1.0,1.0,1.0,1.0,,,,,,,,,,0.96225
DU,1.0,0.989949,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0
FL,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
GL,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
AH,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
SA,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,
RW,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,


Based on the cosine similarities of the nearest students to E.N., which course should be recommended to E.N.? (4 points)

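The Spatial course should be recommended to E.N.: the students with cosine similarity 1.0 to E.N. (M.H., J.H., and D.U.) have all rated Spatial, which E.N. has not yet rated.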

If the goal is still to find a recommendation for E.N., for which course pairs is it possible and useful to calculate correlations? (2 points)

It is useful to calculate the correlation for the SQL and Spatial course pair.

Just looking at the data, and without yet calculating course pair correlations, which course would you recommend to E.N., relying on item-based filtering? Calculate two course pair correlations involving your guess and report the results. (2 points)

Relying on item-based filtering, my guess is that the Spatial course would be recommended to E.N.

In [4]:
# Pairwise Pearson correlation between the course columns.
# NaN entries in the displayed result mark pairs for which no correlation
# could be computed — presumably because too few students rated both courses.
courserating_df.corr(method='pearson')

Unnamed: 0,SQL,Spatial,PA1,DM_in_R,Python,Forecast,R_Prog,Hadoop,Regression
SQL,1.0,0.866025,,,-1.0,,,,
Spatial,0.866025,1.0,,,,,,,
PA1,,,1.0,,,,,,
DM_in_R,,,,1.0,,,,,
Python,-1.0,,,,1.0,,,,
Forecast,,,,,,1.0,,,
R_Prog,,,,,,,,,
Hadoop,,,,,,,,,
Regression,,,,,,,,,1.0


Apply item-based collaborative filtering to the dataset (using Python) and based on the results, recommend a course to E.N. (6 points)

In [5]:
# Reshape the wide student-by-course table into long-format
# [student, course, rating] records, skipping courses a student has not rated.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
rating_records = [
    [student, course, value]
    for student, row in courserating_df.iterrows()
    for course, value in row.items()
    if not np.isnan(value)
]
ratings = pd.DataFrame(rating_records, columns=['student', 'course', 'rating'])

# Load the long-format ratings into surprise and train on every rating
# (no hold-out split).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['student', 'course', 'rating']], reader)
trainset = data.build_full_trainset()

sim_options = {'name': 'cosine', 'user_based': False}  # compute cosine similarities between items
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Print the estimated rating of every course for student 'EN'. The loop covers
# all course columns, including those 'EN' has already rated.
courses = courserating_df.columns
for course in courses:
    print(course, algo.predict('EN', course).est)

Computing the cosine similarity matrix...
Done computing similarity matrix.
SQL 3.7504416393899813
Spatial 4.0
PA1 3.433333333333333
DM_in_R 3.743416490252569
Python 3.6621621621621623
Forecast 3.6666666666666665
R_Prog 3.7504416393899813
Hadoop 3.433333333333333
Regression 3.747548783981962


Based on the item-based collaborative filtering results, the Spatial course can be recommended to E.N., since it has the highest predicted rating (4.0).