# Task 2: Recommendation Engine

The dataset used in this notebook is the 'train.csv' provided in the task 1. It can also be downloaded at https://drive.google.com/file/d/1wWryyYaBhI7bAylFUPmWQf0gOGXolK86/view?usp=sharing


## Setting up the Notebook

In [21]:
import pandas as pd
import os
import csv
import numpy as np
from numpy.core.numeric import normalize_axis_tuple
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from datetime import datetime
import pickle 
from scipy import spatial

np.random.seed(1)

## Load the Data and Convert into Vectors

In [22]:
df_sample = pd.read_csv('train.csv')

df_sample.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price
0,1030324,BMW 3 Series 320i Gran Turismo M-Sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,09-dec-2013,luxury sedan,"parf car, premium ad car, low mileage car",auto,1560.0,135.0,,1997.0,1.0,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,,,uncategorized,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",,71300.0
1,1021510,Toyota Hiace 3.0M,,hiace,high loan available! low mileage unit. wear an...,2014.0,,26-jan-2015,van,premium ad car,manual,1740.0,,diesel,2982.0,3.0,11630.0,10660.0,,3648.0,110112.0,27502.0,1376.0,,25-jan-2035,uncategorized,low mileage unit. well maintained vehicle. vie...,factory radio setting. front recording camera....,,43800.0
2,1026909,Mercedes-Benz CLA-Class CLA180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,25-jul-2016,luxury sedan,"parf car, premium ad car",auto,1430.0,90.0,,1595.0,1.0,15070.0,53694.0,740.0,44517.0,80000.0,27886.0,26041.0,,,uncategorized,responsive and fuel efficient 1.6l inline 4 cy...,dual electric/memory seats. factory fitted aud...,,95500.0
3,1019371,Mercedes-Benz E-Class E180 Avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,17-nov-2020,luxury sedan,"parf car, almost new car, consignment car",auto,1635.0,115.0,,1497.0,1.0,16400.0,40690.0,684.0,80301.0,9800.0,46412.0,56977.0,,,uncategorized,"1.5l inline-4 twin scroll turbocharged engine,...",64 colour ambient lighting. active parking ass...,,197900.0
4,1031014,Honda Civic 1.6A VTi,,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,20-sep-2019,mid-sized sedan,parf car,auto,1237.0,92.0,,1597.0,1.0,10450.0,26667.0,742.0,36453.0,40000.0,20072.0,20101.0,,,uncategorized,"1.6l i-vtec engine, 123 bhp, earth dreams cvt ...","s/rims, premium leather seats, factory touch s...",,103200.0


In [23]:
def parse(filename):
    reader = csv.reader(open(filename, 'r', encoding='utf8'))

    attr = next(reader)
    data = []

    for _, line in enumerate(reader):
        d = {}
        for idx in range(len(line)):
            d[attr[idx]] = line[idx]
        data.append(d)

    return data


def date2value(s):
    if len(s) == 0:
        value = -1
    else:
        # value = datetime.strptime(s, '%d-%b-%Y').toordinal()
        value = int(s[-4:])
    return value 

attr_ignored = ['listing_id', 'title', 'description', 'features', 'accessories',
                'model', 
                'no_of_owners',  
                # 'original_reg_date',
                 'opc_scheme', 'category']


def get_nominal_matrix(values):
    s = set(values)
    s.add('')
    k = {}
    idx2value = list(s)
    value2idx = {value:idx for idx, value in enumerate(idx2value)}
    arr = np.asarray([[v] for v in idx2value])

    encoder = OneHotEncoder(sparse=False)
    onehot_matrix = encoder.fit_transform(arr)

    value2vec = {value:onehot_matrix[idx] for value, idx in value2idx.items()}
    return value2vec


def analyze_attribute(data):
    attrs = list(data[0].keys())
    attr2vec = {}
    data_cleaned  =[]
    for key in attrs:
        if key in attr_ignored:
            continue
        if key == 'price':
            attr2vec[key] = {}
            attr2vec[key] = lambda x: float(x.strip())

        set_attr = set()
        for elm in data:
            # Special consideration for some keys...
            if key == 'make' and elm[key] == '':
                elm[key] = elm['title'].split(' ')[0].lower()
                # print('add %s'%elm[key]
            if key == 'original_reg_date' and elm[key] == '':
                elm['original_reg_date'] = elm['reg_date']

            # if key in ['original_reg_date', 'reg_date', 'lifespan' ]:
            if key in ['reg_date', 'lifespan', 'original_reg_date']:
                attr2vec[key] = date2value
            elif key in ['curb_weight', 'power', 'engine_cap', \
                         'depreciation', 'coe', 'road_tax', \
                         'dereg_value', 'mileage', 'omv', \
                         'arf']: # ratio
                attr2vec[key] = lambda x: float(x.strip()) if len(x.strip()) != 0 else -1
            else:
                value = elm[key].strip()
                set_attr.add(value.lower())
        if 0 < len(set_attr) < 700: # If one attribute only has a small number of value set, we index them
            attr2vec[key] = get_nominal_matrix(set_attr)
            print('%s is added as a nominal, whose size is %d'%(key, len(set_attr)))
        elif key in attr2vec:
            print('Attribute "%s" is added as a function'%(key))
        else:
            # print(key, len(set_attr))
            raise ValueError
    return attrs, attr2vec


def get_vector(d, attr2vec, attrs, has_label):
    """
        attrs is a list of attributes excluding the price. It is used to order the vector
    """
    vector = []
    vec = None
    for attr in attrs:
        if not has_label and attr == 'price':
            continue
        if attr in attr_ignored:
            continue
        value = d[attr]

        # Special consideration
        if attr == 'make' and value == '':
            value = d['title'].split(' ')[0].lower()
        if attr == 'original_reg_date' and value == '':
            value = d['reg_date']
        if attr in attr2vec:
            if hasattr(attr2vec[attr], 'shape') or isinstance(attr2vec[attr], dict): # 2 ways of indexing...
                if value not in attr2vec[attr]:
                    value = ''
                vec = attr2vec[attr][value]
            else:
                vec = attr2vec[attr](value)
        # if vec is None:
            # print(attr, value)

        if isinstance(vec, list) or isinstance(vec, np.ndarray):
            vector += [*vec]
        else:
            vector += [vec]
    return vector


def build_vectors(data, attr2vec, attrs, has_label=True):
    vectors = []
    for idx, elm in enumerate(data):
        vector = get_vector(elm, attr2vec, attrs, has_label)
        vectors.append(vector)
    return np.float32(vectors)


In [24]:
data_train = parse('train.csv')
attrs, nominal2value = analyze_attribute(data_train)
data_train_vec = build_vectors(data_train, nominal2value, attrs)

print('vectors size: [', len(data_train_vec), ', ', len(data_train_vec[0]),']')

make is added as a nominal, whose size is 77
manufactured is added as a nominal, whose size is 62
Attribute "original_reg_date" is added as a function
Attribute "reg_date" is added as a function
type_of_vehicle is added as a nominal, whose size is 11
transmission is added as a nominal, whose size is 2
Attribute "curb_weight" is added as a function
Attribute "power" is added as a function
fuel_type is added as a nominal, whose size is 5
Attribute "engine_cap" is added as a function
Attribute "depreciation" is added as a function
Attribute "coe" is added as a function
Attribute "road_tax" is added as a function
Attribute "dereg_value" is added as a function
Attribute "mileage" is added as a function
Attribute "omv" is added as a function
Attribute "arf" is added as a function
Attribute "lifespan" is added as a function
eco_category is added as a nominal, whose size is 1
indicative_price is added as a nominal, whose size is 1
Attribute "price" is added as a function
vectors size: [ 16784 

## Computing the Top Recommendations

The method `get_top_recommendations()` shows an example of how to get the top recommendations for a given data sample (data sample = row in the dataframe of the dataset). The input is a row from the dataset and a list of optional input parameters which will depend on your approach; `k` is the number of returned recommendations seems useful, though.

The output should be a `pd.DataFrame` containing the recommendations. The output dataframe should have the same columns as the row + any additional columns you deem important (e.g., any score or tags that you might want to add to your recommendations).

In principle, the method `get_top_recommendations()` may be imported from a external Python (.py) script as well.

In [25]:
def content_based_item_similarity(data_train_vec, row_index, k):
    data_train_vec_matrix = csr_matrix(data_train_vec)
    # row_matrix = csr_matrix([row])

    model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
    model_knn.fit(data_train_vec_matrix)

    distances, indices = model_knn.kneighbors(data_train_vec_matrix[row_index], n_neighbors = k+1)
    results = []
    for i in range(0, len(distances.flatten())):
        if i == 0:
            # print('Recommendations for {0}:, with distance of {1}\n'.format(data_train[indices.flatten()[i]], distances.flatten()[i]))
            pass
        else:
            results.append(data_train[indices.flatten()[i]])
            # print('{0}: {1}, with distance of {2}:'.format(i, data_train[indices.flatten()[i]], distances.flatten()[i]))

    return results

In [26]:
def user_based_cf(k, browsing_history):
    browsing_history_current_user = browsing_history[0]
    tree = spatial.KDTree(browsing_history[1:])
    dist, i = tree.query(browsing_history_current_user)
    # print('{0}: {1}, with distance of {2}:'.format(i+1, browsing_history[i+1], dist))

    indices = [i for i, value in enumerate(browsing_history[i+1]) if value == 1 and browsing_history[0][i] != 1]
    recommendations = np.random.choice(indices, k)
    results = df_sample.iloc[recommendations]
    results = pd.DataFrame(results)

    return results

In [47]:
def matrix_factorization(R, W, H, K, steps=5000, alpha=0.0002, beta=0.02):

    H = H.T

    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(W[i,:],H[:,j])
                    for k in range(K):
                        W[i][k] = W[i][k] + alpha * (2 * eij * H[k][j] - beta * W[i][k])
                        H[k][j] = H[k][j] + alpha * (2 * eij * W[i][k] - beta * H[k][j])
        eR = np.dot(W,H)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(W[i,:],H[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(W[i][k],2) + pow(H[k][j],2))
        if e < 0.001:
            break

    return W, H.T


def model_based_cf(k, browsing_history):
    R = browsing_history
    num_features = 3

    W = np.random.rand(len(R), num_features)
    H = np.random.rand(len(R[0]), num_features)

    nP, nQ = matrix_factorization(R, W, H, num_features, 100)

    nR = np.dot(nP, nQ.T)
    print(nR[0])

    # set browsed items to 0
    for i in range(len(browsing_history)):
        if browsing_history[0][i] == 1:
            nR[0][i] = 0
    print(nR[0])
    # indices = [i for i, value in enumerate(browsing_history[i+1]) if value == 1 and browsing_history[0][i] != 1]
    recommendations = nR[0].argsort()[-k:][::-1]
    # recommendations = np.random.choice(indices, k)
    results = df_sample.iloc[recommendations]
    results = pd.DataFrame(results)

    return results

In [48]:
def get_top_recommendations(row, **kwargs) -> pd.DataFrame:
    
    #####################################################
    ## Initialize the required parameters
    
    # The number of recommendations seem recommended
    # Additional input parameters are up to you
    k = None
    
    # Extract all **kwargs input parameters
    # and set the used paramaters (here: k)
    for key, value in kwargs.items():
        if key == 'k':
            k = value
            
    # split the k recommendations into 3 parts where each part represents a type of recommendation   
    k_1 = k//3 + 1 if k%3 > 0 else k//3 
    k_2 = k//3 + 1 if k%3 > 1 else k//3 
    k_3 = k - k_1 - k_2 
    print(k_1, k_2, k_3)

    #####################################################
    ## Compute your recommendations
    #
    # This is where your magic happens. Of course, you can call methods
    # defined in this notebook or in external Python (.py) scripts
    
    index = df_sample.index
    condition = df_sample["listing_id"] == row["listing_id"]
    row_index = index[condition][0]
    results = content_based_item_similarity(data_train_vec, row_index, k_1)
    df_result = pd.DataFrame.from_dict(results)

    # simulate user's browsing history
    # assume 10 users, and the first user is our current user
    x = rand(10, len(data_train), density=0.005, format='csr')
    x.data[:] = 1
    browsing_history = x.A 
    df_result_2 = user_based_cf(k_2, browsing_history)

    df_result_3 = model_based_cf(k_3, browsing_history)
    df_result = pd.concat([df_result, df_result_2, df_result_3], ignore_index=True)
        
        
    # Return the dataset with the k recommendations
    return df_result


## Testing the Recommendation Engine

This will be the main part of your notebook to allow for testing your solutions. Most basically, for a given listing (defined by the row id in your input dataframe), we would like to see the recommendations you make. So however you set up your notebook, it should have at least a comparable section that will allow us to run your solution for different inputs.

### Pick a Sample Listing as Input

In [29]:
# Pick a row id of choice
row_id = 10
# row_id = 20
#row_id = 30
#row_id = 40
#row_id = 50

# Get the row from the dataframe (an valid row ids will throw an error)
row = df_sample.iloc[row_id]

# Just for printing it nicely, we create a new dataframe from this single row
pd.DataFrame([row])

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price
10,1004029,Kia Cerato K3 1.6A,kia,cerato,"1 owner, 1.98% interest, **2 years extended wa...",2017.0,,14-aug-2017,mid-sized sedan,parf car,auto,1295.0,95.3,,1591.0,1.0,8270.0,42801.0,738.0,35227.0,,12900.0,12900.0,,,uncategorized,"1.6l 4 cylinders, dual cvvt engine, 6-speed au...","leather seats, sports rims, factory fitted aud...",,61400.0


## Compute and Display the recommendations

Since the method `get_top_recommendations()` returns a `pd.DataFrame`, it's easy to display the result.

In [49]:
k = 10

df_recommendations = get_top_recommendations(row, k=k)

df_recommendations.head(k)

4 3 3
[1.41433364 1.16591494 0.66524349 ... 0.56796495 1.0117231  1.27560251]
[1.41433364 1.16591494 0.66524349 ... 0.56796495 1.0117231  1.27560251]


Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price
0,1020292,Hyundai Elantra 1.6A GLS S,hyundai,elantra,"hyundai specialist! 1 owner only. high spec ""s...",2017.0,07-sep-2017,07-sep-2017,mid-sized sedan,parf car,auto,1345.0,93.8,,1591.0,1.0,8380.0,42900.0,738.0,35398.0,,12674.0,12674.0,,,uncategorized,"1.6l cvvt engine, 125 bhp, 6 speed cvt automat...","leather seats, sports rims, factory fitted aud...",,62600.0
1,1006884,Kia Cerato K3 1.6A,kia,cerato,623,2017.0,20-jul-2017,20-jul-2017,mid-sized sedan,parf car,auto,1295.0,95.3,,1591.0,1.0,8120.0,42801.0,738.0,34171.0,,11884.0,11884.0,,,uncategorized,"1.6l 4 cylinders 16 valves, dohc dual cvvt eng...",multi-function steering wheel. leather seats. ...,,59300.0
2,1010188,Subaru XV 1.6i-S,subaru,xv,"elegant beige suv, all wheel drive, fully serv...",2017.0,28-apr-2017,28-apr-2017,suv,parf car,auto,1480.0,84.0,,1600.0,1.0,9730.0,50789.0,744.0,40358.0,,15387.0,15387.0,,,uncategorized,"airbags abs, powerful 1.6l boxei engine, linea...","leather seats, sports rims, audio system, blue...",,69200.0
3,1029266,Hyundai Elantra 1.6A GLS S,hyundai,elantra,1 owner. most sought after korean sedan. well ...,2017.0,23-oct-2017,23-oct-2017,mid-sized sedan,"parf car, premium ad car",auto,1345.0,93.8,,1591.0,1.0,8600.0,42900.0,738.0,35620.0,,12547.0,12547.0,,,uncategorized,1.6l dohc cvvt engine mated with 6 speed cvt a...,"leather seats, sports rims, original lcd playe...",,64700.0
4,1022978,Mitsubishi Fuso Canter FEA01,mitsubishi,fuso,let canter bring your business towards higher ...,2014.0,,22-sep-2014,truck,premium ad car,auto,2000.0,,diesel,2998.0,3.0,10340.0,52010.0,,16002.0,,28699.0,1435.0,,21-sep-2034,uncategorized,new paintwork.,cabin height freezer,,35000.0
5,1012498,Toyota Vios 1.5A E,toyota,vios,"1 owner with low mileage, service done, most f...",2019.0,,29-nov-2019,mid-sized sedan,"parf car, low mileage car",auto,1085.0,79.0,,1496.0,1.0,8360.0,32309.0,682.0,37046.0,17449.0,13787.0,13787.0,,,uncategorized,1.5l inline 4 cylinders 16 valve dohc vvt-i en...,"factory fitted audio systems, sport rims, reve...",,83600.0
6,990822,Toyota Sienta Hybrid 1.5A G,toyota,sienta,"high loan available, make and assemble in japa...",2019.0,,29-apr-2021,mpv,"parf car, almost new car, premium ad car, low ...",auto,,73.0,,1496.0,1.0,10890.0,49640.0,682.0,57941.0,652.0,25849.0,13189.0,,,uncategorized,fuel economical 1.5l 4 cylinders vvt-i dohc en...,"dvd with bluetooth, usb, reverse camera and se...",,123200.0
7,1006761,BMW M Series M135i 5DR,bmw,m135i,"m bootmod3 stage 1, ftp aluminium charge and b...",2013.0,,18-apr-2013,sports car,"parf car, premium ad car",auto,1445.0,235.0,,2979.0,4.0,22220.0,67010.0,2362.0,36612.0,104000.0,38956.0,46539.0,,,uncategorized,full factory pml unit. pls call for appt to vi...,full accessories. list too long to state!,,65800.0
8,1021409,Volvo V40 T2,volvo,v40,0,2015.0,,28-jan-2016,hatchback,"parf car, premium ad car",auto,1450.0,90.0,,1498.0,2.0,9860.0,51301.0,684.0,35824.0,106000.0,22664.0,18730.0,,,uncategorized,"1.5l inline 4 cylinders turbocharged engine, 6...","original factory audio system, alloy rims, mem...",,58300.0
9,1029708,Mercedes-Benz E-Class E200 Avantgarde,mercedes-benz,e200,strictly no dealers please. full digital dashb...,2016.0,,17-jan-2017,luxury sedan,"parf car, direct owner sale, low mileage car",auto,1605.0,135.0,,1991.0,2.0,19320.0,53001.0,1202.0,72662.0,43800.0,47935.0,59109.0,,,uncategorized,"2.0l 4 cylinders turbocharged engine, 184bhp, ...","led headlights, reverse camera/sensors with 36...",,146100.0
