In [203]:
import os
import csv
from numpy.core.numeric import normalize_axis_tuple
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import numpy as np
from datetime import datetime
import pickle

from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

def parse(filename):
    reader = csv.reader(open(filename, 'r', encoding='utf8'))

    attr = next(reader)
    data = []

    for _, line in enumerate(reader):
        d = {}
        for idx in range(len(line)):
            d[attr[idx]] = line[idx]
        data.append(d)

    return data

def select_attr(data):
    filtered = []
    keys_to_extract = ['title', 'make', 'manufactured', 'type_of_vehicle', 'eco_category', 'price']
    for item in data:
        subset = {key: item[key] for key in keys_to_extract}
        filtered.append(subset)
    return filtered

# def get_vector(d):
#     vector = []
#     for k, v in d.items():
#         if k == 

# nominal2value = {} 
# This is a dictionary, where the key is the attribute name
# Each element is also a dictionary, where the key is "value2idx" and "onehot_matrix"
# value2idx is to map an attribute value to the index
# With the index from value2idx, we can use the onehot_matrix[idx] to get the corresponding vector for that value
def date2value(s):
    if len(s) == 0:
        value = 0
    else:
        value = datetime.strptime(s, '%d-%b-%Y').toordinal()
    return value 

attr_ignored = ['listing_id', 'title', 'description', 'features', 'accessories',
                'model']

def analyze_attribute(data):
#     attrs = list(data[0].keys())
    attrs = ['title', 'make', 'manufactured', 'type_of_vehicle', 'eco_category', 'price']
#     attrs = ['title', 'price']
    nominal2value = {}
    for key in attrs:
        if key in attr_ignored:
            continue
        if key == 'price':
            nominal2value[key] = {}
            nominal2value[key]['value2vec'] = lambda x: float(x.strip())

        set_attr = set()
        for elm in data:
            if key in ['original_reg_date', 'reg_date', 'lifespan' ]:
                nominal2value[key] = {}
                nominal2value[key]['value2vec'] = date2value
            elif key in ['curb_weight', 'power', 'engine_cap', \
                         'depreciation', 'coe', 'road_tax', \
                         'dereg_value', 'mileage', 'omv', \
                         'arf']: # ratio
                nominal2value[key] = {}
                nominal2value[key]['value2vec'] = lambda x: float(x.strip()) if len(x.strip()) != 0 else -1
            else:
                value = elm[key].strip()
                set_attr.add(value)
        if 0 < len(set_attr) < 300: # If one attribute only has a small number of value set, we index them
            # if key not in nominal2value:
                # nominal2value[key] = {}
            set_attr.add('') # For unseen data
            nominal2value[key] = {}
            nominal2value[key]['idx2value'] = list(set_attr)
            nominal2value[key]['value2idx'] = {value:idx for idx, value in enumerate(nominal2value[key]['idx2value'])}
            arr = np.asarray([[v] for v in nominal2value[key]['idx2value']])
            encoder = OneHotEncoder(sparse=False)
            nominal2value[key]['onehot_matrix'] = encoder.fit_transform(arr)
            nominal2value[key]['value2vec'] = {value:nominal2value[key]['onehot_matrix'][idx] for value, idx in nominal2value[key]['value2idx'].items()}
            print('%s is added as a nominal, whose size is %d'%(key, len(set_attr)))
        elif key in nominal2value:
            print('Attribute "%s" is added as a function'%(key))
        else:
            print('Attribute "%s" needs care... The size is %d.'%(key, len(set_attr)))
            print('Example value:')
            for _ in range(5):
                print(data[_][key])
            raise ValueError
        # print(nominal2value[key])
        # print(nominal2value)
        # assert False
    return attrs, nominal2value

def get_vector(d, nominal2value, attrs, has_label):
    """
        attrs is a list of attributes excluding the price. It is used to order the vector
    """
    vector = []
    for attr in attrs:
        if not has_label and attr == 'price':
            continue
        if attr in attr_ignored:
            continue
        value = d[attr]
        if isinstance(nominal2value[attr]['value2vec'], dict):
            if value not in nominal2value[attr]['value2vec']: # This value is unseen value for that attribute
                value = ''
            vec = nominal2value[attr]['value2vec'][value]
        else:
            vec = nominal2value[attr]['value2vec'](value)
        if vec is None:
            print(attr, value)

        if isinstance(vec, list) or isinstance(vec, np.ndarray):
            vector += [*vec]
        else:
            vector += [vec]
    return vector



def build_vectors(data, nominal2value, attrs, has_label=True):
    vectors = []
    for idx, elm in enumerate(data):
        vector = get_vector(elm, nominal2value, attrs, has_label)
        vectors.append(vector)
    return np.float32(vectors)

data_train = parse('data/train.csv')
data_test = parse('data/test.csv')

print(data_train[0].keys())

# print(data_train)

data_train_pivot = select_attr(data_train)
print(data_train_pivot[0].keys())


dict_keys(['listing_id', 'title', 'make', 'model', 'description', 'manufactured', 'original_reg_date', 'reg_date', 'type_of_vehicle', 'category', 'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category', 'features', 'accessories', 'indicative_price', 'price'])
dict_keys(['title', 'make', 'manufactured', 'type_of_vehicle', 'eco_category', 'price'])


In [204]:
attrs, nominal2value = analyze_attribute(data_train_pivot)

data_train_vec = build_vectors(data_train_pivot, nominal2value, attrs)
# data_test_vec = build_vectors(data_test, nominal2value, attrs, has_label=False)

print(data_train_vec[0])

make is added as a nominal, whose size is 78
manufactured is added as a nominal, whose size is 62
type_of_vehicle is added as a nominal, whose size is 12
eco_category is added as a nominal, whose size is 2
Attribute "price" is added as a function
[0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00

In [205]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

data_train_vec_matrix = csr_matrix(data_train_vec)

print(data_train_vec_matrix)

model_knn = NearestNeighbors(metric = 'euclidean', algorithm = 'brute')
model_knn.fit(data_train_vec_matrix)

  (0, 7)	1.0
  (0, 130)	1.0
  (0, 143)	1.0
  (0, 153)	1.0
  (0, 154)	71300.0
  (1, 0)	1.0
  (1, 131)	1.0
  (1, 151)	1.0
  (1, 153)	1.0
  (1, 154)	43800.0
  (2, 48)	1.0
  (2, 133)	1.0
  (2, 143)	1.0
  (2, 153)	1.0
  (2, 154)	95500.0
  (3, 48)	1.0
  (3, 136)	1.0
  (3, 143)	1.0
  (3, 153)	1.0
  (3, 154)	197900.0
  (4, 0)	1.0
  (4, 136)	1.0
  (4, 144)	1.0
  (4, 153)	1.0
  (4, 154)	103200.0
  :	:
  (16779, 76)	1.0
  (16779, 132)	1.0
  (16779, 149)	1.0
  (16779, 153)	1.0
  (16779, 154)	144400.0
  (16780, 29)	1.0
  (16780, 134)	1.0
  (16780, 142)	1.0
  (16780, 153)	1.0
  (16780, 154)	70200.0
  (16781, 50)	1.0
  (16781, 127)	1.0
  (16781, 147)	1.0
  (16781, 153)	1.0
  (16781, 154)	71300.0
  (16782, 0)	1.0
  (16782, 136)	1.0
  (16782, 142)	1.0
  (16782, 153)	1.0
  (16782, 154)	81200.0
  (16783, 3)	1.0
  (16783, 135)	1.0
  (16783, 147)	1.0
  (16783, 153)	1.0
  (16783, 154)	638000.0


NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [206]:
query_index = np.random.choice(data_train_vec.shape[0])
distances, indices = model_knn.kneighbors(data_train_vec_matrix[query_index], n_neighbors = 6)
print(query_index, indices)

5763 [[ 3526  3764 16032  5763  1502   818]]


In [208]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(data_train[indices.flatten()[i]]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, data_train[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for {'listing_id': '1021516', 'title': 'Toyota Allion 1.5A (COE till 02/2024)', 'make': 'toyota', 'model': 'allion', 'description': '', 'manufactured': '2008.0', 'original_reg_date': '', 'reg_date': '02-feb-2009', 'type_of_vehicle': 'mid-sized sedan', 'category': 'coe car, direct owner sale', 'transmission': 'auto', 'curb_weight': '1200.0', 'power': '81.0', 'fuel_type': '', 'engine_cap': '1496.0', 'no_of_owners': '4.0', 'depreciation': '7710.0', 'coe': '12864.0', 'road_tax': '887.0', 'dereg_value': '6273.0', 'mileage': '160000.0', 'omv': '16308.0', 'arf': '16308.0', 'opc_scheme': '', 'lifespan': '', 'eco_category': 'uncategorized', 'features': 'car in very good condition. road tax just renewed. view specs of the toyota allion', 'accessories': '', 'indicative_price': '', 'price': '20700.0'}:

1: {'listing_id': '1030959', 'title': 'Toyota Vios 1.5A E (COE till 02/2024)', 'make': 'toyota', 'model': 'vios', 'description': '700', 'manufactured': '2008.0', 'original_reg_date'