In [236]:
import csv
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from numpy.core.numeric import normalize_axis_tuple
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from xgboost import XGBRegressor

def parse(filename):
    """Read a CSV file into a list of row dictionaries.

    The first line of the file is used as the header. Every following line
    becomes one dict mapping header name -> raw string value.

    Args:
        filename: Path of a UTF-8 encoded CSV file.

    Returns:
        A list with one dict per data row, in file order. An empty file
        yields an empty list. A row with fewer cells than the header yields
        a dict with only the leading keys; cells beyond the header width
        are dropped.
    """
    # newline='' is what the csv module asks for so that newlines embedded in
    # quoted fields are handled by the reader; the `with` block guarantees
    # the handle is closed even if parsing fails half-way.
    with open(filename, 'r', encoding='utf8', newline='') as handle:
        reader = csv.reader(handle)
        attr = next(reader, None)
        if attr is None:  # completely empty file: no header, no rows
            return []
        # zip() stops at the shorter of header/row, so a ragged row can no
        # longer raise IndexError.
        return [dict(zip(attr, line)) for line in reader]

def select_attr(data, keys_to_extract):
    """Project every record onto a fixed set of keys.

    Args:
        data: Iterable of dict-like records.
        keys_to_extract: Keys to keep, in the order they should appear.

    Returns:
        A new list of dicts, one per record, holding only the requested
        keys. A record that lacks one of the keys raises KeyError.
    """
    return [
        {name: record[name] for name in keys_to_extract}
        for record in data
    ]

# def get_vector(d):
#     vector = []
#     for k, v in d.items():
#         if k == 

# nominal2value is a dictionary keyed by attribute name.
# Each entry is itself a dictionary. For a nominal attribute it holds the keys
# "idx2value", "value2idx", "onehot_matrix" and "value2vec"; for any other
# attribute it holds only "value2vec" (a conversion function).
# value2idx maps an attribute value to its index.
# With that index, onehot_matrix[idx] is the one-hot vector for the value.
def date2value(s):
    """Convert a 'DD-Mon-YYYY' date string into a proleptic Gregorian ordinal.

    Args:
        s: Date text such as '09-dec-2013'. Surrounding whitespace is
            ignored, matching how the numeric converters in this file strip
            their input.

    Returns:
        ``datetime.toordinal()`` of the parsed date, or 0 when the string is
        empty or only whitespace (0 acts as the "missing" marker).

    Raises:
        ValueError: if the non-empty text does not match '%d-%b-%Y'.
    """
    s = s.strip()
    if not s:
        return 0
    # %b matches the locale's abbreviated month name.
    return datetime.strptime(s, '%d-%b-%Y').toordinal()

# Attributes that are never turned into features; analyze_attribute and
# get_vector both skip them.
attr_ignored = ['listing_id', 'title', 'description', 'features', 'accessories',
                'model']

def analyze_attribute(data, attrs):
    """Decide, per attribute, how a raw string value becomes numbers.

    Args:
        data: List of row dicts (attribute name -> raw string).
        attrs: Attribute names to analyse. Names listed in ``attr_ignored``
            are skipped.

    Returns:
        Dict keyed by attribute name. Every entry has a ``'value2vec'`` item:

        * for 'price', date attributes and ratio attributes it is a function
          taking the raw string and returning a number (ratio attributes
          return -1 for a blank string, dates follow ``date2value``);
        * for nominal attributes it is a dict mapping each stripped value to
          its one-hot row. Those entries also carry ``'idx2value'``,
          ``'value2idx'`` and ``'onehot_matrix'``. The empty string is always
          part of the vocabulary and doubles as the bucket for unseen values.

    Raises:
        ValueError: if a nominal attribute has no values at all or 300 or
            more distinct values, i.e. it needs manual treatment.
    """
    date_attrs = ('original_reg_date', 'reg_date', 'lifespan')
    ratio_attrs = ('curb_weight', 'power', 'engine_cap',
                   'depreciation', 'coe', 'road_tax',
                   'dereg_value', 'mileage', 'omv',
                   'arf')

    def parse_price(x):
        return float(x.strip())

    def parse_ratio(x):
        x = x.strip()
        return float(x) if len(x) != 0 else -1

    nominal2value = {}
    for key in attrs:
        if key in attr_ignored:
            continue

        # Attributes with a fixed conversion function are decided by name
        # alone. In particular 'price' must never reach the nominal branch
        # below, otherwise a data set with fewer than 300 distinct prices
        # would get its target one-hot encoded.
        if key == 'price':
            converter = parse_price
        elif key in date_attrs:
            converter = date2value
        elif key in ratio_attrs:
            converter = parse_ratio
        else:
            converter = None

        if converter is not None:
            nominal2value[key] = {'value2vec': converter}
            print('Attribute "%s" is added as a function'%(key))
            continue

        set_attr = {elm[key].strip() for elm in data}
        if 0 < len(set_attr) < 300: # If one attribute only has a small number of value set, we index them
            set_attr.add('') # For unseen data
            idx2value = list(set_attr)
            value2idx = {value: idx for idx, value in enumerate(idx2value)}
            # Row i is the one-hot vector of idx2value[i]; the hot column is
            # the value's rank in sorted order. This is the layout
            # OneHotEncoder produces when fitted on the unique values, built
            # with numpy so it does not depend on the sparse/sparse_output
            # keyword that differs between scikit-learn versions.
            rank = {value: pos for pos, value in enumerate(sorted(idx2value))}
            onehot_matrix = np.eye(len(idx2value))[[rank[value] for value in idx2value]]
            nominal2value[key] = {
                'idx2value': idx2value,
                'value2idx': value2idx,
                'onehot_matrix': onehot_matrix,
                'value2vec': {value: onehot_matrix[idx] for value, idx in value2idx.items()},
            }
            print('%s is added as a nominal, whose size is %d'%(key, len(set_attr)))
        else:
            print('Attribute "%s" needs care... The size is %d.'%(key, len(set_attr)))
            print('Example value:')
            # Slice instead of indexing 0..4 so short data cannot raise
            # IndexError before the intended ValueError.
            for elm in data[:5]:
                print(elm[key])
            raise ValueError('Attribute "%s" cannot be encoded automatically' % key)
    return nominal2value

def get_vector(d, nominal2value, attrs, has_label):
    """Turn one record into a flat feature list.

    Args:
        d: Row dict (attribute name -> raw string value).
        nominal2value: Mapping produced by ``analyze_attribute``; each entry's
            ``'value2vec'`` is either a dict (value -> vector) or a function
            (raw string -> number).
        attrs: Attribute names; their order fixes the order of the output.
            Names in ``attr_ignored`` are skipped.
        has_label: When False the 'price' attribute is skipped, so rows
            without a price can be vectorised.

    Returns:
        A flat Python list: list/array results are spliced in element by
        element, scalar results are appended as-is.
    """
    vector = []
    for attr in attrs:
        if attr in attr_ignored or (attr == 'price' and not has_label):
            continue
        value = d[attr]
        value2vec = nominal2value[attr]['value2vec']
        if isinstance(value2vec, dict):
            # The vocabulary was built from stripped values, so strip here as
            # well; otherwise ' auto' would be treated as unseen.
            if isinstance(value, str):
                value = value.strip()
            if value not in value2vec: # This value is unseen value for that attribute
                value = ''
            vec = value2vec[value]
        else:
            vec = value2vec(value)
        if vec is None:
            # Diagnostic only: report which attribute/value converted to None.
            print(attr, value)

        if isinstance(vec, (list, np.ndarray)):
            vector.extend(vec)
        else:
            vector.append(vec)
    return vector



def build_vectors(data, nominal2value, attrs, has_label=True):
    """Vectorise every record and stack the results into a float32 array.

    Args:
        data: Iterable of row dicts.
        nominal2value: Per-attribute conversion table passed to ``get_vector``.
        attrs: Attribute names passed to ``get_vector``.
        has_label: Passed to ``get_vector``; defaults to True.

    Returns:
        The ``np.float32`` conversion of the list of per-row vectors.
    """
    rows = [get_vector(record, nominal2value, attrs, has_label) for record in data]
    return np.float32(rows)

# Load the raw listings; parse() returns one dict of column name -> string per row.
data_train = parse('data/train.csv')
data_test = parse('data/test.csv')

# Show which columns the training CSV provides.
print(data_train[0].keys())

# Keep only the attributes used as features below. 'price' is listed last, so
# it becomes the last column of every feature vector built from `attrs`.
attrs = ['type_of_vehicle', 'transmission', 'fuel_type', 'eco_category', 'price']
data_train_pivot = select_attr(data_train, attrs)
print(data_train_pivot[0].keys())

# Print one complete raw record for reference.
print(data_train[0])


dict_keys(['listing_id', 'title', 'make', 'model', 'description', 'manufactured', 'original_reg_date', 'reg_date', 'type_of_vehicle', 'category', 'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category', 'features', 'accessories', 'indicative_price', 'price'])
dict_keys(['type_of_vehicle', 'transmission', 'fuel_type', 'eco_category', 'price'])
{'listing_id': '1030324', 'title': 'BMW 3 Series 320i Gran Turismo M-Sport', 'make': 'bmw', 'model': '320i', 'description': '1 owner! 320i gt m-sports model! big brake kit, m-sport steering wheel! very well taken care of by ex-owner! in elegant white! big boot space as well! immaculate showroom condition! special promotional bank interest rates in house finance available with competitive interest rates. call today to enquire!', 'manufactured': '2013.0', 'original_reg_date': '', 'reg_date': '09-dec-2013', 't

In [237]:
# Preview the first selected rows as a table. Requires `import pandas as pd`
# in the notebook's import cell.
pd.DataFrame(data_train_pivot).head()

Unnamed: 0,type_of_vehicle,transmission,fuel_type,eco_category,price
0,luxury sedan,auto,,uncategorized,71300.0
1,van,manual,diesel,uncategorized,43800.0
2,luxury sedan,auto,,uncategorized,95500.0
3,luxury sedan,auto,,uncategorized,197900.0
4,mid-sized sedan,auto,,uncategorized,103200.0


In [238]:
# Learn the per-attribute encodings from the selected training columns.
nominal2value = analyze_attribute(data_train_pivot, attrs)

# One float32 row per listing: the one-hot blocks in `attrs` order, followed
# by the price (has_label defaults to True, so the price column is kept).
data_train_vec = build_vectors(data_train_pivot, nominal2value, attrs)
# Vectorising the test set is currently disabled:
# data_test_vec = build_vectors(data_test, nominal2value, attrs, has_label=False)

# Sanity check: show the first encoded row.
print(data_train_vec[0])

type_of_vehicle is added as a nominal, whose size is 12
transmission is added as a nominal, whose size is 3
fuel_type is added as a nominal, whose size is 5
eco_category is added as a nominal, whose size is 2
Attribute "price" is added as a function
[0.00e+00 0.00e+00 0.00e+00 1.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 0.00e+00 1.00e+00
 0.00e+00 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 7.13e+04]


In [239]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Store the feature rows in CSR form for the neighbour search.
data_train_vec_matrix = csr_matrix(data_train_vec)

# Prints the stored (row, column) value entries of the matrix.
print(data_train_vec_matrix)

# Brute-force nearest-neighbour search with Euclidean distance over all columns.
# NOTE(review): the last column is the raw price, while every other column is
# 0/1, so the distance is presumably dominated by price differences — confirm
# whether price should be scaled or left out of the feature matrix.
model_knn = NearestNeighbors(metric = 'euclidean', algorithm = 'brute')
model_knn.fit(data_train_vec_matrix)

  (0, 3)	1.0
  (0, 13)	1.0
  (0, 15)	1.0
  (0, 21)	1.0
  (0, 22)	71300.0
  (1, 11)	1.0
  (1, 14)	1.0
  (1, 16)	1.0
  (1, 21)	1.0
  (1, 22)	43800.0
  (2, 3)	1.0
  (2, 13)	1.0
  (2, 15)	1.0
  (2, 21)	1.0
  (2, 22)	95500.0
  (3, 3)	1.0
  (3, 13)	1.0
  (3, 15)	1.0
  (3, 21)	1.0
  (3, 22)	197900.0
  (4, 4)	1.0
  (4, 13)	1.0
  (4, 15)	1.0
  (4, 21)	1.0
  (4, 22)	103200.0
  :	:
  (16779, 9)	1.0
  (16779, 13)	1.0
  (16779, 15)	1.0
  (16779, 21)	1.0
  (16779, 22)	144400.0
  (16780, 2)	1.0
  (16780, 13)	1.0
  (16780, 19)	1.0
  (16780, 21)	1.0
  (16780, 22)	70200.0
  (16781, 7)	1.0
  (16781, 13)	1.0
  (16781, 15)	1.0
  (16781, 21)	1.0
  (16781, 22)	71300.0
  (16782, 2)	1.0
  (16782, 13)	1.0
  (16782, 15)	1.0
  (16782, 21)	1.0
  (16782, 22)	81200.0
  (16783, 7)	1.0
  (16783, 13)	1.0
  (16783, 15)	1.0
  (16783, 21)	1.0
  (16783, 22)	638000.0


NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [240]:
# Fixed seed so the sampled query, and therefore the recommendations printed
# for it, are the same on every Restart & Run All. A private RandomState is
# used so the global NumPy random state is left untouched.
QUERY_SEED = 42
N_RECOMMENDATIONS = 5

query_rng = np.random.RandomState(QUERY_SEED)
query_index = query_rng.choice(data_train_vec.shape[0])
# Ask for one extra neighbour: the query row is part of the fitted data, so it
# is expected among its own neighbours at distance 0.
distances, indices = model_knn.kneighbors(
    data_train_vec_matrix[query_index], n_neighbors=N_RECOMMENDATIONS + 1
)
print(attrs)

['type_of_vehicle', 'transmission', 'fuel_type', 'eco_category', 'price']


In [242]:
# The first neighbour is printed as the item being recommended for; each
# remaining neighbour is printed with its rank and its distance to the query.
neighbour_rows = indices.flatten()
neighbour_dists = distances.flatten()
for rank, (row_idx, dist) in enumerate(zip(neighbour_rows, neighbour_dists)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(data_train_pivot[row_idx]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, data_train_pivot[row_idx], dist))

Recommendations for {'type_of_vehicle': 'mpv', 'transmission': 'auto', 'fuel_type': 'petrol-electric', 'eco_category': 'uncategorized', 'price': '64400.0'}:

1: {'type_of_vehicle': 'mpv', 'transmission': 'auto', 'fuel_type': '', 'eco_category': 'uncategorized', 'price': '64400.0'}, with distance of 1.4142135381698608:
2: {'type_of_vehicle': 'mpv', 'transmission': 'auto', 'fuel_type': '', 'eco_category': 'uncategorized', 'price': '64400.0'}, with distance of 1.4142135381698608:
3: {'type_of_vehicle': 'suv', 'transmission': 'auto', 'fuel_type': 'petrol-electric', 'eco_category': 'uncategorized', 'price': '64400.0'}, with distance of 1.4142135381698608:
4: {'type_of_vehicle': 'suv', 'transmission': 'auto', 'fuel_type': 'petrol-electric', 'eco_category': 'uncategorized', 'price': '64400.0'}, with distance of 1.4142135381698608:
5: {'type_of_vehicle': 'suv', 'transmission': 'auto', 'fuel_type': '', 'eco_category': 'uncategorized', 'price': '64400.0'}, with distance of 2.0:
