In [138]:
import pandas as pd
import json
from tqdm import tqdm
from pathlib import Path
import numpy as np
from datetime import datetime

In [139]:
# Read every file in cleaned_data/ whose name starts with "page" and collect
# the parsed JSON in a dict keyed by the id embedded in the file name.
path = Path('cleaned_data/')
aggregated_data = {}
for file in tqdm(list(path.glob('page*'))):
    file_name = file.name
    # Keep the part before the first '.', then drop its first four
    # characters (the "page" prefix matched by the glob above).
    file_id = file_name.split('.')[0][4:]
    with file.open('r') as f:
        data = json.load(f)
    aggregated_data[file_id] = data


100%|███████████████████████████████████████████████████████████████████████████| 10914/10914 [00:51<00:00, 210.89it/s]


In [140]:
# from_dict turns each top-level key of aggregated_data (a file id) into a
# column; transposing afterwards gives one row per file id.
df = pd.DataFrame.from_dict(data=aggregated_data).transpose()

In [155]:
# Build the cleaned frame `new_df` from the raw frame `df`, one converter per
# output column.
#
# NOTE: the converter functions referenced here (spilt_name, get_make, ...)
# are defined in a cell that appears further down in this notebook; that cell
# has to be executed before this one.
#
# Each entry is (source column in `df`, target column in `new_df`, converter).
# A list of tuples is used rather than a dict keyed by source column because
# 'vehicle' and 'nct' each feed two target columns: as repeated dict keys the
# later entry silently replaced the earlier one, so 'nct expiration month' was
# never produced and 'make' had to be added by hand after the loop.
mapping = [
    ('dealer', 'dealer', spilt_name),
    ('location', 'location', get_attr),
    ('vehicle', 'make', get_make),
    ('vehicle', 'model', get_model),
    ('year', 'year', get_year),
    ('mileage', 'mileage', get_mileage),
    ('last_updated', 'last_updated', get_last_updated),
    ('price', 'price', get_price),
    ('type', 'transmission', get_attr),
    ('engine', 'engine size', get_engine),
    ('body', 'body', get_attr),
    ('fuel', 'fuel', get_attr),
    ('doors', 'doors', get_doors),
    ('mpg', 'mpg', get_mpg),
    ('owners', 'owners', get_owners),
    ('color', 'color', get_attr),
    ('tax', 'tax', get_tax),
    ('nct', 'nct expiration month', get_nct_month),
    ('nct', 'nct expiration year', get_nct_year),
]

new_df = pd.DataFrame()
for column, name, func in mapping:
    # The converter is passed directly; wrapping it in a lambda adds nothing.
    new_df[name] = df[column].apply(func)

In [159]:
# Keep only listings with a numeric price: rows whose price is the literal
# 'P.O.A' are dropped (rows with a missing price compare unequal and stay).
has_listed_price = new_df['price'].ne('P.O.A')
prices = new_df.loc[has_listed_price].copy()
prices['price'] = prices['price'].astype(float)
prices.to_csv('price_data.csv')

In [132]:
# Known car makes, all lowercase because get_make lowercases the vehicle
# string before searching it. get_make returns the first entry that occurs in
# that string, so the order of this list is significant.
# Several makes are spelled without a space ('landrover', 'alfaromeo') --
# presumably to match how they appear in the scraped data; verify before
# "correcting" them.
# Fixes: 'chevrole' -> 'chevrolet' (the truncated spelling produced the make
# "Chevrole" and left a stray "t" at the start of the model); the second,
# redundant 'suzuki' entry was removed.
makes = [
    'toyota', 'mitsubishi', 'nissan', 'ford', 'hyundai', 'mercedes-benz', 'renault', 'vauxhall', 'smart',
    'skoda', 'volkswagen', 'opel', 'bmw', 'lexus', 'fiat', 'audi', 'seat', 'dacia', 'kia', 'citroen', 'daihatsu',
    'mazda', 'landrover', 'volvo', 'peugeot', 'suzuki', 'honda', 'jaguar', 'mini', 'saab', 'porsche',
    'alfaromeo', 'jeep', 'subaru', 'chevrolet', 'ssangyong', 'cupra', 'isuzu', 'bentley', 'maserati', 'abarth', 'dsds',
]

def get_make(vehicle):
    """Return the make found in a vehicle description.

    Parameters
    ----------
    vehicle : sequence
        Only the first element, a string, is inspected.

    Returns
    -------
    str or bool
        The title-cased form of the first entry of the module-level ``makes``
        list that occurs anywhere in the lowercased description, or ``False``
        when no entry occurs in it.
    """
    description = vehicle[0].lower()
    # Generator keeps the list order, so the first matching entry wins.
    candidates = (known.title() for known in makes if known in description)
    return next(candidates, False)

def get_model(vehicle):
    """Return the vehicle description with the make removed.

    Parameters
    ----------
    vehicle : sequence
        Only the first element, a string, is inspected.

    Returns
    -------
    str or bool
        The lowercased description with every occurrence of the make removed,
        then title-cased; ``False`` when ``get_make`` does not recognise a
        make.
    """
    make = get_make(vehicle)
    # Check for the "no make" sentinel *before* calling a string method on it:
    # calling .lower() on False raised AttributeError and made the False
    # branch unreachable.
    if make is False:
        return False
    description = vehicle[0].lower()
    return description.replace(make.lower(), '').title()

def spilt_name(dealer):
    """Insert spaces into a run-together dealer name.

    A space is inserted before every uppercase character that is immediately
    followed by a lowercase character, e.g. ``'JohnSmithMotors'`` becomes
    ``'John Smith Motors'`` and ``'BMWDublin'`` becomes ``'BMW Dublin'``.

    Parameters
    ----------
    dealer : sequence
        Only the first element, a string, is inspected.

    Returns
    -------
    str or float
        The name with spaces inserted and surrounding whitespace stripped, or
        ``np.nan`` when the name is empty (the same missing-value marker the
        other converters in this notebook return for ``['']``).
    """
    name = dealer[0]
    # An empty name has no last character; without this guard name[-1] below
    # raises IndexError.
    if not name:
        return np.nan
    pieces = []
    # Pair each character with its successor; the last character has no
    # successor and is appended unchanged after the loop.
    for char, following in zip(name, name[1:]):
        if char.isupper() and following.islower():
            pieces.append(' ')
        pieces.append(char)
    pieces.append(name[-1])
    return ''.join(pieces).strip()

def get_owners(owners):
    """Parse the number of owners from the start of the first element.

    Parameters
    ----------
    owners : list
        ``['']`` marks a missing value; otherwise the first element is a
        string that starts with the owner count.

    Returns
    -------
    int or float
        The leading integer of the first element, or ``np.nan`` for ``['']``.

    Raises
    ------
    ValueError
        If the first element does not start with a digit.
    """
    if owners == ['']:
        return np.nan
    text = owners[0]
    # Take the whole leading run of digits, not just the first character, so
    # that counts of 10 or more are not truncated to their first digit.
    digit_count = len(text) - len(text.lstrip('0123456789'))
    return int(text[:digit_count])

def get_mpg(mpg):
    """Parse the mpg figure from the first element.

    The last three characters of the string are discarded (presumably a unit
    suffix such as "mpg" -- confirm against the data) and the remainder is
    converted with ``int``. ``['']`` yields ``np.nan``.
    """
    is_missing = mpg == ['']
    return np.nan if is_missing else int(mpg[0][:-3])

def get_doors(doors):
    """Parse the door count from the first character of the first element.

    Returns ``np.nan`` for ``['']``; otherwise ``int`` of the first character.
    """
    if doors != ['']:
        first_char = doors[0][:1]
        return int(first_char)
    return np.nan

def get_year(year):
    """Convert the first element to ``int``; ``['']`` yields ``np.nan``."""
    return np.nan if year == [''] else int(year[0])

def get_last_updated(last_updated):
    """Re-insert spaces into a run-together "last updated" string.

    The first eight characters of the first element are skipped (presumably a
    fixed label in front of the value -- confirm against the data). A space is
    then put in front of every occurrence of 'day', 'hour', 'minute', 'week'
    and 'ago', so that e.g. ``'3daysago'`` becomes ``'3 days ago'``.

    Returns ``np.nan`` for ``['']``; otherwise the re-spaced string.
    """
    if last_updated == ['']:
        return np.nan

    text = last_updated[0][8:]
    # Same tokens, same order as separate replace() calls would use.
    for token in ('day', 'hour', 'minute', 'week', 'ago'):
        text = text.replace(token, ' ' + token)
    return text

def get_price(price):
    """Parse a price from the first element.

    Returns
    -------
    float, str or int
        ``np.nan`` for ``['']``; the string ``'P.O.A'`` unchanged when that is
        the value; otherwise the integer obtained after removing commas and
        dropping the first character (presumably a currency symbol -- confirm
        against the data).
    """
    if price == ['']:
        return np.nan
    raw = price[0]
    if raw == 'P.O.A':
        return raw
    without_commas = raw.replace(',', '')
    return int(without_commas[1:])

def get_engine(engine):
    """Return the first element unchanged, or ``np.nan`` for ``['']``."""
    return np.nan if engine == [''] else engine[0]

def get_mileage(mileage):
    """Parse the mileage from the first element.

    Commas are removed, the last two characters are discarded (presumably a
    unit suffix -- confirm against the data) and the rest is converted with
    ``int``. ``['']`` yields ``np.nan``.
    """
    if mileage == ['']:
        return np.nan
    digits_and_unit = mileage[0].replace(',', '')
    return int(digits_and_unit[:-2])

def get_tax(tax):
    """Parse the tax amount from the first element.

    Commas are removed, the first character is dropped (presumably a currency
    symbol -- confirm against the data) and the rest is converted with
    ``int``. ``['']`` yields ``np.nan``.
    """
    if tax == ['']:
        return np.nan
    amount = tax[0].replace(',', '')[1:]
    return int(amount)

def get_nct_month(nct):
    """Parse the month name embedded in the first element.

    The first three and the last five characters of the string are discarded
    and what remains is parsed as a full month name (``'%B'``).

    Returns
    -------
    float or datetime
        ``np.nan`` for ``['']``; otherwise a ``datetime`` whose month is the
        parsed one. Because only the month is parsed, the remaining fields
        take ``strptime``'s defaults (year 1900, day 1, midnight).
    """
    if nct == ['']:
        return np.nan
    month_name = nct[0][3:-5]
    return datetime.strptime(month_name, '%B')

def get_nct_year(nct):
    """Convert the last four characters of the first element to ``int``.

    ``['']`` yields ``np.nan``.
    """
    return np.nan if nct == [''] else int(nct[0][-4:])

def get_attr(attr):
    """Return the first element unchanged, or ``np.nan`` for ``['']``."""
    if attr != ['']:
        return attr[0]
    return np.nan
    