In [1]:
import pandas as pd
import json, re, collections

In [2]:
# Each line of the dump is parsed as its own JSON document; the results are
# collected, in file order, into the module-level list `items`.
with open('raw/sfbay.json', 'r') as fh:
    items = [json.loads(record) for record in fh]

In [3]:
# Catalogue of known vehicles: maps each make to the set of its model names.
# The input file is expected to hold one "make,model" pair per line.
make_model = collections.defaultdict(set)
with open('raw/make_model.txt', 'r') as fh:
    for line in fh:
        line = line.strip()
        if not line:
            # A blank line (for instance a trailing newline at the end of the
            # file) would otherwise fail the two-name unpacking below.
            continue
        make, model = line.split(',')
        make_model[make].add(model)

# models_mapping: make -> canonical model name -> set of spellings accepted for
# that model. Every catalogue model starts out as its own only spelling.
models_mapping = {}
for make, models in make_model.items():
    models_mapping[make] = {model: set([model]) for model in models}

# Each non-blank synonym line reads "make,canonical,alias1,alias2,...".
# The canonical name (first model field) is also registered as a spelling.
with open('raw/model_synonym.txt', 'r') as fh:
    for raw_line in fh:
        entry = raw_line.strip()
        if not entry:
            continue
        make, spelling_csv = entry.split(',', 1)
        spellings = spelling_csv.split(',')
        canonical = spellings[0]
        known = models_mapping.setdefault(make, {}).setdefault(canonical, set())
        known.update(spellings)

# makes_mapping: any accepted spelling of a make -> its canonical make name.
# Catalogue makes map to themselves.
makes_mapping = dict((name, name) for name in make_model.keys())

# Each non-blank synonym line reads "canonical,alias1,alias2,..."; every field
# on the line (the canonical one included) is mapped to the first field.
with open('raw/make_synonym.txt', 'r') as fh:
    for raw_line in fh:
        entry = raw_line.strip()
        if not entry:
            continue
        aliases = entry.split(',')
        canonical = aliases[0]
        for alias in aliases:
            makes_mapping[alias] = canonical

# combine model and make: makes that only appear in the model table still need
# an identity entry, but must not overwrite a synonym mapping set above.
for name in models_mapping:
    makes_mapping.setdefault(name, name)

In [4]:
#import enchant

#make_pwl = 'raw/make_pwl'
#with open(make_pwl, 'w') as fh:
#    for make in sorted(make_model.keys()):
#        fh.write(make + '\n')

#model_pwl = 'raw/model_pwl'
#with open(model_pwl, 'w') as fh:
#    for model in sorted(model for models in make_model.values() for model in models):
#        fh.write(model + '\n')

#makes_dict = enchant.request_pwl_dict(make_pwl)
#models_dict = enchant.request_pwl_dict(model_pwl)

In [5]:
class UsedCar(object):
    """Record for one parsed listing.

    The set of fields is fixed by ``__slots__``. Any field not supplied to the
    constructor is stored as ``None``; keyword arguments that do not name a
    slot are ignored.
    """

    __slots__ = [
        'year', 'make', 'model', 'odometer',
        'dealer', 'posted_at', 'latitude', 'longitude',
        'title_status', 'cylinders', 'drive', 'fuel',
        'transmission', 'category', 'color', 'condition',
        'size', 'post_url', 'price'
    ]

    # Slot names listed here are left out of get_attrs() and therefore of str().
    excluded_attr = []

    @classmethod
    def get_attrs(cls):
        """Return the slot names, in declaration order, minus ``excluded_attr``."""
        selected = []
        for name in cls.__slots__:
            if name in cls.excluded_attr:
                continue
            selected.append(name)
        return selected

    def __init__(self, **kwargs):
        """Populate every slot from ``kwargs``, using ``None`` when missing."""
        for name in self.__slots__:
            value = kwargs.get(name)
            setattr(self, name, value)

    def __str__(self):
        """Return the exported fields as one comma-joined line (no escaping)."""
        rendered = [str(getattr(self, name)) for name in self.get_attrs()]
        return ','.join(rendered)

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, regexp_tokenize
from nltk import ngrams
import itertools

#from geopy.geocoders import Nominatim

# NLTK's English stop words plus a handful of punctuation fragments.
# NOTE(review): nothing in the visible cells reads `stop_words` - confirm it is still needed.
stop_words = set(stopwords.words('english')).union([
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    '&', '/', '...', '--', '+', '*', '|', "),", "**",
])

# utility method
def log(reason, i):
    """Print a one-line notice that item number ``i`` was dropped for ``reason``."""
    message = "drop %s: %d" % (reason, i)
    print(message)
    
def debug(i, attr_only=True):
    """Print the text that the parser sees for ``items[i]``.

    The space-joined ``attr_text`` entries are always printed. When
    ``attr_only`` is false, the full search text (attributes, title and the
    first ``BODY_LEN`` characters of the body, lower-cased and UTF-8 encoded)
    is printed first.
    """
    record = items[i]
    if not attr_only:
        pieces = record['attr_text'] + [record['title'], record['body'][:BODY_LEN]]
        print("text: %s" % ' '.join(pieces).lower().encode('utf-8'))
    print("attr: %s" % ' '.join(record['attr_text']))

used_cars = []

#geolocator = Nominatim()

BODY_LEN = 50     # leading characters of the listing body added to the searched text
MIN_YEAR = 1990   # listings whose first number is below this are dropped
dropped_year = 0
dropped_price = 0
dropped_title_status = 0
dropped_make = 0
dropped_model = 0


def _find_attr(label, text, value_pattern=r'[\w:]+'):
    """Return the value that follows the last usable ``label: `` in ``text``.

    The value must be followed by a space or by the end of the string, so the
    final attribute of a listing is found too (requiring a trailing space
    silently lost it). A captured value containing ':' means the real value
    was missing and the next label was captured instead; '' is returned then,
    as it is when the label is absent.
    """
    m = re.match(r'.*%s: (%s)(?: |$)' % (label, value_pattern), text)
    if not m or ':' in m.group(1):
        return ''
    return m.group(1)


def _find_number(label, text):
    """Return the digits following the last ``label: `` in ``text``, or ''."""
    m = re.match(r'.*%s: (\d+)' % label, text)
    return m.group(1) if m else ''


def _candidate_phrases(tokens):
    """Yield 3-token tuples, then 2-token tuples, then the single tokens."""
    return itertools.chain(ngrams(tokens, 3), ngrams(tokens, 2), tokens)


def _find_make(tokens):
    """Return the canonical make for the first phrase found in ``makes_mapping``, or ''."""
    for phrase in _candidate_phrases(tokens):
        if isinstance(phrase, tuple):
            phrase = ' '.join(phrase)
        if phrase in makes_mapping:
            return makes_mapping[phrase]
    return ''


def _find_model(make, tokens):
    """Return the canonical model of ``make`` for the first matching phrase, or ''.

    Multi-token phrases are tried joined by a space, by nothing and by a
    hyphen. The search stops at the first hit, so longer phrases take
    precedence over single tokens and earlier text over later text. The old
    nested loop only broke out of its innermost level, which let the last
    match win.
    """
    spellings_by_model = models_mapping.get(make)
    if not spellings_by_model:
        return ''
    for phrase in _candidate_phrases(tokens):
        if isinstance(phrase, tuple):
            forms = [' '.join(phrase), ''.join(phrase), '-'.join(phrase)]
        else:
            forms = [phrase]
        for model_name, spellings in spellings_by_model.items():
            if any(form in spellings for form in forms):
                return model_name
    return ''


for i, item in enumerate(items):
    text = ' '.join(item['attr_text'] + [item['title'], item['body'][:BODY_LEN]]).lower().encode('utf-8')
    attr_text = ' '.join(item['attr_text'])

    # tokens
    digit_tokens = regexp_tokenize(text, pattern=r'\d+')
    tokens = regexp_tokenize(text, pattern=r'[a-z0-9-]+')

    used_car = UsedCar()

    # get year: the first number in the text is taken as the model year.
    # NOTE(review): there is no upper bound, so any leading number >= MIN_YEAR
    # is accepted as a year - confirm whether that is intended.
    if not digit_tokens or int(digit_tokens[0]) < MIN_YEAR:
        dropped_year += 1
        log('year', i)
        continue # invalid year
    used_car.year = digit_tokens[0]

    # get price
    if not item['price']:
        dropped_price += 1
        log('price', i)
        continue # invalid price
    used_car.price = item['price']

    # get title status (searched in the full lower-cased text)
    title_status = _find_attr('title status', text)
    if not title_status:
        dropped_title_status += 1
        log('title status', i)
        continue
    used_car.title_status = title_status

    # get make
    used_car.make = _find_make(tokens)
    if not used_car.make:
        dropped_make += 1
        log('make', i)
        continue

    # get model (also dropped when the make has no model table at all)
    used_car.model = _find_model(used_car.make, tokens)
    if not used_car.model:
        dropped_model += 1
        log('model', i)
        continue

    # numeric attributes
    used_car.odometer = _find_number('odometer', attr_text)
    used_car.cylinders = _find_number('cylinders', attr_text)

    # single-word attributes ('' when missing)
    used_car.drive = _find_attr('drive', attr_text)
    used_car.fuel = _find_attr('fuel', attr_text)
    used_car.color = _find_attr('paint color', attr_text)
    used_car.transmission = _find_attr('transmission', attr_text)

    # type and size values may contain hyphens
    used_car.category = _find_attr('type', attr_text, r'[-\w:]+')
    used_car.size = _find_attr('size', attr_text, r'[-\w:]+')

    # condition has one two-word value
    used_car.condition = _find_attr('condition', attr_text, r'like new|[\w:]+')

    # get location information
    #if item['latitude'] and item['longitude'] and not item['address']:
    #    location = geolocator.reverse(','.join([item['latitude'], item['longitude']]))
    #    item['address'] = location.address
    #    print location.address
    #elif item['address'] and not item['latitude'] and not item['longitude']:
    #    location = geolocator.geocode(item['address'])
    #    item['latitude'] = location.latitude
    #    item['longitude'] = location.longitude

    # get others
    used_car.dealer = item['dealer']
    used_car.latitude = item['latitude']
    used_car.longitude = item['longitude']
    #used_car.address = item['address']
    used_car.posted_at = item['posted_at']
    used_car.post_url = item['url']

    used_cars.append(used_car)

print("cars: %d, drop price: %d, drop title status: %d, drop year: %d, drop make: %d, drop model: %d" % (
    len(used_cars), dropped_price, dropped_title_status, dropped_year, dropped_make, dropped_model))

drop model: 56
drop model: 65
drop model: 76
drop model: 100
drop model: 127
drop model: 137
drop model: 188
drop model: 202
drop model: 227
drop model: 287
drop model: 301
drop model: 387
drop model: 441
drop model: 459
drop model: 475
drop model: 502
drop model: 515
drop model: 544
drop model: 592
drop make: 650
drop model: 654
drop model: 660
drop model: 753
drop model: 773
drop model: 777
drop model: 822
drop model: 838
drop model: 942
drop model: 951
drop model: 982
drop model: 1003
drop model: 1024
drop model: 1091
drop model: 1103
drop model: 1107
drop model: 1109
drop model: 1146
drop model: 1163
drop model: 1258
drop model: 1276
drop model: 1278
drop model: 1311
drop model: 1396
drop model: 1407
drop model: 1425
drop model: 1476
drop model: 1477
drop model: 1611
drop model: 1659
drop model: 1668
drop model: 1740
drop model: 1758
drop model: 1759
drop model: 1796
drop model: 1832
drop model: 1862
drop model: 1885
drop model: 1900
drop model: 1976
drop model: 2040
drop model: 22

In [7]:
# Export the parsed cars: a header row of field names, then one line per car.
# NOTE(review): UsedCar.__str__ joins values with ',' without quoting, so a
# value containing a comma would shift columns - confirm inputs never do.
with open('output/used_cars.csv', 'w') as fh:
    header = ','.join(UsedCar.get_attrs())
    fh.write(header + '\n')
    fh.writelines(str(car) + '\n' for car in used_cars)

In [8]:
######### debug ############

In [9]:
# Spot check: display the raw record stored at index 3072 of `items`.
items[3072]

{u'_id': {u'$oid': u'59e75cfaf2f22b68a3dd648f'},
 u'address': u'',
 u'attr_text': [u'1997 Cushman White Truck',
  u'condition:',
  u'good',
  u'cylinders:',
  u'4 cylinders',
  u'drive:',
  u'rwd',
  u'fuel:',
  u'gas',
  u'odometer:',
  u'27220',
  u'paint color:',
  u'white',
  u'size:',
  u'compact',
  u'title status:',
  u'clean',
  u'transmission:',
  u'manual',
  u'type:',
  u'van'],
 u'body': u'Cushman white van with low mileage (27,000).  Transmission has been opened up to 5 speed (manual) and the governor removed.  It will drive on the freeway, top speed is 65.  Currently registered in California as a 1974 SPCN, no smog required.  It\'s an awesome van for the city and can park anywhere.  Storage is huge (4 1/2ft by 7ft)and has been used as a small camper at times in the sierras.  I\'ve slept in it comfortably several times and I\'m 6 feet tall.  I originally got it for the short commute to work and to haul gear around the city but I no longer need it.  Comes with cd player/blu

In [59]:
# Spot check: listing URL of parsed car 8506. UsedCar stores it in the
# `post_url` slot; the class has no `url` attribute.
used_cars[8506].post_url

IndexError: list index out of range

In [75]:
# Spot check: comma-joined string rendering of parsed car 8506.
str(used_cars[8506])

'2005,toyota,tacoma,,102890,True,2017-10-14T18:31:22-0700,38.010263,-122.101866,clean,,4wd,gas,manual,16500'

In [76]:
# Spot check: print the searchable text of every raw record whose URL matches
# parsed car 8506. The URL lives in the `post_url` slot of UsedCar (there is no
# `url` slot), and it is looked up once instead of on every iteration.
target_url = used_cars[8506].post_url
for item in items:
    if item['url'] == target_url:
        text = ' '.join(item['attr_text'] + [item['title'], item['body'][:BODY_LEN]]).lower().encode('utf-8')
        print(text)

2005 toyota tacoma v6 4dr access cab vin: 5teuu42n45z014447 drive: 4wd fuel: gas odometer: 102890 paint color: blue title status: clean transmission: manual type: pickup more ads  by this user 2005 toyota tacoma v6 4dr access cab v6 transmission: 5 speed manual interior color: graph


In [77]:
# Spot check: transmission value stored on parsed car 8506.
used_cars[8506].transmission

u'manual'