### Import packages

In [1]:
import json
import re
from io import StringIO
from urllib.request import urlopen

import pandas as pd
import redis
from bs4 import BeautifulSoup

### Scrape car retail information from a chosen car dealer website

#### Titles

In [2]:
# Download the dealer's inventory page and collect the listing titles.
url = 'http://www.statecollege.com/auto/dealers/stocker-chevrolet,9803/'
# The context manager closes the HTTP response once the document is parsed.
with urlopen(url=url) as page:
    soup = BeautifulSoup(page, 'html.parser')
# Titles are taken from every <strong class="title"> element on the page.
titles = soup.find_all('strong', {"class": "title"})
titles_df = pd.DataFrame([title.get_text() for title in titles], columns=['Title'])

#### Embedded URLs

In [3]:
# Collect the links embedded in the left-aligned table cells of the inventory
# page and prefix each href with the site root.
base_url = 'http://www.statecollege.com/'
detail_links = []
for cell in soup.find_all('td', {'align': 'left'}):
    for anchor in cell.find_all('a', href=True):
        detail_links.append(base_url + anchor['href'])
urls_df = pd.DataFrame(detail_links, columns=['urls'])

#### Addresses, phones of sellers & descriptions of cars

In [4]:
# Visit every linked page once and extract the seller address, the seller
# phone number and the car description.  Exactly one value (or None when the
# element is missing) is recorded per URL, so the three frames always have the
# same number of rows and stay aligned when concatenated column-wise.
addresses = []
phones = []
descriptions = []
# The description is read from the sixth <p> element of the page.
DESCRIPTION_PARAGRAPH_INDEX = 5
for listing_url in urls_df['urls']:
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url=listing_url) as listing_page:
        listing_soup = BeautifulSoup(listing_page, 'html.parser')

    address_tag = listing_soup.find('p', {'class': 'business_address_address'})
    if address_tag is None:
        addresses.append(None)
    else:
        # Remove newlines, then collapse whitespace runs into single spaces.
        addresses.append(re.sub(r'\s+', ' ', address_tag.get_text().replace('\n', '')))

    # find() returns None when nothing matches, so it must not be iterated
    # directly; store the tag's text as a plain string instead of child nodes.
    phone_tag = listing_soup.find('p', {'class': 'business_address_phone'})
    phones.append(phone_tag.get_text(strip=True) if phone_tag is not None else None)

    paragraphs = listing_soup.find_all('p')
    if len(paragraphs) > DESCRIPTION_PARAGRAPH_INDEX:
        descriptions.append(paragraphs[DESCRIPTION_PARAGRAPH_INDEX].get_text())
    else:
        descriptions.append(None)
address_df = pd.DataFrame(addresses, columns=['address'])
phones_df = pd.DataFrame(phones, columns=['phone'])
descriptions_df = pd.DataFrame(descriptions, columns=['description'])

#### Other attributes

In [5]:
# Parse the attribute table(s) of every linked page into DataFrames.
# `tables` gets one entry per matching <table>; each entry is the list of
# DataFrames returned by pd.read_html for that table.
tables = []
for listing_url in urls_df['urls']:
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url=listing_url) as listing_page:
        listing_soup = BeautifulSoup(listing_page, 'html.parser')
    for table_tag in listing_soup.find_all('table', {'class': 'auto_detail_data_table'}):
        # read_html expects a path, URL or file-like object; passing the raw
        # markup string is deprecated in recent pandas, so wrap it in StringIO.
        tables.append(pd.read_html(StringIO(str(table_tag))))

In [6]:
# Turn the parsed attribute tables into one column per attribute and combine
# them with the titles and descriptions into the `cars` frame.
attributes = ['Year', 'Mileage', 'Make', 'Model', 'Trim', 'Style',
              'Engine', 'Exterior Color', 'Interior Color', 'VIN', 'Stock #']
attributes_dict = {attribute: [] for attribute in attributes}
for parsed_tables in tables:
    # Each entry of `tables` is a list of frames; the first one is used.
    detail_table = parsed_tables[0]
    labels = detail_table.iloc[:, 0]
    for attribute in attributes:
        # Labels in the first column carry a trailing colon (e.g. "Year:");
        # the matching value is read from the second column.
        matches = detail_table[labels == attribute + ':'].iloc[:, 1].values
        # A table without this attribute yields None rather than an IndexError,
        # which keeps every column the same length.
        attributes_dict[attribute].append(matches[0] if len(matches) else None)
cars = pd.concat([titles_df, pd.DataFrame(attributes_dict), descriptions_df], axis=1)

#### Combine all attributes

In [14]:
# Write the combined car table to a CSV file in the working directory.
cars_csv_path = 'cars.csv'
cars.to_csv(cars_csv_path)

In [8]:
# Put the seller address and phone columns side by side (aligned on the row
# index) and display the resulting frame.
seller_parts = [address_df, phones_df]
sellers = pd.concat(seller_parts, axis='columns')
sellers

Unnamed: 0,address,phone
0,"701 Benner Pike State College PA, 16801",(866) 235-0270
1,"701 Benner Pike State College PA, 16801",(866) 235-0270
2,"701 Benner Pike State College PA, 16801",(866) 235-0270
3,"701 Benner Pike State College PA, 16801",(866) 235-0270
4,"701 Benner Pike State College PA, 16801",(866) 235-0270
5,"701 Benner Pike State College PA, 16801",(866) 235-0270
6,"701 Benner Pike State College PA, 16801",(866) 235-0270
7,"701 Benner Pike State College PA, 16801",(866) 235-0270
8,"701 Benner Pike State College PA, 16801",(866) 235-0270
9,"701 Benner Pike State College PA, 16801",(866) 235-0270


#### Insert data into Redis

In [9]:
# Store every car as a Redis hash keyed by its row number.  The hash holds one
# field per `cars` column plus a 'seller' field containing the matching
# `sellers` row serialized as JSON.
r = redis.StrictRedis(host='localhost', port=6379, db=0)
car_fields = cars.columns
seller_fields = sellers.columns
car_rows = cars.values
seller_rows = sellers.values
for idx in range(len(cars)):
    v = dict(zip(car_fields, car_rows[idx]))
    v['seller'] = json.dumps(dict(zip(seller_fields, seller_rows[idx])))
    # NOTE(review): newer redis-py releases deprecate hmset in favour of
    # hset(name, mapping=...) — confirm the installed version before switching.
    r.hmset(idx, v)

#### Data model design

- Attributes 
    - Title
    - Year
    - Engine
    - Exterior Color
    - Interior Color
    - Make
    - Mileage
    - Model
    - Stock #
    - Style
    - Trim
    - VIN
    - Description
    - Seller
        - address
        - phone
        
The data model covers all basic attributes of a car that could be found on the retail website and that buyers should be aware of.

#### Display the result

![](7.png)

![](8.png)

![](9.png)

![](10.png)






#### Filter contents examples

In [10]:
# Print the title of every stored car whose title mentions 'Subaru'.
for i in range(10):
    # Fetch the title once and reuse it for both the check and the output.
    title = r.hget(name=i, key=b'Title')
    # hget returns None when the hash or the field does not exist; skip those.
    if title is not None and 'Subaru' in title.decode('utf-8'):
        print(title)

b'2014 Subaru Outback'
b'2013 Subaru XV Crosstrek'
b'2016 Subaru Forester'
b'2017 Subaru Legacy'
b'2017 Subaru Outback'


In [11]:
# Print title and model of every stored car whose model mentions 'Malibu'.
for i in range(10):
    # One hmget round trip supplies both the value to test and the output.
    title, model = r.hmget(name=i, keys=[b'Title', b'Model'])
    # Missing hashes/fields come back as None; skip them instead of failing.
    if model is not None and 'Malibu' in model.decode('utf-8'):
        print([title, model])

[b'2015 Chevrolet Malibu', b'Malibu']
[b'2016 Chevrolet Malibu Limited', b'Malibu Limited']


In [12]:
# Print title and VIN of every stored car whose VIN ends with '7'.
for i in range(10):
    # One hmget round trip supplies both the value to test and the output.
    title, vin = r.hmget(name=i, keys=[b'Title', b'VIN'])
    # Missing hashes/fields come back as None; skip them instead of failing.
    if vin is not None and vin.decode('utf-8').endswith('7'):
        print([title, vin])

[b'2013 Subaru XV Crosstrek', b'JF2GPACCXD1843307']
[b'2015 Chevrolet Malibu', b'1G11B5SL2FF313357']
[b'2016 Chevrolet Malibu Limited', b'1G11B5SAXGF135777']
[b'2015 Chevrolet Silverado 1500', b'3GCUKREC9FG459437']


In [13]:
# Show the seller details stored with the first three cars.
for i in range(3):
    # The 'seller' field holds a JSON document with 'address' and 'phone'.
    raw_seller = r.hget(name=i, key=b'seller')
    seller = json.loads(raw_seller.decode('utf-8'))
    message = 'The seller lives in {}, phone number is {}'.format(
        seller['address'], seller['phone'])
    print(message)

The seller lives in  701 Benner Pike State College PA, 16801 , phone number is (866) 235-0270
The seller lives in  701 Benner Pike State College PA, 16801 , phone number is (866) 235-0270
The seller lives in  701 Benner Pike State College PA, 16801 , phone number is (866) 235-0270
