In [1]:
import pandas as pd, numpy as np, os, re
from bs4 import BeautifulSoup
from glob import iglob
import codecs
pd.set_option('display.max_columns', 40)

In [2]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, LeaveOneOut

from sklearn.model_selection import cross_val_score


In [3]:
from matplotlib import pyplot as plt

#### Rename html files

In [8]:
path_al = '/home/jeferson/personal_projects/scraping_cars/results/ny'
files = os.listdir(path_al)


for file in files:
    os.rename(os.path.join(path, file), os.path.join(path, file.split('-')[1] + '.html'))

In [10]:
# Alabama : 724716247 , 706708719

def soup_func(filename):
    html = codecs.open(path_al + '/' + filename, 'r').read()
    return BeautifulSoup(html, 'lxml')



In [11]:
def get_price(soup):
    vehicle_price = soup.find('span', class_=
              'vehicle-info__price-display')
    if vehicle_price:
        return vehicle_price.text

In [12]:
def get_msrp_price(soup):
    vehicle_msrp_tag = soup.find('span',
        attrs={'class':'vehicle-info__price-label vehicle-info__price-label--msrp'})
    if vehicle_msrp_tag:
        sibling_msrp = vehicle_msrp_tag.nextSibling
        if sibling_msrp:
            return sibling_msrp.text


In [13]:
def get_price_drop(soup):
    price_drop = soup.find('span',
            attrs={'class':'vdp-cap-price__prev vehicle-info__price-drop'})

    if price_drop:
        tag_drop = price_drop.find('s', class_='strike-through')
        if tag_drop:
            return tag_drop.text

In [14]:
def price_comparison_tool(soup):
    badge_tag = soup.find('a', attrs={'cars-smooth-scroll':'price-comparison-tool'} )

    if badge_tag:
        return badge_tag.find('span').text

In [15]:
def dealer_rating(soup):
    dealer_ratings = soup.find('p', class_='rating__link rating__link-details' )
    if dealer_ratings:
        list_rate = re.findall(r"[+-]? *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?", dealer_ratings.text)
        if len(list_rate) > 1:
            return [list_rate[0], list_rate[1]]
        else:
            return [-1, -1]
    else:
        return [-1, -1]

In [16]:
def overall_rating(soup):
    overall_rating = soup.find('cars-star-ratings' )
    if overall_rating:
        return overall_rating.get('bound-rating')

In [17]:
def consumers_review(soup):
    consumer_reviews = soup.find('div', class_='review-stars-average' )
    if consumer_reviews:
        a_lnk = consumer_reviews.find('a', attrs={'data-linkname':'consumer-reviews-total-reviews'})
        if a_lnk:
            return a_lnk.text



In [18]:
def mileage(soup):
    mileage = soup.find('div', class_='vdp-cap-price__mileage--mobile vehicle-info__mileage')
    if mileage:
        return mileage.text

In [19]:
def count_photo(soup):
    count_photos = soup.find('div', class_='photo-count__count')
    if count_photos:
        return count_photos.text    
    

In [20]:
def get_all_details(soup):
    details = dict()
    for x in soup.findAll('li', class_= 'vdp-details-basics__item' ):
        if x:
            s = x.text.strip()
            key = s[ : s.find(':')].strip()
            value = s[s.find(':')+1 : ].strip()
            details[key] = value
    return details


In [21]:
# s = '(4.8) 1086 Reviews'
# list_rate = re.findall(r"[+-]? *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?", s)
# print(list_rate[0], list_rate[1])

In [22]:
# soup = soup_func('722348066.html')
# print(f)
# print(get_price(soup))
# print(get_msrp_price(soup))
# print(get_price_drop(soup))
# print(price_comparison_tool(soup))

# list_rate_seller = dealer_rating(soup)
# seller_rating, seller_count_rating = list_rate_seller[0], list_rate_seller[1]
# print('seller_rate', seller_rating , '\n', 'seller_count_review', seller_count_rating)
# ## print('\n', list_rate_seller, '\n')

# print(overall_rating(soup))
# print(consumers_review(soup))
# print(mileage(soup))
# print(count_photo(soup))
# print('\n\n')

In [23]:
tab = []
for f in files[:]:
    soup = soup_func(f)
    
    list_rate_seller = dealer_rating(soup)   
    seller_rating, seller_count_rating = list_rate_seller[0], list_rate_seller[1]

    tab.append([f, get_price(soup), get_msrp_price(soup), get_price_drop(soup),
              price_comparison_tool(soup), seller_rating, seller_count_rating, overall_rating(soup),
              consumers_review(soup), count_photo(soup), get_all_details(soup)])


In [24]:
df_al = pd.DataFrame(tab, columns=['id','price', 'msrp_price', 'original_price', 
                                  'good_offer', 'seller_rating', 'seller_count_rating', 'car_rate', 'consumers_review',
                                   'n_photos', 'details'])

In [25]:
# df_al_bkp = df_al.copy()
# df_al = df_al_bkp.copy()

In [26]:
df_al = df_al.join(pd.DataFrame(df_al.pop('details').tolist()))

In [27]:
df_al

Unnamed: 0,id,price,msrp_price,original_price,good_offer,seller_rating,seller_count_rating,car_rate,consumers_review,n_photos,City MPG,Drivetrain,Engine,Exterior Color,Fuel Type,Highway MPG,Interior Color,Mileage,Stock,Transmission,VIN
0,244-761069807,"$24,645","$24,645",,,2.7,77,4.9,55 reviews,3,31,FWD,1.5L I4 16V GDI DOHC Turbo,Crystal Black Pearl,Gasoline,40,,10,00390611,Automatic CVT,SHHFK7H63KU406881
1,402-763201995,"$35,302",,,Good Deal,4.7,1044,4.3,22 reviews,32,24,AWD,2.0L I4 16V GDI DOHC Turbo,Majestic White,Gasoline,33,Graphite,5098,3376U,7-Speed Automatic with Auto-Shift,SJKCH5CR0JA004135
2,303-760249278,"$63,165",,,,4.9,740,5,1 reviews,6,19,AWD,6 Cylinder,,Gasoline,27,,0,19363,9-Speed Shiftable Automatic,55SWF6EB6KU285098
3,464-764871586,"$57,855","$57,855",,,4.8,46,4.7,3 reviews,,,AWD,2.0L I4 16V GDI DOHC Turbo Hybrid,Gray,Hybrid,,Black,Not provided,KF600273,7-Speed Automatic,WDC0G5EB2KF600273
4,309-759772495,"$28,997",,,Good Deal,4.8,660,4.8,23 reviews,32,17,AWD,Intercooled Turbo Premium Unleaded H-4 2.5 L/150,Galaxy Blue Silica,Gasoline,23,Carbon Black,31698,CR7185A,6-Speed Manual w/OD,JF1VA2S66F9816513
5,80-763690160,"$43,336",,,,4.5,80,4.5,14 reviews,32,19,AWD,3.5L V6 24V PDI DOHC,Midnight Black Metallic,Gasoline,27,Ash,0,KS222720,8-Speed Automatic,5TDDZ3DC6KS222720
6,10-765898406,"$43,340",,,,4.9,740,5,6 reviews,4,23,AWD,4 Cylinder,Denim Blu Met,Gasoline,31,Sahara Beige,0,195587,Dual Shift Gearbox,WDCTG4GB2KJ610920
7,258-753004813,"$38,174","$41,635",,,3,10,4.5,27 reviews,32,15,RWD,5.0L V8 32V PDI DOHC,Magnetic Metallic,Gasoline,24,Ebony Cloth,5,9008,6-Speed Manual,1FA6P8CF3K5124098
8,120-758761274,"$58,460","$58,960",,,4.9,869,4.8,6 reviews,11,,quattro,Intercooled Turbo Premium Unleaded I-4 2.0 L/121,Samurai Gray Metallic,Gasoline,,Nougat Brown,10,PA219152,8-Speed Automatic w/OD,WA1AHAF76KD017799
9,422-761838668,"$69,425",,,,4.6,48,,,3,21,AWD,2.0L I4 16V GDI DOHC,Onyx Black,Gasoline,31,Charcoal,2,19V888,8-Speed Automatic,LVYA22ML8KP099622


In [28]:
df_al.to_csv('df_ny.csv', sep=';', index=False)