### Cleaning up dataframe make_model to url format (to be scraped)

In [202]:
import pickle
import pandas as pd

In [203]:
# Load the pickled IIHS 2017 death-statistics object.
# NOTE: pickle.load can execute arbitrary code, so only load pickles from a trusted source.
# The with-block closes the file handle as soon as the object has been read.
with open('../iihs_death/iihsdeath2017', 'rb') as file:
    iihs_rating = pickle.load(file)

In [204]:
# Keep only the two columns used to build the URLs. .copy() makes the result an
# independent frame, so adding columns to it in later cells does not raise
# SettingWithCopyWarning.
iihs_rating = iihs_rating[['make_model','type']].copy()

In [205]:
# Display the frame to inspect the make_model / type values
iihs_rating

Unnamed: 0,make_model,type
0,Mitsubishi Mirage hatchback,4-Door Car
1,Kia Rio,4-Door Car
2,Hyundai Accent,4-Door Car
3,Ford Fiesta,4-Door Car
4,Volkswagen Golf,4-Door Car
...,...,...
217,Ford F250 Crew Cab 4WD,Pickup
218,Ram 2500 Crew Cab short bed 4WD,Pickup
219,Ram 3500 Crew Cab long bed 4WD,Pickup
220,Ram 2500 Mega Cab 4WD,Pickup


In [206]:
# Build the predicted IIHS crash-test-results URL for a vehicle from the
# make/model string and vehicle type listed in the IIHS death statistics
def model_to_url(make_model, car_type):
    '''
    Turn string of the make_model from iihs2017 to url that gets iihs crash testing data

    The result is a prediction built from naming rules only; it is not checked
    against the IIHS site.

    Input: make_model, str (e.g. 'Kia Rio'); car_type, str (e.g. '4-Door Car')
    Output: str (e.g. 'https://www.iihs.org/ratings/vehicle/kia/rio-4-door-sedan/2017')
    Raises: ValueError if no model name is left after the make (and any body
            suffix) is removed, or if the body style can be taken neither from
            the model name nor from car_type
    '''
    # rid of 2WD, 4WD in name (matched case-sensitively, so before lowercasing),
    # and make all lowercase
    make_model = make_model.replace(' 4WD', '').replace(' 2WD', '').lower()

    # clean up some terminology for pickups (applied in this order)
    if car_type == 'Pickup':
        pickup_terms = (
            (' short bed', ''),
            (' long bed', ''),
            ('ext.', 'extended'),
            ('double cab', 'crew cab'),     # for Toyotas
            ('crew max', 'crew cab'),       # for Toyotas
            ('king cab', 'extended cab'),   # for Nissans
            ('supercab', 'extended cab'),   # for Fords
            ('quad cab', 'extended cab'),   # for Rams
        )
        for old, new in pickup_terms:
            make_model = make_model.replace(old, new)

    # turn make_model into a list of words
    words = make_model.split()

    # figure out make and model: 'land rover' is the only two-word make handled,
    # every other make is taken to be the first word
    make_len = 2 if words[:2] == ['land', 'rover'] else 1
    make = '-'.join(words[:make_len])
    model = words[make_len:]

    # figure out door number (minivans and pickups have no door part in the url)
    doors_by_type = {
        '2-Door Car': '2-door',
        'Sports Car': '2-door',
        'Minivan': None,
        'Pickup': None,
    }
    doors = doors_by_type.get(car_type, '4-door')

    # figure out body: a body word at the end of the model name wins over car_type
    body = None
    if model and model[-1] in ('coupe', 'hatchback', 'sedan'):
        body = model[-1]
        model = model[:-1]

    if not model:
        # previously an IndexError (or a url with an empty model part)
        raise ValueError('no model name found in {!r}'.format(make_model))

    if body is None:
        body_by_type = {
            '4-Door Car': 'sedan',
            'Luxury Car': 'sedan',
            '2-Door Car': 'hatchback',
            # Sports cars used to fall through with no body at all
            # (UnboundLocalError); 'coupe' is a best guess like the other defaults.
            'Sports Car': 'coupe',
            'Station Wagon': 'wagon',
            'Minivan': 'minivan',
            'SUV': 'suv',
            'Luxury SUV': 'suv',
            'Pickup': 'pickup',
        }
        if car_type not in body_by_type:
            raise ValueError(
                'cannot determine body style for {!r} with car type {!r}'.format(make_model, car_type)
            )
        body = body_by_type[car_type]

    # concat the url slug: model words, then doors (if any), then body
    slug_parts = model + ([doors] if doors else []) + [body]
    return 'https://www.iihs.org/ratings/vehicle/{}/{}/2017'.format(make, '-'.join(slug_parts))

In [207]:
# Predict a URL for every row in one column assignment. The two input columns are
# paired positionally, so this does not rely on the row labels being exactly
# 0..len-1 the way a range(len(...)) loop with .loc[i, ...] does.
iihs_rating['predicted_url'] = [
    model_to_url(make_model, car_type)
    for make_model, car_type in zip(iihs_rating['make_model'], iihs_rating['type'])
]

In [208]:
# for i in range(210,220):
#     print(i)
#     print(iihs_rating['predicted_url'].iloc[i])

In [43]:
# Checked all the urls and fixed the ones that weren't working.
# The majority were misclassifications of car type (hatchback, coupe, sedan, 4-door).
# For trucks, IIHS does not do single-cab testing; those are substituted with the
# closest variant (extended cab).
# 'N/A' marks a row that the scraping loop skips.

# row label -> hand-corrected url (or 'N/A')
amended_urls = {
    4: 'https://www.iihs.org/ratings/vehicle/volkswagen/golf-4-door-hatchback/2017',
    5: 'https://www.iihs.org/ratings/vehicle/Nissan/leaf-4-door-hatchback/2017',
    6: 'https://www.iihs.org/ratings/vehicle/volkswagen/gti-4-door-hatchback/2017',
    7: 'https://www.iihs.org/ratings/vehicle/nissan/juke-4-door-hatchback/2017',
    12: 'https://www.iihs.org/ratings/vehicle/hyundai/elantra-4-door-hatchback/2017',
    17: 'https://www.iihs.org/ratings/vehicle/nissan/juke-4-door-hatchback/2017',
    25: 'https://www.iihs.org/ratings/vehicle/ford/fusion-4-door-sedan/2017',
    27: 'https://www.iihs.org/ratings/vehicle/ford/fusion-4-door-sedan/2017',
    32: 'https://www.iihs.org/ratings/vehicle/Toyota/camry-4-door-sedan/2017',
    46: 'https://www.iihs.org/ratings/vehicle/dodge/charger-4-door-sedan/2017',
    52: 'https://www.iihs.org/ratings/vehicle/honda/accord-2-door-coupe/2017',
    53: 'https://www.iihs.org/ratings/vehicle/dodge/challenger-2-door-coupe/2017',
    55: 'N/A',
    56: 'https://www.iihs.org/ratings/vehicle/ford/mustang-2-door-coupe/2017',
    59: 'https://www.iihs.org/ratings/vehicle/lexus/ct-200h-4-door-hatchback/2017',
    64: 'https://www.iihs.org/ratings/vehicle/bmw/3-series-4-door-sedan/2020',
    71: 'https://www.iihs.org/ratings/vehicle/Hyundai/accent-4-door-sedan/2017',
    72: 'https://www.iihs.org/ratings/vehicle/subaru/crosstrek-4-door-wagon/2020',
    74: 'https://www.iihs.org/ratings/vehicle/Fiat/500l-4-door-hatchback/2017',
    76: 'https://www.iihs.org/ratings/vehicle/Ford/focus-4-door-sedan/2017',
    78: 'N/A',
    79: 'https://www.iihs.org/ratings/vehicle/Nissan/versa-note-4-door-hatchback/2017',
    106: 'https://www.iihs.org/ratings/vehicle/jeep/wrangler-2-door-suv/2017',
    136: 'https://www.iihs.org/ratings/vehicle/jeep/wrangler-4-door-suv/2017',
    158: 'https://www.iihs.org/ratings/vehicle/gmc/yukon-4-door-suv/2017',
    159: 'https://www.iihs.org/ratings/vehicle/Chevrolet/suburban-4-door-suv/2017',
    160: 'https://www.iihs.org/ratings/vehicle/ford/expedition-4-door-suv/2017',
    162: 'N/A',
    165: 'https://www.iihs.org/ratings/vehicle/lexus/nx-4-door-suv/2020',
    173: 'https://www.iihs.org/ratings/vehicle/lexus/nx-4-door-suv/2020',
    175: 'N/A',
    178: 'N/A',
    202: 'https://www.iihs.org/ratings/vehicle/Chevrolet/silverado-1500-extended-cab-pickup/2017',
    208: 'https://www.iihs.org/ratings/vehicle/ford/f-150-extended-cab-pickup/2017',
    209: 'https://www.iihs.org/ratings/vehicle/Ram/1500-extended-cab-pickup/2017',
    211: 'https://www.iihs.org/ratings/vehicle/chevrolet/silverado-1500-extended-cab-pickup/2017',
}
for row_label, fixed_url in amended_urls.items():
    iihs_rating.loc[row_label, 'amended_url'] = fixed_url

# Rows labelled 213 through 221 get no url. A label slice only touches rows that
# exist; assigning to a single label past the end of the frame (as
# `.loc[221, ...] = ...` would do if the last label is 220) makes pandas append
# a new, otherwise empty row.
iihs_rating.loc[213:221, 'amended_url'] = 'N/A'

# cars that have 1 or more important features missing
# (this runs last, so it overrides the urls entered above for rows 79, 136, 158, 159 and 160)
missing_feats = [14, 39, 44, 62, 63, 66, 68, 69, 79, 136, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 167, 169, 176, 177]
for num in missing_feats:
    iihs_rating.loc[num,'amended_url'] = 'N/A'

### Scraping and saving as HTML

In [61]:
from bs4 import BeautifulSoup
import requests

In [68]:
# Download the ratings page for every row and save it as iihsrating_sites/<row label>.html
for i in iihs_rating.index:
    amended = iihs_rating.loc[i, 'amended_url']
    if pd.isnull(amended):
        # no manual correction: fall back to the generated url
        url = iihs_rating.loc[i, 'predicted_url']
    elif amended == 'N/A':
        # row marked as having no usable page
        continue
    else:
        url = amended

    # timeout: without one, a single unresponsive request blocks the run forever
    temp = requests.get(url, timeout=30)
    if not temp.ok:
        # do not save an error page as if it were a ratings page; report it instead
        print('skipping row {}: HTTP {} for {}'.format(i, temp.status_code, url))
        continue

    # name the parser explicitly so the saved markup does not depend on which
    # parser libraries happen to be installed (same parser as the extraction cells)
    soup = BeautifulSoup(temp.text, 'html.parser')

    # the with-block closes the file even if the write fails
    with open("iihsrating_sites/" + str(i) + ".html", "w") as file:
        file.write(str(soup))

### Extract features from test results using beautiful soup

In [209]:
from os import listdir
from os.path import isfile, join

In [210]:
# Parse one saved page (row 0) to try the extraction on.
# BeautifulSoup reads the whole file while building the tree, so the handle can
# be closed as soon as the with-block ends.
with open("iihsrating_sites/0.html", "r") as file:
    soup = BeautifulSoup(file, 'html.parser')

In [211]:
# Each rating is read from the first <span> found three parse-tree steps
# (.next_element x3) after the link whose href names the test.
small_overlap_driver = soup.find(href="#small-overlap-front-driver-side").next_element.next_element.next_element.find('span').text
moderate_overlap_front = soup.find(href="#moderate-overlap-front").next_element.next_element.next_element.find('span').text
side = soup.find(href="#side").next_element.next_element.next_element.find('span').text
roof_strength = soup.find(href="#roof-strength").next_element.next_element.next_element.find('span').text
head_restraints_and_seats = soup.find(href="#head-restraints-and-seats").next_element.next_element.next_element.find('span').text

# `string=` is the current name of the deprecated `text=` argument of find()
curb_weight = soup.find(string='Curb weight').next_element.next_element.text[:-4] #curb weight in pounds (last 4 characters dropped)
curb_weight_variant = soup.find(id='roof-strength').find('kbd').text #the model variant tested for roof-strength, thus applies to curb weight

In [212]:
# Collect the names (without the '.html' extension) of all scraped pages.
# Filtering on the extension skips stray files such as macOS's .DS_Store, and,
# unlike removing one hard-coded name, does not fail when that file is absent.
file_names = [
    name[:-5]
    for name in listdir('iihsrating_sites')
    if name.endswith('.html') and isfile(join('iihsrating_sites', name))
]

In [213]:
def _rating_text(page, anchor):
    '''
    Return the text of the first <span> found three parse-tree steps after the
    link in `page` whose href equals `anchor`.

    Raises AttributeError when a step of the lookup yields None (e.g. the link
    is not on the page).
    '''
    return page.find(href=anchor).next_element.next_element.next_element.find('span').text


for file_name in file_names:
    # the with-block closes each page's file handle once it has been parsed
    with open("iihsrating_sites/" + file_name + ".html", "r") as file:
        soup = BeautifulSoup(file, 'html.parser')

    # the file name is the row label the page was scraped for
    row = int(file_name)

    # these three are not guarded: a page without them stops the loop
    small_overlap_driver = _rating_text(soup, "#small-overlap-front-driver-side")
    moderate_overlap_front = _rating_text(soup, "#moderate-overlap-front")
    side = _rating_text(soup, "#side")

    # roof strength and curb weight are handled as a group: if any of the three
    # lookups fails, all three are recorded as missing
    try:
        roof_strength = _rating_text(soup, "#roof-strength")
        # `string=` is the current name of the deprecated `text=` argument of find()
        curb_weight = soup.find(string='Curb weight').next_element.next_element.text[:-4] #curb weight in pounds (last 4 characters dropped)
        curb_weight_variant = soup.find(id='roof-strength').find('kbd').text #the model variant tested for roof-strength, thus applies to curb weight
    except AttributeError:
        roof_strength = None
        curb_weight = None
        curb_weight_variant = None

    try:
        head_restraints_and_seats = _rating_text(soup, "#head-restraints-and-seats")
    except AttributeError:
        head_restraints_and_seats = None

    iihs_rating.loc[row, 'small_overlap_driver'] = small_overlap_driver
    iihs_rating.loc[row, 'moderate_overlap_front'] = moderate_overlap_front
    iihs_rating.loc[row, 'side'] = side
    iihs_rating.loc[row, 'roof_strength'] = roof_strength
    iihs_rating.loc[row, 'curb_weight'] = curb_weight
    iihs_rating.loc[row, 'curb_weight_variant'] = curb_weight_variant
    iihs_rating.loc[row, 'head_restraints_and_seats'] = head_restraints_and_seats