In [12]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
from splinter import Browser
import time

## Part One - Extract Data from Britannica Website on Populations of State Capitals

In [2]:
## Code below goes to the Britannica web page and extracts the list of state capitals and populations
# Setup for splinter: launches a visible (non-headless) Chrome session through the local chromedriver.
executable_path = {'executable_path': 'c:/bin/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
url = 'https://www.britannica.com/topic/list-of-state-capitals-in-the-United-States-2119210'
try:
    browser.visit(url)
    # NOTE: pd.read_html(url) downloads and parses the page on its own; it does
    # not read from the browser session opened above.
    tables = pd.read_html(url)
    # Keep an independent copy of the first table found on the page.
    state_capitals = tables[0].copy()
finally:
    # Always close the browser, even when the visit or the table parsing raises,
    # so a failed run does not leave a Chrome/chromedriver process behind.
    browser.quit()

In [3]:
# Preview the first five rows of the scraped table (head() defaults to 5 rows).
state_capitals.head()

Unnamed: 0,state,capital,population of capital: census,population of capital: estimated
0,Alabama,Montgomery,"(2010) 205,764","(2018 est.) 198,218"
1,Alaska,Juneau,"(2010) 31,275","(2018 est.) 32,113"
2,Arizona,Phoenix,"(2010) 1,445,632","(2018 est.) 1,660,272"
3,Arkansas,Little Rock,"(2010) 193,524","(2018 est.) 197,881"
4,California,Sacramento,"(2010) 466,488","(2018 est.) 508,529"


In [4]:
#Population data needs to be cleaned (remove date field in front and convert to integer)
#Use Population of capital estimated (is most recent based on Census)
# Raw values look like "(2018 est.) 198,218". The leading "(...)" label is removed
# by pattern rather than by a fixed 12-character offset, so a label of a different
# length (for example "(2020) ") no longer truncates digits or raises ValueError.
estimated_raw = state_capitals.iloc[:, 3].astype(str)  # 4th column, selected by position
population = (
    estimated_raw
    .str.replace(r"^\s*\([^)]*\)\s*", "", regex=True)  # drop the "(2018 est.) " style prefix
    .str.replace(",", "", regex=False)                 # drop thousands separators
    .astype(int)
    .tolist()                                          # plain list of ints, one per row
)

In [5]:
# Attach the cleaned integer populations to the table as a "Population" column,
# then preview the first three rows.
state_capitals = state_capitals.assign(Population=population)
state_capitals.head(n=3)


Unnamed: 0,state,capital,population of capital: census,population of capital: estimated,Population
0,Alabama,Montgomery,"(2010) 205,764","(2018 est.) 198,218",198218
1,Alaska,Juneau,"(2010) 31,275","(2018 est.) 32,113",32113
2,Arizona,Phoenix,"(2010) 1,445,632","(2018 est.) 1,660,272",1660272


In [7]:
# Testing showed that cities with a population below 90000 likely do not have
# Craigslist pages, so only capitals above that threshold are kept.

above_threshold = state_capitals["Population"].gt(90000)
limited_capitals = state_capitals[above_threshold]

In [8]:
# Report how many capitals there were before and after the population filter.
starting_count = len(state_capitals)
remaining_count = len(limited_capitals)
print(f"Starting number of capitals: {starting_count}")
print(f"Number of capitals after filtering (must be >90000) {remaining_count}")

Starting number of capitals: 50
Number of capitals after filtering (must be >90000) 32


In [9]:
# Capital names, as a plain Python list, to use with Craigslist.
capital_list = limited_capitals["capital"].tolist()

[['STNAME', 'POP', 'DATE_', 'state'], ['Alabama', '4849377', '7', '01'], ['Alaska', '736732', '7', '02'], ['Arizona', '6731484', '7', '04'], ['Arkansas', '2966369', '7', '05'], ['California', '38802500', '7', '06'], ['Colorado', '5355866', '7', '08'], ['Connecticut', '3596677', '7', '09'], ['Delaware', '935614', '7', '10'], ['District of Columbia', '658893', '7', '11'], ['Florida', '19893297', '7', '12'], ['Georgia', '10097343', '7', '13'], ['Hawaii', '1419561', '7', '15'], ['Idaho', '1634464', '7', '16'], ['Illinois', '12880580', '7', '17'], ['Indiana', '6596855', '7', '18'], ['Iowa', '3107126', '7', '19'], ['Kansas', '2904021', '7', '20'], ['Kentucky', '4413457', '7', '21'], ['Louisiana', '4649676', '7', '22'], ['Maine', '1330089', '7', '23'], ['Maryland', '5976407', '7', '24'], ['Massachusetts', '6745408', '7', '25'], ['Michigan', '9909877', '7', '26'], ['Minnesota', '5457173', '7', '27'], ['Mississippi', '2994079', '7', '28'], ['Missouri', '6063589', '7', '29'], ['Montana', '102357

In [27]:
# Build a DataFrame from `response`, using its first element as the column
# names and the remaining elements as the rows, then preview it.
# NOTE(review): `response` is not defined in any cell visible in this notebook —
# presumably it came from a cell that has since been removed, in which case this
# cell fails on Restart & Run All until that cell is restored. TODO confirm.
df = pd.DataFrame(response[1:], columns = response[0])
df.head()

Unnamed: 0,STNAME,POP,DATE_,state
0,Alabama,4849377,7,1
1,Alaska,736732,7,2
2,Arizona,6731484,7,4
3,Arkansas,2966369,7,5
4,California,38802500,7,6


In [36]:
# Convert the POP column to numeric values, then keep only the rows whose POP
# exceeds 5,000,000 and display them.
df["POP"] = df["POP"].pipe(pd.to_numeric)
over_five_million = df["POP"] > 5000000
list_df = df[over_five_million]
list_df

Unnamed: 0,STNAME,POP,DATE_,state
2,Arizona,6731484,7,4
4,California,38802500,7,6
5,Colorado,5355866,7,8
9,Florida,19893297,7,12
10,Georgia,10097343,7,13
13,Illinois,12880580,7,17
14,Indiana,6596855,7,18
20,Maryland,5976407,7,24
21,Massachusetts,6745408,7,25
22,Michigan,9909877,7,26


## Second Part - Extracting Information from Craigslist

In [10]:
# Craigslist data extraction
# using requests and BeautifulSoup
CL_url = "https://minneapolis.craigslist.org/d/cars-trucks/search/cta"
# A timeout keeps this cell from hanging forever if the server never answers;
# requests waits indefinitely when no timeout is given.
CL_response = requests.get(CL_url, timeout=30)
time.sleep(2)
# Create BeautifulSoup object; parse with the 'lxml' parser
CL_soup = BeautifulSoup(CL_response.text, 'lxml')

In [11]:
# Variables of interest: the car makes to search for and the cities to search in.
model_list = [
    "subaru", "honda", "toyota", "BMW", "mercedes",
    "ford", "dodge", "chrysler", "chevrolet", "chevy",
]
city_list = capital_list  # same list object as capital_list, not a copy

In [21]:
#extracting data from website.  Creates Unique search field for each city.
#Model years restricted to 2010 to 2015

# One row per make; a column of listing counts is added for every city.
model_df = pd.DataFrame()
model_df["Make"]=model_list

for city in city_list:

    # The Craigslist subdomain depends only on the city, so it is built once per
    # city instead of once per (city, make) pair.
    city_strip = city.replace(" ","") #removing spaces in city names
    if city_strip == "SaintPaul":  #Saint Paul is lumped with Minneapolis on Craigslist
        city_strip = "Minneapolis"

    model_counter = []

    for model in model_list:
        CL_url = f"https://{city_strip}.craigslist.org/search/cta?auto_make_model={model}&min_auto_year=2010&max_auto_year=2015"
        # A timeout keeps one unresponsive request from stalling the whole loop;
        # requests waits indefinitely when no timeout is given.
        CL_response = requests.get(CL_url, timeout=30)
        if not CL_response.ok:
            # An error page has no result counter and would otherwise be
            # indistinguishable from a search with zero listings.
            print(f"Warning: HTTP {CL_response.status_code} for {CL_url}; count for {city}/{model} may be wrong")
        # Create BeautifulSoup object; parse with the 'lxml' parser
        CL_soup = BeautifulSoup(CL_response.text, 'lxml')
        CL_results = CL_soup.find_all('span', class_="totalcount")
        if CL_results:
            # The counter span can appear more than once on the page; the first one is used.
            counter = CL_results[0].text
        else:
            # No counter span found: recorded as zero listings.
            counter = 0
        model_counter.append(int(counter))

    model_df[city]=model_counter

 Montgomery and subaru
[<span class="totalcount">1</span>, <span class="totalcount">1</span>]
 Montgomery and honda
[<span class="totalcount">6</span>, <span class="totalcount">6</span>]
 Montgomery and toyota
[<span class="totalcount">12</span>, <span class="totalcount">12</span>]
 Montgomery and BMW
[<span class="totalcount">3</span>, <span class="totalcount">3</span>]
 Montgomery and mercedes
[<span class="totalcount">1</span>, <span class="totalcount">1</span>]
 Montgomery and ford
[<span class="totalcount">19</span>, <span class="totalcount">19</span>]
 Montgomery and dodge
[<span class="totalcount">7</span>, <span class="totalcount">7</span>]
 Montgomery and chrysler
[]
 Montgomery and chevrolet
[<span class="totalcount">6</span>, <span class="totalcount">6</span>]
 Montgomery and chevy
[<span class="totalcount">1</span>, <span class="totalcount">1</span>]
 Phoenix and subaru
[<span class="totalcount">92</span>, <span class="totalcount">92</span>]
 Phoenix and honda
[<span class=

[<span class="totalcount">57</span>, <span class="totalcount">57</span>]
 Honolulu and chevrolet
[<span class="totalcount">262</span>, <span class="totalcount">262</span>]
 Honolulu and chevy
[<span class="totalcount">41</span>, <span class="totalcount">41</span>]
 Boise and subaru
[<span class="totalcount">82</span>, <span class="totalcount">82</span>]
 Boise and honda
[<span class="totalcount">180</span>, <span class="totalcount">180</span>]
 Boise and toyota
[<span class="totalcount">196</span>, <span class="totalcount">196</span>]
 Boise and BMW
[<span class="totalcount">95</span>, <span class="totalcount">95</span>]
 Boise and mercedes
[<span class="totalcount">87</span>, <span class="totalcount">87</span>]
 Boise and ford
[<span class="totalcount">584</span>, <span class="totalcount">584</span>]
 Boise and dodge
[<span class="totalcount">156</span>, <span class="totalcount">156</span>]
 Boise and chrysler
[<span class="totalcount">54</span>, <span class="totalcount">54</span>]
 B

[<span class="totalcount">1095</span>, <span class="totalcount">1095</span>]
 Saint Paul and dodge
[<span class="totalcount">295</span>, <span class="totalcount">295</span>]
 Saint Paul and chrysler
[<span class="totalcount">174</span>, <span class="totalcount">174</span>]
 Saint Paul and chevrolet
[<span class="totalcount">744</span>, <span class="totalcount">744</span>]
 Saint Paul and chevy
[<span class="totalcount">131</span>, <span class="totalcount">131</span>]
 Jackson and subaru
[]
 Jackson and honda
[<span class="totalcount">7</span>, <span class="totalcount">7</span>]
 Jackson and toyota
[<span class="totalcount">24</span>, <span class="totalcount">24</span>]
 Jackson and BMW
[<span class="totalcount">2</span>, <span class="totalcount">2</span>]
 Jackson and mercedes
[<span class="totalcount">3</span>, <span class="totalcount">3</span>]
 Jackson and ford
[<span class="totalcount">29</span>, <span class="totalcount">29</span>]
 Jackson and dodge
[<span class="totalcount">19</s

[<span class="totalcount">15</span>, <span class="totalcount">15</span>]
 Columbia and mercedes
[<span class="totalcount">10</span>, <span class="totalcount">10</span>]
 Columbia and ford
[<span class="totalcount">89</span>, <span class="totalcount">89</span>]
 Columbia and dodge
[<span class="totalcount">20</span>, <span class="totalcount">20</span>]
 Columbia and chrysler
[<span class="totalcount">4</span>, <span class="totalcount">4</span>]
 Columbia and chevrolet
[<span class="totalcount">32</span>, <span class="totalcount">32</span>]
 Columbia and chevy
[<span class="totalcount">10</span>, <span class="totalcount">10</span>]
 Nashville and subaru
[<span class="totalcount">29</span>, <span class="totalcount">29</span>]
 Nashville and honda
[<span class="totalcount">174</span>, <span class="totalcount">174</span>]
 Nashville and toyota
[<span class="totalcount">214</span>, <span class="totalcount">214</span>]
 Nashville and BMW
[<span class="totalcount">45</span>, <span class="total

In [38]:
# Reshape so that each row is a city and each column is a make.
# "Make" is moved into the index *before* transposing so the listing counts stay
# integers. Transposing the mixed text/number frame and then promoting the first
# row to the header left every column as dtype object.
final_df = model_df.set_index("Make").transpose()
final_df.head(50)

Make,subaru,honda,toyota,BMW,mercedes,ford,dodge,chrysler,chevrolet,chevy
Montgomery,1,6,12,3,1,19,7,0,6,1
Phoenix,92,734,765,382,264,1654,530,255,1058,176
Little Rock,2,32,20,10,6,121,26,9,77,13
Sacramento,288,680,1065,700,438,1293,239,79,548,110
Denver,630,450,705,326,200,1601,319,96,817,83
Hartford,43,75,69,45,31,143,33,17,89,14
Tallahassee,3,4,20,4,8,62,18,1,20,10
Atlanta,36,360,382,140,123,719,307,85,386,49
Honolulu,50,332,583,168,147,505,178,57,262,41
Boise,82,180,196,95,87,584,156,54,328,34


In [39]:
# Combine the chrysler and dodge counts into a single column.
final_df["Chrysler_Dodge"] = final_df["chrysler"].add(final_df["dodge"])

In [40]:
# "chevrolet" and "chevy" were searched separately; add them into one Chevrolet total.
final_df["Chevrolet"] = final_df["chevrolet"].add(final_df["chevy"])

In [41]:
# Preview the first five rows, including the two combined columns.
final_df.head(5)

Make,subaru,honda,toyota,BMW,mercedes,ford,dodge,chrysler,chevrolet,chevy,Chrysler_Dodge,Chevrolet
Montgomery,1,6,12,3,1,19,7,0,6,1,7,7
Phoenix,92,734,765,382,264,1654,530,255,1058,176,785,1234
Little Rock,2,32,20,10,6,121,26,9,77,13,35,90
Sacramento,288,680,1065,700,438,1293,239,79,548,110,318,658
Denver,630,450,705,326,200,1601,319,96,817,83,415,900


In [42]:
# The individual columns are now represented by the combined totals, so remove them.
merged_source_columns = ['chevrolet','chevy', 'chrysler','dodge']
final_df = final_df.drop(columns=merged_source_columns)

In [43]:
# Display the final table of listing counts (one row per city, one column per make).
final_df

Make,subaru,honda,toyota,BMW,mercedes,ford,Chrysler_Dodge,Chevrolet
Montgomery,1,6,12,3,1,19,7,7
Phoenix,92,734,765,382,264,1654,785,1234
Little Rock,2,32,20,10,6,121,35,90
Sacramento,288,680,1065,700,438,1293,318,658
Denver,630,450,705,326,200,1601,415,900
Hartford,43,75,69,45,31,143,50,103
Tallahassee,3,4,20,4,8,62,19,30
Atlanta,36,360,382,140,123,719,392,435
Honolulu,50,332,583,168,147,505,235,303
Boise,82,180,196,95,87,584,210,362
