In [1]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

from config import ckey

### Key Variable List

1. state_pop_df = state name, population, and state code taken directly from the Census API. Imported as JSON and converted to a dataframe.
2. state_list_df = subset of state_pop_df where the state population exceeds 5,000,000 people
3. state_code_df = list of states and abbreviation (example Minnesota = MN) taken from https://worldpopulationreview.com/states/state-abbreviations as a table read from HTML code.
4. state_dict = a dictionary created from state_code_df so that a state's abbreviation is easy to look up.



In [2]:

# Fetch state names and populations from the Census PEP endpoint (2014 data set).
# The API key comes from the `config` import at the top of the notebook; a key must
# never be pasted into a cell, not even inside a comment.
census_url = f"https://api.census.gov/data/2014/pep/natstprc?get=STNAME,POP&DATE_=7&for=state:*&key={ckey}"
census_reply = requests.get(census_url, timeout=30)  # bounded wait instead of hanging forever
# Stop here with the HTTP status if the request was rejected, rather than failing
# later on a confusing JSON decode error.
census_reply.raise_for_status()
response = census_reply.json()


In [3]:
# The first entry of the Census payload holds the column names; every entry
# after it is one data row.
census_header = response[0]
census_rows = response[1:]
state_pop_df = pd.DataFrame(data=census_rows, columns=census_header)


In [5]:
# POP is converted to numbers so it can be compared against the threshold.
state_pop_df["POP"] = pd.to_numeric(state_pop_df["POP"])
# Keep only the states with more than five million inhabitants.
is_large_state = state_pop_df["POP"].gt(5_000_000)
state_list_df = state_pop_df.loc[is_large_state]
print(f"Number of states to investigate: {state_list_df.shape[0]}")
state_list_df

Number of states to investigate: 22


Unnamed: 0,STNAME,POP,DATE_,state
2,Arizona,6731484,7,4
4,California,38802500,7,6
5,Colorado,5355866,7,8
9,Florida,19893297,7,12
10,Georgia,10097343,7,13
13,Illinois,12880580,7,17
14,Indiana,6596855,7,18
20,Maryland,5976407,7,24
21,Massachusetts,6745408,7,25
22,Michigan,9909877,7,26


In [14]:
# getting state abbreviation codes
pop_review_url = "https://worldpopulationreview.com/states/state-abbreviations"
# setup for splinter
# NOTE(review): the chromedriver location is a machine-specific absolute path;
# consider moving it into config alongside the API key.
executable_path = {'executable_path': 'c:/bin/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(pop_review_url)
    # Parse the HTML of the page the browser loaded. Previously the URL was handed
    # straight to pandas, which downloaded the page a second time and left the
    # browser session unused. StringIO is used because pandas expects literal HTML
    # to be wrapped in a file-like object.
    tables = pd.read_html(StringIO(browser.html))
finally:
    # Always close the browser, even when loading or parsing fails, so no
    # Chrome/chromedriver process is left running.
    browser.quit()
state_code_df = tables[0]

In [15]:
# Discard the 'Abbreviation' column; the remaining columns are what the
# later lookup cells work with.
state_code_df = state_code_df.drop(columns=['Abbreviation'])


In [16]:
# Index the table by state name so rows can be looked up by name.
# Rebinding the result (instead of inplace=True) follows pandas guidance and
# leaves the frame produced by the previous cell untouched.
state_code_df = state_code_df.set_index('State')

In [17]:
# Nested mapping keyed by the frame's index (the state name); each value is
# that row's {column name: value} dictionary.
state_dict = state_code_df.to_dict(orient='index')
print("Testing Dictionnary")
georgia_code = state_dict['Georgia']['Code']
print(f"Abbreviation for Georgia is: {georgia_code}")

Testing Dictionnary
Abbreviation for Georgia is: GA


In [73]:
## getting info from Craigslist
# Put the short form of a state at the end of this address to get the list of
# cities Craigslist has for that state.
base_url = "https://geo.craigslist.org/iso/us/"
# Search terms passed to Craigslist as the make/model filter.
model_list = [
    "subaru", "honda", "toyota", "BMW", "mercedes",
    "ford", "dodge", "chrysler", "chevrolet", "chevy",
]
# Empty results table: a "state" column followed by one column per search term.
state_car_totals_df = pd.DataFrame(columns=["state"] + model_list)
# Placeholder for the row that is built up for each state.
result_dict = dict()

In [74]:
# For every state in state_list_df, add up the Craigslist listings (model years
# 2010-2015) for each search term in model_list across all of the state's city
# sites, and record one result row per state in state_car_totals_df.
REQUEST_TIMEOUT = 30  # seconds; a stalled request now raises instead of blocking the cell forever
car_total_records = []  # one {"state": ..., <model>: total, ...} dict per finished state

for State in state_list_df["STNAME"]:  # iterating through states
    State_AB = state_dict[State]['Code']  # get state abbreviation
    result_dict = {"state": State}  # fresh row for this state

    # The state's geo page lists the Craigslist site of every city in that state.
    CL_response = requests.get(base_url + State_AB, timeout=REQUEST_TIMEOUT)
    # Fail with the HTTP status instead of an IndexError on the lookups below.
    CL_response.raise_for_status()
    # Create BeautifulSoup object; parse with 'lxml'
    CL_soup = BeautifulSoup(CL_response.text, 'lxml')
    CL_results = CL_soup.find_all('div', class_="geo-site-list-container")
    CL_results_1 = CL_results[0].find_all('ul')
    CL_results_2 = CL_results_1[0].find_all('li')  # this is the list of craigslist websites for cities in state

    print(f"State: {State_AB}")

    for model in model_list:  # iterate through the list of models
        counter = 0
        for result in CL_results_2:  # iterating through cities in state
            CL_url = result.find('a')["href"]  # general craigslist address for the city
            # Entries that belong to another state's site (like Chicago for IN) come
            # back as protocol-relative links ("//chicago...") and are skipped.
            if not CL_url.startswith('http'):
                continue

            # Same search as before; passing the filters via `params` lets requests
            # build and encode the query string.
            CL_response = requests.get(
                f"{CL_url}/search/cta",
                params={
                    "auto_make_model": model,
                    "min_auto_year": 2010,
                    "max_auto_year": 2015,
                },
                timeout=REQUEST_TIMEOUT,
            )
            CL_soup = BeautifulSoup(CL_response.text, 'lxml')
            # total count is the total number of this model for sale
            CL_results = CL_soup.find_all('span', class_="totalcount")
            if CL_results:  # a page without a total count adds nothing
                counter += int(CL_results[0].text)

        result_dict[model] = counter  # key is model and value is the summed count
        print(f"Total number of model {model} in {State} is {counter}")

    # DataFrame.append no longer exists in pandas 2.x. The table is rebuilt from the
    # collected rows after every state, so partial results survive an interrupted
    # run and re-running the cell does not duplicate rows.
    car_total_records.append(result_dict)
    state_car_totals_df = pd.DataFrame(car_total_records, columns=["state", *model_list])

State: AZ
 https://flagstaff.craigslist.org has 1 subaru
 https://mohave.craigslist.org has 0 subaru
 https://phoenix.craigslist.org has 96 subaru
 https://prescott.craigslist.org has 0 subaru
 https://showlow.craigslist.org has 0 subaru


KeyboardInterrupt: 

In [59]:
# Show the per-state totals gathered by the scraping loop
# (one row per state, one column per search term).
state_car_totals_df

Unnamed: 0,state,subaru,honda,toyota,BMW,mercedes,ford,dodge,chrysler,chevrolet,chevy
0,Arizona,131,879,946,472,334,2098,667,328,1409,224
1,California,1160,4718,6763,3502,2257,7794,1547,775,4262,844


In [65]:
# Debug inspection: the city <li> entries from the state page most recently
# parsed by the scraping loop.
CL_results_2

[<li><a href="https://bloomington.craigslist.org">bloomington</a></li>,
 <li><a href="https://evansville.craigslist.org">evansville</a></li>,
 <li><a href="https://fortwayne.craigslist.org">fort wayne</a></li>,
 <li><a href="https://indianapolis.craigslist.org"><b>indianapolis</b></a></li>,
 <li><a href="https://kokomo.craigslist.org">kokomo</a></li>,
 <li><a href="https://tippecanoe.craigslist.org">lafayette / west lafayette</a></li>,
 <li><a href="https://muncie.craigslist.org">muncie / anderson</a></li>,
 <li><a href="https://richmondin.craigslist.org">richmond</a></li>,
 <li><a href="https://southbend.craigslist.org">south bend / michiana</a></li>,
 <li><a href="https://terrehaute.craigslist.org">terre haute</a></li>,
 <li><br/><a href="//chicago.craigslist.org/nwi/">northwest indiana</a> (subregion of chicago site)
 
     </li>]

In [66]:
# Debug inspection: the last address assigned to CL_url by the scraping loop.
CL_url

'//chicago.craigslist.org/nwi/'

In [71]:
# Debug inspection: the three-character prefix that the scraping loop compares
# against 'htt' to decide whether a city link is searched.
CL_url[0:3]



'//c'