In [1]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

from config import ckey, username, password #ckey is key for census database, username and password are for PostgreSQL


### Key Variable List

1. state_pop_df = state name, population, and state code taken directly from the Census API. Imported as JSON and converted to a dataframe.
2. state_list_df = subset of state_pop_df where the state population exceeds 5,000,000 people.
3. state_code_df = list of states and their abbreviations (for example, Minnesota = MN), read as an HTML table from https://worldpopulationreview.com/states/state-abbreviations.
4. state_dict = dictionary created from state_code_df so that a state's abbreviation is easy to look up.
5. state_car_totals_df = dataframe containing the number of cars of each make by state.
6. model_list = list of car make names to search for on craigslist.



In [2]:

# Fetch each state's name (STNAME) and population (POP) from the Census
# Population Estimates API.
# The API key is read from config.py (see the import cell); never paste the
# literal key into the notebook, not even in a comment.
census_url = f"https://api.census.gov/data/2014/pep/natstprc?get=STNAME,POP&DATE_=7&for=state:*&key={ckey}"
census_response = requests.get(census_url, timeout=30)
# Fail here with the HTTP status instead of a confusing JSON decode error below.
census_response.raise_for_status()
# `response` is a list of rows; the next cell treats row 0 as the column header.
response = census_response.json()


In [3]:
# Row 0 of the census payload is the column header; the remaining rows are data.
census_header = response[0]
census_rows = response[1:]
state_pop_df = pd.DataFrame(data=census_rows, columns=census_header)


In [5]:
# POP is converted to a numeric dtype so it can be compared with the threshold.
state_pop_df["POP"] = pd.to_numeric(state_pop_df["POP"])

# Keep only the states whose population is above five million.
is_large_state = state_pop_df["POP"] > 5000000
state_list_df = state_pop_df[is_large_state]

print(f"Number of states to investigate: {len(state_list_df)}")
state_list_df

Number of states to investigate: 22


Unnamed: 0,STNAME,POP,DATE_,state
2,Arizona,6731484,7,4
4,California,38802500,7,6
5,Colorado,5355866,7,8
9,Florida,19893297,7,12
10,Georgia,10097343,7,13
13,Illinois,12880580,7,17
14,Indiana,6596855,7,18
20,Maryland,5976407,7,24
21,Massachusetts,6745408,7,25
22,Michigan,9909877,7,26


In [14]:
#getting state abbreviation codes
pop_review_url ="https://worldpopulationreview.com/states/state-abbreviations"
#setup for splinter
# NOTE(review): the chromedriver location below is machine-specific; adjust it
# (or move it to config) before running on another machine.
executable_path = {'executable_path': 'c:/bin/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

try:
    browser.visit(pop_review_url)
    # Parse the page the browser actually loaded. Previously read_html was
    # handed the URL and downloaded the page a second time, so the browser
    # session was never used.
    tables = pd.read_html(StringIO(browser.html))
finally:
    # Always close Chrome, even if the visit or the table parsing fails.
    browser.quit()

# The first table on the page is the one the following cells work with.
state_code_df = tables[0]

In [15]:
# Discard the 'Abbreviation' column; the later lookup uses the 'Code' column instead.
state_code_df = state_code_df.drop(columns=['Abbreviation'])


In [16]:
# Index the table by state name so that each row can be looked up by name.
state_code_df = state_code_df.set_index('State')

In [17]:
# Nested lookup keyed by state name; each value is a dict of that row's
# columns, so the abbreviation is read as state_dict[name]['Code'].
state_dict = state_code_df.to_dict(orient='index')

# Quick sanity check of the lookup using one known state.
print("Testing Dictionnary")
georgia_entry = state_dict['Georgia']
print(f"Abbreviation for Georgia is: {georgia_entry['Code']}")

Testing Dictionnary
Abbreviation for Georgia is: GA


In [75]:
## getting info from Craigslist
base_url = "https://geo.craigslist.org/iso/us/" #put short form of state at end to get list of cities in craigslist for state
model_list = [
]
state_car_totals_df = pd.DataFrame(columns = ["state", *model_list])
state_car_totals_df
result_dict = {}

In [76]:
def _city_site_urls(session, state_code):
    """Return the absolute craigslist site URLs listed on one state's geo page.

    Parameters
    ----------
    session : requests.Session
        Session used to fetch the page.
    state_code : str
        State abbreviation appended to ``base_url``.

    Returns
    -------
    list of str
        The ``href`` of every list entry that is an absolute http(s) URL.

    Raises
    ------
    requests.HTTPError
        If the state page request fails.
    RuntimeError
        If the page does not contain the expected site list.
    """
    state_page = session.get(base_url + state_code, timeout=30)
    state_page.raise_for_status()
    state_soup = BeautifulSoup(state_page.text, 'lxml')

    containers = state_soup.find_all('div', class_="geo-site-list-container")
    site_lists = containers[0].find_all('ul') if containers else []
    if not site_lists:
        raise RuntimeError(f"No craigslist site list found for state {state_code}")

    city_urls = []
    for item in site_lists[0].find_all('li'):
        link = item.find('a')
        href = link.get('href') if link is not None else None
        # Only absolute links are searched; per the original author's note this
        # skips entries for areas outside the state (e.g. Chicago listed under IN).
        if href and href.startswith('http'):
            city_urls.append(href)
    return city_urls


def _count_listings(session, city_url, model):
    """Return the listing total one craigslist site reports for a make.

    Searches the site's cars+trucks section (``/search/cta``) for ``model``
    with model years 2010-2015 and reads the page's ``totalcount`` span.
    A page without that span counts as 0 listings.
    """
    search_page = session.get(
        city_url + "/search/cta",
        params={"auto_make_model": model, "min_auto_year": 2010, "max_auto_year": 2015},
        timeout=30,
    )
    search_soup = BeautifulSoup(search_page.text, 'lxml')
    total = search_soup.find('span', class_="totalcount")
    return int(total.text) if total is not None else 0


# One dict per state; the DataFrame is built once at the end because
# DataFrame.append no longer exists in pandas 2.x, and rebuilding from scratch
# keeps this cell from duplicating rows when it is re-run.
car_total_records = []

with requests.Session() as session:  # reuse connections across the many requests
    for State in state_list_df["STNAME"]:  #iterating through states
        State_AB = state_dict[State]['Code']  #get state abbreviation
        city_urls = _city_site_urls(session, State_AB)  # fetched once per state, not once per model
        print(f"State: {State_AB}")

        result_dict = {"state": State}
        for model in model_list:  #iterate through list of models
            counter = sum(_count_listings(session, city_url, model) for city_url in city_urls)
            result_dict[model] = counter  # key is model and value is the state-wide total
            print(f"Total number of model {model} in {State} is {counter}")
        car_total_records.append(result_dict)

state_car_totals_df = pd.DataFrame(car_total_records, columns=["state", *model_list])

State: AZ
Total number of model subaru in Arizona is 131
Total number of model honda in Arizona is 877
Total number of model toyota in Arizona is 945
Total number of model BMW in Arizona is 469
Total number of model mercedes in Arizona is 334
Total number of model ford in Arizona is 2099
Total number of model dodge in Arizona is 665
Total number of model chrysler in Arizona is 328
Total number of model chevrolet in Arizona is 1411
Total number of model chevy in Arizona is 224
State: CA
Total number of model subaru in California is 1161
Total number of model honda in California is 4713
Total number of model toyota in California is 6764
Total number of model BMW in California is 3506
Total number of model mercedes in California is 2258
Total number of model ford in California is 7798
Total number of model dodge in California is 1550
Total number of model chrysler in California is 777
Total number of model chevrolet in California is 4260
Total number of model chevy in California is 843
St

Total number of model dodge in Pennsylvania is 253
Total number of model chrysler in Pennsylvania is 108
Total number of model chevrolet in Pennsylvania is 590
Total number of model chevy in Pennsylvania is 123
State: TN
Total number of model subaru in Tennessee is 76
Total number of model honda in Tennessee is 399
Total number of model toyota in Tennessee is 434
Total number of model BMW in Tennessee is 94
Total number of model mercedes in Tennessee is 113
Total number of model ford in Tennessee is 1096
Total number of model dodge in Tennessee is 244
Total number of model chrysler in Tennessee is 86
Total number of model chevrolet in Tennessee is 599
Total number of model chevy in Tennessee is 117
State: TX
Total number of model subaru in Texas is 177
Total number of model honda in Texas is 1271
Total number of model toyota in Texas is 1974
Total number of model BMW in Texas is 849
Total number of model mercedes in Texas is 804
Total number of model ford in Texas is 4720
Total number 

In [77]:
# Display the per-state totals for every searched make, before any columns are merged.
state_car_totals_df

Unnamed: 0,state,subaru,honda,toyota,BMW,mercedes,ford,dodge,chrysler,chevrolet,chevy
0,Arizona,131,877,945,469,334,2099,665,328,1411,224
1,California,1161,4713,6764,3506,2258,7798,1550,777,4260,843
2,Colorado,782,638,980,402,212,2196,482,148,1099,140
3,Florida,202,1195,1696,1186,1013,3387,999,411,1964,372
4,Georgia,42,450,449,183,144,971,387,100,501,68
5,Illinois,119,474,501,234,196,1331,466,178,775,229
6,Indiana,27,122,121,27,29,426,134,60,250,89
7,Maryland,45,167,101,65,44,238,102,21,142,17
8,Massachusetts,181,390,455,207,162,883,131,55,379,100
9,Michigan,145,326,226,156,151,2234,655,447,1306,303


In [78]:
# Each combined column is the sum of two separately searched makes.
combined_makes = {
    "Chrysler_Dodge": ("chrysler", "dodge"),
    "Chevrolet": ("chevrolet", "chevy"),
}
for combined_name, (first_make, second_make) in combined_makes.items():
    state_car_totals_df[combined_name] = state_car_totals_df[first_make] + state_car_totals_df[second_make]

In [79]:
# Display the table again to check the new Chrysler_Dodge and Chevrolet columns.
state_car_totals_df

Unnamed: 0,state,subaru,honda,toyota,BMW,mercedes,ford,dodge,chrysler,chevrolet,chevy,Chrysler_Dodge,Chevrolet
0,Arizona,131,877,945,469,334,2099,665,328,1411,224,993,1635
1,California,1161,4713,6764,3506,2258,7798,1550,777,4260,843,2327,5103
2,Colorado,782,638,980,402,212,2196,482,148,1099,140,630,1239
3,Florida,202,1195,1696,1186,1013,3387,999,411,1964,372,1410,2336
4,Georgia,42,450,449,183,144,971,387,100,501,68,487,569
5,Illinois,119,474,501,234,196,1331,466,178,775,229,644,1004
6,Indiana,27,122,121,27,29,426,134,60,250,89,194,339
7,Maryland,45,167,101,65,44,238,102,21,142,17,123,159
8,Massachusetts,181,390,455,207,162,883,131,55,379,100,186,479
9,Michigan,145,326,226,156,151,2234,655,447,1306,303,1102,1609


In [81]:
# The four source columns are now covered by Chevrolet and Chrysler_Dodge, so remove them.
merged_source_columns = ['chevrolet', 'chevy', 'chrysler', 'dodge']
state_car_totals_df = state_car_totals_df.drop(columns=merged_source_columns)

In [82]:
# Display the final table: state plus one column per make, after merging and dropping.
state_car_totals_df

Unnamed: 0,state,subaru,honda,toyota,BMW,mercedes,ford,Chrysler_Dodge,Chevrolet
0,Arizona,131,877,945,469,334,2099,993,1635
1,California,1161,4713,6764,3506,2258,7798,2327,5103
2,Colorado,782,638,980,402,212,2196,630,1239
3,Florida,202,1195,1696,1186,1013,3387,1410,2336
4,Georgia,42,450,449,183,144,971,487,569
5,Illinois,119,474,501,234,196,1331,644,1004
6,Indiana,27,122,121,27,29,426,194,339
7,Maryland,45,167,101,65,44,238,123,159
8,Massachusetts,181,390,455,207,162,883,186,479
9,Michigan,145,326,226,156,151,2234,1102,1609


In [None]:
# NOTE(review): `create_engine` is not imported in any visible cell; it is
# presumably sqlalchemy.create_engine -- confirm and add it to the import cell.
# NOTE(review): `engine.execute` is unavailable in SQLAlchemy 2.x -- verify the
# installed version before relying on this cell.
db_url = f'postgresql://{username}:{password}@localhost:5432/state_car_db'
engine = create_engine(db_url)

# Check the database connection by reading back the "State_Abreviations" table.
data = engine.execute('SELECT * FROM "State_Abreviations"')
for row in data:
    print(row)  # print each row to confirm that data is coming back from the database