In [104]:
import json
import time
from urllib.parse import quote

import pandas as pd
import requests
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from bs4 import BeautifulSoup
from splinter import Browser
from sqlalchemy import create_engine, func,  inspect, distinct
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy.types import Integer, Text, String, DateTime

from config import ckey, username, password #ckey is key for census database, username and password are for PostgreSQL


### Getting Data from Internet Sources
The code below gets data from the following sources:

1. The census api server is used to get the state population from 2014.


2. The state abbreviations (for example, Minnesota = MN) are taken from worldpopulationreview.com.  They offer the data as JSON and CSV downloads; however, the same data is also shown as a table on their web page, so it is read directly from that table.  


3.  Craigslist is used to get the cars for sale in different states.  First, the state is queried in craigslist and a list of the cities or regions with listings on Craigslist is extracted.  The city URL is then combined with the car makes of interest ["subaru","honda", "toyota","BMW","mercedes","ford","dodge", "chrysler","chevrolet","chevy"], and the model years are restricted to be from 2010 to 2015.  The data from all cities in the state is combined to give the total for a given model for the year.

### Key Variable List

1. state_pop_df = state name, population, code direct from Census.  Imported by JSON and converted to dataframe.
2. state_list_df = subset of state_pop_df where state population exceeds 10,000,000 people
3. state_code_df = list of states and abbreviation (example Minnesota = MN) taken from https://worldpopulationreview.com/states/state-abbreviations as a table read from HTML code.
4. state_dict = a dictionary created from state_code_df so it is easy to look up a state abbreviation.
5. state_car_totals_df = dataframe containing car makes by state
6. model_list = list of car make names to search on craigslist



In [2]:


# Census endpoint for the 2014 data set: asks for STNAME and POP for every state, authenticated with ckey
census_url = f"https://api.census.gov/data/2014/pep/natstprc?get=STNAME,POP&DATE_=7&for=state:*&key={ckey}"
# Fetch first, then decode the JSON body into `response` for the next cell
census_reply = requests.get(census_url)
response = census_reply.json()


In [60]:
# The first row of the census payload supplies the column names; the remaining rows are the records
header_row, *data_rows = response
state_pop_df = pd.DataFrame(data_rows, columns=header_row)
# Peek at the last record only
state_pop_df.tail(1)

Unnamed: 0,STNAME,POP,DATE_,state
51,Puerto Rico Commonwealth,3548397,7,72


In [102]:
# Population (number of people) a state must exceed to be investigated
MIN_STATE_POPULATION = 10000000

# Clean the census frame in a single chain so every step yields a new frame:
#   1. remove the Puerto Rico row,
#   2. drop the DATE_ and state columns (errors='ignore' keeps the cell re-runnable
#      after the columns are already gone),
#   3. convert POP from text to a numeric dtype.
# Using .assign() instead of item assignment on the filtered frame avoids pandas'
# chained-assignment warning.
state_pop_df = (
    state_pop_df[state_pop_df['STNAME'] != "Puerto Rico Commonwealth"]
    .drop(columns=['DATE_', 'state'], errors='ignore')
    .assign(POP=lambda frame: pd.to_numeric(frame["POP"]))
)

# Keep only the states whose population is above the threshold
state_list_df = state_pop_df[state_pop_df["POP"] > MIN_STATE_POPULATION]
print(f"Number of states to investigate: {len(state_list_df)}")
state_list_df

Number of states to investigate: 8


Unnamed: 0,STNAME,POP,Abb
4,California,38802500,CA
9,Florida,19893297,FL
10,Georgia,10097343,GA
13,Illinois,12880580,IL
32,New York,19746227,NY
35,Ohio,11594163,OH
38,Pennsylvania,12787209,PA
43,Texas,26956958,TX


In [5]:
#getting state abbreviation codes
pop_review_url ="https://worldpopulationreview.com/states/state-abbreviations"
#setup for splinter
# NOTE(review): the chromedriver location below is specific to one machine — adjust before running elsewhere
executable_path = {'executable_path': 'c:/bin/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

try:
    browser.visit(pop_review_url)
    # pd.read_html is given the URL, so it downloads the page itself rather than
    # reading the page that was loaded in the browser window
    tables = pd.read_html(pop_review_url)
finally:
    # always close the Chrome window, even when the visit or the table parsing fails
    browser.quit()

# the first table on the page is used as the state abbreviation table
state_code_df = tables[0]

In [6]:
# Remove the 'Abbreviation' column; later cells read the 'Code' column instead
state_code_df = state_code_df.drop(columns=['Abbreviation'])


In [7]:
# Index the table by state name so it can be turned into a name-keyed dictionary
state_code_df = state_code_df.set_index('State')

In [8]:
# One entry per index value (state name), each holding that row's columns as a dict
state_dict = state_code_df.to_dict(orient='index')

# Quick sanity check of the lookup using a known state
georgia_code = state_dict['Georgia']['Code']
print(f"Testing Dictionnary")
print(f"Abbreviation for Georgia is: {georgia_code}")

Testing Dictionnary
Abbreviation for Georgia is: GA


In [64]:
# Look up each state's code in state_dict and store it in a new "Abb" column
state_pop_df["Abb"] = state_pop_df["STNAME"].map(lambda state_name: state_dict[state_name]['Code'])


In [12]:
## getting info from Craigslist
#put short form of state at end to get list of cities in craigslist for state
base_url = "https://geo.craigslist.org/iso/us/"
# car makes to search for on Craigslist
model_list = ["subaru","honda", "toyota","BMW","mercedes","ford","dodge", "chrysler","chevrolet","chevy"]
# empty results table: a "state" column followed by one column per make
state_car_totals_df = pd.DataFrame(columns = ["state"] + model_list)
# placeholder dictionary for the totals of a single state
result_dict = dict()

In [13]:
#Code for getting total listings by state for each make

def _craigslist_city_urls(state_code):
    """Return the complete Craigslist site URLs listed for one state.

    state_code is the state abbreviation that is appended to base_url.
    Raises IndexError when the page has no "geo-site-list-container" block.
    """
    geo_response = requests.get(base_url + state_code, timeout=30) #query to get list of cities
    # Create BeautifulSoup object; parse with 'lxml'
    geo_soup = BeautifulSoup(geo_response.text, 'lxml')
    containers = geo_soup.find_all('div', class_="geo-site-list-container")
    city_items = containers[0].find_all('ul')[0].find_all('li') #list of craigslist sites for the state
    city_links = [item.find('a')["href"] for item in city_items]
    # Craigslist gives an incomplete url when it suggests a city that is out of state but close
    # enough that people may go there to look at cars (like Chicago for IN).  Keep only the
    # complete urls so those areas are excluded.
    return [link for link in city_links if link.startswith('http')]


def _count_listings(city_url, make):
    """Return how many listings one city site reports for a make, model years 2010 to 2015."""
    search_url = city_url+f"/search/cta?auto_make_model={make}&min_auto_year=2010&max_auto_year=2015"
    search_response = requests.get(search_url, timeout=30)
    search_soup = BeautifulSoup(search_response.text, 'lxml')
    total_tags = search_soup.find_all('span', class_="totalcount") #total count is total number for sale
    # when the search returns no "totalcount" element there is nothing to add
    return int(total_tags[0].text) if total_tags else 0


state_records = [] #one dictionary of make totals per state
for State in state_list_df["STNAME"]: #iterating through states
    State_AB = state_dict[State]['Code'] #get state abbreviation from state_dict
    city_urls = _craigslist_city_urls(State_AB) #fetched once per state and reused for every make
    print(f"State: {State_AB}") #printing abbreviation of state

    result_dict = {"state": State} #tracks all auto make totals for the state
    for model in model_list: #iterate through list of manufacturers
        #counter is the total number of cars for this maker across all cities in the state
        counter = sum(_count_listings(city_url, model) for city_url in city_urls)
        result_dict[model] = counter #key is model and value is counter
        print(f"Total number of model {model} in {State} is {counter}")
    state_records.append(result_dict)

# Build the frame once from the collected records instead of appending row by row;
# re-running this cell therefore rebuilds the table rather than duplicating rows.
state_car_totals_df = pd.DataFrame(state_records, columns=["state", *model_list])

State: CA
Total number of model subaru in California is 1160
Total number of model honda in California is 4710
Total number of model toyota in California is 6767
Total number of model BMW in California is 3507
Total number of model mercedes in California is 2259
Total number of model ford in California is 7798
Total number of model dodge in California is 1552
Total number of model chrysler in California is 777
Total number of model chevrolet in California is 4261
Total number of model chevy in California is 845
State: FL
Total number of model subaru in Florida is 202
Total number of model honda in Florida is 1196
Total number of model toyota in Florida is 1696
Total number of model BMW in Florida is 1189
Total number of model mercedes in Florida is 1013
Total number of model ford in Florida is 3388
Total number of model dodge in Florida is 1003
Total number of model chrysler in Florida is 411
Total number of model chevrolet in Florida is 1964
Total number of model chevy in Florida is 3

In [15]:
# Merge makes that belong together into a single column each:
#   chrysler + dodge  -> Chrysler_Dodge
#   chevrolet + chevy -> Chevrolet
merged_makes = {
    "Chrysler_Dodge": ("chrysler", "dodge"),
    "Chevrolet": ("chevrolet", "chevy"),
}
for combined_name, (first_make, second_make) in merged_makes.items():
    state_car_totals_df[combined_name] = state_car_totals_df[first_make] + state_car_totals_df[second_make]

In [17]:
# The four source columns are no longer needed once their combined columns exist
merged_source_columns = ['chevrolet','chevy', 'chrysler','dodge']
state_car_totals_df = state_car_totals_df.drop(columns=merged_source_columns)

In [118]:
# Show the per-state totals table (bare expression so the notebook renders the frame)
state_car_totals_df

Unnamed: 0,state,subaru,honda,toyota,BMW,mercedes,ford,Chrysler_Dodge,Chevrolet
0,California,1160,4710,6767,3507,2259,7798,2329,5106
1,Florida,202,1196,1696,1189,1013,3388,1414,2337
2,Georgia,42,450,449,183,144,973,487,569
3,Illinois,119,474,500,234,196,1333,643,1005
4,New York,437,717,652,558,371,1508,633,1105
5,Ohio,162,467,397,113,85,1347,579,1105
6,Pennsylvania,197,289,318,195,173,1044,361,713
7,Texas,177,1271,1974,847,804,4719,1571,3204


## Connecting to and writing to SQL database

In [87]:
#defining and creating engine
# Percent-encode the credentials so characters such as "@", ":" or "/" inside them
# cannot be mistaken for delimiters of the database URL.
db_user = quote(username, safe="")
db_password = quote(password, safe="")
engine = create_engine(f'postgresql://{db_user}:{db_password}@localhost:5432/state_car_db')
#checking names of tables contained in sql database
Base = automap_base()
# NOTE(review): newer SQLAlchemy releases prefer Base.prepare(autoload_with=engine) over
# reflect=True — confirm against the installed version before changing.
Base.prepare(engine, reflect=True)
Base.classes.keys()


['State_Cars', 'State_Population']

In [88]:
# Grab the automap-generated classes for the car and population tables
cars = Base.classes.State_Cars
population = Base.classes.State_Population


In [89]:
#setting up inspector and confirming table names
# inspect(engine) yields the inspector object reused by the following cells;
# the table-name list is the cell's displayed output
inspector = inspect(engine)
inspector.get_table_names()

['State_Cars', 'State_Population']

In [90]:
# Print the name and SQL type of every column in the State_Population table
columns = inspector.get_columns('State_Population')
for column in columns:
    print(column['name'], column["type"])

index_state INTEGER
state VARCHAR(255)
population INTEGER
abbreviation VARCHAR(3)


In [91]:
# Print the name and SQL type of every column in the State_Cars table
columns = inspector.get_columns('State_Cars')
for column in columns:
    print(column['name'], column["type"])

index_car INTEGER
state VARCHAR(255)
subaru INTEGER
honda INTEGER
toyota INTEGER
BMW INTEGER
mercedes INTEGER
ford INTEGER
Chrysler_Dodge INTEGER
Chevrolet INTEGER


In [92]:
# Open an ORM session bound to the engine
# NOTE(review): no cell visible below uses or closes this session — confirm it is still needed
session = Session(engine)

In [114]:
# SQL column types for the population table
population_dtypes = {"STNAME": Text, "POP": Integer, "Abb": Text}

# Write the population frame, replacing any existing State_Population table;
# the dataframe index is written as a column as well
state_pop_df.to_sql(
    name='State_Population',
    con=engine,
    if_exists='replace',
    index=True,
    chunksize=500,
    dtype=population_dtypes,
)

In [115]:
# SQL column types for the car table: text for the state name, integer for every make total
make_columns = ["subaru", "honda", "toyota", "BMW", "mercedes", "ford", "Chrysler_Dodge", "Chevrolet"]
car_dtypes = {"state": Text, **{make: Integer for make in make_columns}}

# Write the car totals frame, replacing any existing State_Cars table;
# the dataframe index is written as a column as well
state_car_totals_df.to_sql(
    name='State_Cars',
    con=engine,
    if_exists='replace',
    index=True,
    chunksize=500,
    dtype=car_dtypes,
)

In [116]:
#creating primary key from the state name column of the population table
# engine.begin() opens a transaction that is committed when the block exits, and
# sqlalchemy.text() wraps the raw SQL so the statement is accepted by Connection.execute.
with engine.begin() as con:
    con.execute(sqlalchemy.text('ALTER TABLE "State_Population" ADD PRIMARY KEY ("STNAME");'))

In [117]:
#creating primary key from the state column of the car table
# engine.begin() opens a transaction that is committed when the block exits, and
# sqlalchemy.text() wraps the raw SQL so the statement is accepted by Connection.execute.
with engine.begin() as con:
    con.execute(sqlalchemy.text('ALTER TABLE "State_Cars" ADD PRIMARY KEY ("state");'))