In [8]:
from scraper import scrape
from pprint import pprint
import pandas as pd
import csv
import json

# List of states

Reads the CSV file of all states and builds a list of the state names in lowercase.

In [9]:
# Use Pandas to read data.
# The second positional argument of read_csv is the delimiter (sep), not the
# encoding, so the encoding has to be passed by keyword.
states_pd = pd.read_csv("states.csv", encoding="utf-8")

# Convert the State_Name column of the df to a plain list
states = states_pd['State_Name'].values.tolist()

# Convert list to lowercase values
states_lower = [state.lower() for state in states]

print(states_lower)

['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'new york', 'north carolina', 'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'west virginia', 'wisconsin', 'wyoming']


  


# Scraper Function

This function takes a state name as a string, appends it to the base URL, and scrapes the resulting <br> 
usnews.com Best States page for that state. The output is a dictionary of that state's stats and rankings.

In [10]:
def get_states(state):
    """Scrape the usnews.com Best States page of a single state.

    Parameters
    ----------
    state : str
        State name exactly as it should appear at the end of the URL; it is
        appended verbatim to the base URL (e.g. ``"new-york"``).

    Returns
    -------
    dict
        A dictionary with the keys:

        * ``"Image URL"`` - ``src`` attribute of the state's profile image
        * ``"Overall Rating"`` - overall rating text as scraped
        * ``"Overall Rating Number"`` - the same rating with ``#`` removed, as int
        * ``"State"`` - ``state`` with hyphens replaced by spaces
        * ``"Stats Dictionary"`` - quick stats, label -> value
        * ``"Rank Dictionary"`` - list of per-category records with the keys
          ``Category``, ``Rank``, ``Value`` and ``Percentage``

    Notes
    -----
    The CSS class names are hard-coded. Nothing here guards against a lookup
    that finds no match, so any failure while navigating the page propagates
    to the caller as an exception.
    """
    # Initialize base url
    base_url = "https://www.usnews.com/news/best-states/"

    # Number of ranked states; used to turn a rank into a percentage
    total_states = 50

    # Create final url using "state" and scrape it
    state_scraped = scrape(base_url + state)

    # Find all quick-stat <dl> elements inside the stats grid
    stat_lists = state_scraped.find("div", class_="Grid-s1x0i6w9-0").find_all("dl", class_="QuickStats-s9hlu1u-0")

    # Flatten the text of every child of the first three <dl> elements
    stat_texts = [child.text for stat_list in stat_lists[:3] for child in stat_list]

    # Even positions are used as labels, odd positions as their values
    quick_stats_dict = dict(zip(stat_texts[::2], stat_texts[1::2]))

    # Container that holds both the overall rating and the category rankings
    rankings_cell = state_scraped.find("div", class_="Cell-s1jgw6rh-0")
    rank_list = rankings_cell.find("ul")

    # Overall rating text, and the same value as an integer for calculations
    overall_rating = rankings_cell.find_all("div", class_="DonutMeter__Wrapper-s1jo49pn-0 gNXJuS")[0].text
    overall_rating_num = int(overall_rating.replace("#", ""))

    # Collect the title and the rank text of every entry of the ranking list
    ranking_titles = []
    rank_labels = []
    for item in rank_list:
        ranking_titles.append(item.find("span").text)
        rank_labels.append(item.find("b").text)

    # Must remove # from each number & convert to int to perform calculations
    rank_values = [int(label.replace("#", "")) for label in rank_labels]

    # Calculate "rank" into "percentage"
    rank_percentages = [(1 - value/total_states)*100 for value in rank_values]

    # Save ranking values into dataframe
    rankings_df = pd.DataFrame({'Category': ranking_titles,
                                'Rank': rank_labels,
                                'Value': rank_values,
                                'Percentage': rank_percentages
                               })

    # Turn dataframe into a list of one dictionary per row; the first and the
    # last row are dropped (NOTE(review): reason not visible here - confirm)
    rank_records = rankings_df.to_dict(orient='records')[1:-1]

    # Find URL of state image
    image_url = state_scraped.find("div", class_="Profile__ProfileWakanda-h5rw0b-2")\
    .find("div", class_= "s85n6m5-0-Box-cwadsP cBeNoA")\
    .find("picture", class_="Image__Picture-afx55j-0").find("img")["src"]

    new_state = state.replace("-", " ")

    # Create new dictionary from all variables
    state_dict = {
        "Image URL": image_url,
        "Overall Rating": overall_rating,
        "Overall Rating Number": overall_rating_num,
        "State": new_state,
        "Stats Dictionary": quick_stats_dict,
        "Rank Dictionary": rank_records
    }

    return state_dict

In [11]:
# States to scrape, written the way they appear in the URL (hyphens, no spaces)
state_list = [
    'arizona',
    'california',
    'colorado',
    'illinois',
    'kentucky',
    'florida',
    'georgia',
    'massachusetts',
    'michigan',
    'minnesota',
    'new-york',
    'north-carolina',
    'ohio',
    'south-carolina',
    'tennessee',
    'texas',
    'virginia',
    'washington',
]

# Function to scrape multiple states

This function takes a LIST of states and loops through it, calling the state scraper function on <br>
each state. From the output, it builds a list with one dictionary per state and writes it to a JSON file (`states_results.json`).

In [6]:
def combine_states(list_states):
    """Scrape every state in ``list_states`` and write the results to JSON.

    Each state is passed to ``get_states``. A state whose scrape raises is
    reported on stdout together with the exception and then skipped, so one
    failing page does not abort the whole run.

    Parameters
    ----------
    list_states : iterable of str
        State names in the form expected by ``get_states``.

    Returns
    -------
    None
        The result is written to ``states_results.json`` in the current
        working directory as a list of ``{state: state_dict}`` objects.
    """
    all_states_dict = []

    for state in list_states:
        try:
            state_dict = get_states(state)
        except Exception as exc:
            # Catch Exception rather than everything, so that Ctrl-C
            # (KeyboardInterrupt) still stops the loop, and show the cause
            # instead of hiding it.
            print(f'An error occurred for state {state}: {exc!r}')
        else:
            all_states_dict.append({state: state_dict})

    # Write the list of per-state dictionaries to a JSON file
    with open('states_results.json', 'w') as fp:
        json.dump(all_states_dict, fp)

In [7]:
# Scrape every state in state_list and write the results to states_results.json
combine_states(state_list)

An error occurred for state california
An error occurred for state colorado
An error occurred for state illinois
An error occurred for state kentucky
An error occurred for state florida
An error occurred for state georgia
An error occurred for state massachusetts
An error occurred for state michigan
An error occurred for state minnesota
An error occurred for state new-york
An error occurred for state north-carolina
An error occurred for state ohio
An error occurred for state south-carolina
An error occurred for state tennessee
An error occurred for state texas
An error occurred for state virginia
An error occurred for state washington


In [62]:
# NOTE(review): test_scraped is not defined anywhere in the visible cells; it
# has to be created by an earlier cell (presumably the result of scrape() on a
# single state page - confirm) before this cell can run on a fresh kernel.
pic_containter = test_scraped.find("div", class_="Profile__ProfileWakanda-h5rw0b-2")\
.find("div", class_= "s85n6m5-0-Box-cwadsP cBeNoA")\
.find("picture", class_="Image__Picture-afx55j-0").find("img")["src"]

# Print the value that was just extracted
print(pic_containter)

https://www.usnews.com/dims4/USNEWS/0a65959/2147483647/crop/4050x1895%2B0%2B802/resize/1000x468/quality/85/?url=http%3A%2F%2Fcom-usnews-beam-media.s3.amazonaws.com%2F49%2F3f%2F614945704eafae7e1a9cf59366bf%2Fbs19-washington-editorial.jpg
