In [225]:
import warnings
import pandas as pd
import requests
import re
from requests import get
from bs4 import BeautifulSoup



#Disclosure: These list are made by chat GPT
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut","District of Columbia", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "District of Columbia": "DC",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

stock_map = {"Starbucks": "sbux?mod=search_symbol",
            "Dunkin": "",
            "Peet's": "jdep?countrycode=nl&mod=search_symbol",
            "Tim": "qsr?mod=search_symbol",
            "Panera": "",
            "Caribou": "qsr?mod=search_symbol",
            "Au":"yum?mod=search_symbol",
            "The": "jfc?countrycode=ph&mod=search_symbol",
            "McDonald's": "mcd?mod=search_symbol"}


I chose to Use lists to map the state abbreviations instead of writing a seperate function for it. I am also feeding these

In [234]:
def get_price(name):
    """
    Find the stock price for the given restaurant
  
    Parameter
    ---------
    name : str
        The first word of the retaurant name
    
    Returns
    -------
    number for stock price
    """
    
    # Dunkin and Panera are private companies so price is hardcoded
    if name == "Dunkin":
        stock_website = 106.48
    elif name == "Panera":
        stock_website = 314.93
    # Peet's and The coffee are in a different format on the same page so they are seperated
    # I assume this is to do with the foreign markets
    elif name == "Peet's" or name == "The":
        ext = stock_map.get(name)

        stock_website = get_website("https://www.marketwatch.com/investing/stock/" + ext)

        stock_website = stock_website.find("div", {"class":"intraday__data"})

        stock_website = stock_website.find("span")

        stock_website = stock_website.text
    else:
        ext = stock_map.get(name)

        stock_website = get_website("https://www.marketwatch.com/investing/stock/" + ext)

        stock_website = stock_website.find("div", {"class":"intraday__data"})

        stock_website = stock_website.find("bg-quote")

        stock_website = stock_website.text
    
    print(stock_website)
    

In [135]:
def get_website(url):
    
    Headers = {
    "user-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61" 
    }
    
    restaurant = requests.get(url, headers = Headers, timeout = 25)
    restaurant_soup = BeautifulSoup(restaurant.content, "html.parser")
    return restaurant_soup

In [238]:
def rest_loc(url, states = states, state_abbreviations = state_abbreviations):
    """
    Scrapes the menuisim website for the amount of restaurant locations for the link provided
  
    Parameter
    ---------
    url : str
        A url link in quotation marks for the desired restaurant chain
    states : lsit
        A list of specific states wanted
    state_abbreviations : dict
        A dictionary mapping states to the desired abbreviations
    
    Returns
    -------
    dataframe 
    A pandas dataframe containing states, abbreviations, and counts for the desired chain
    
    """
    
    # Get the desired html file
    restaurant_soup = get_website(url)
    
    # Find the list of restaurant locations
    restaurant = restaurant_soup.find("div", {"class":"col-sm-6"})
    # Find all instances (locations)
    restaurant = restaurant.find_all("li")
    
    # Initialize an empty list
    rows = []

    # Iterate over all rows in the retaurant list
    for location in restaurant:
        
        # Creates and empty variable state
        state = ""

        # take each state (already contains the location count)
        state_tag = location.find("a")
        if state_tag is not None:
            state = state_tag.text
        else:
            None

        # Append this data.
        rows.append({
            "State": state
        })

    # Create a DataFrame from the list of dictionaries
    data = pd.DataFrame(rows)
    
    # remove ()
    data["State"] = data["State"].apply(lambda x: x.replace("(", "").replace(")", ""))
    
    # Split the count from the rest of the string
    data[["State","Count"]] = data["State"].str.rsplit(pat = " ",n = 1, expand = True)
    
    # Check for the states and find the right location to split the state to it's own column
    if data["State"].iloc(0) in ("New", "South", "West", "Rhode"):
        data[["State", "name"]] = data["State"].str.split(pat = " ",n = 2, expand = True)
    elif data["State"].iloc(0) == ("District"):
        data[["State", "name"]] = data["State"].str.split(pat = " ",n = 3, expand = True)
    else:
        data[["State", "name"]] = data["State"].str.split(pat = " ",n = 1, expand = True)
    
    # check if the area in state column is in fact a state
    data = data[data["State"].isin(states)]
    
    # take the name of the restaurant (remaining string)
    name = data["name"][1]
    
    # Use the previous string to rename the count column
    data.rename(columns={'Count':name}, inplace=True)
    
    # Drop the extra column
    data.drop("name", axis = 1, inplace = True)
    
    # Add a column with state abbreviations
    data["ST"] = data["State"].map(state_abbreviations)
    
    # Re-organize columns
    data = data[["State", "ST", name]]
    
    data["Stock Price"] = get_price(name.split("", 1)[0])

    return data

In [139]:
def state_pop(url):
    
    """
    takes a link (wikipedia) and returns the state name and population
  
    Parameter
    ---------
    url : str
        A url link in quotation marks for the desired wikipwdia page (state population)
    
    Returns
    -------
    dataframe 
    A pandas dataframe containing states and population
    
    """
    
    wiki = get_website(url)
    wiki.find("table")
    
    rows = []
    
    # iterate over all rows in the faculty table
    for state in wiki.find_all("tr")[1:]:
         # Get all the cells (<td>) in the row.
        cells = state.find_all("td")
        
        # Find the state of the city in cell[1]
        # which for most states is contained in the <i> tag
        state_tag = cells[2].find("a") or cells[2]
        state = state_tag.text

        # which for most populations is contained in the <a> tag
        population_tag = cells[3].find("td") or cells[3]
        population = population_tag.text
        population = population.replace("\n", "")

         # Append this data.
        rows.append({
            "state": state,
            "population": population,
    })
        
    
    return pd.DataFrame(rows)


In [143]:
data1 = rest_loc("https://www.menuism.com/restaurant-locations/starbucks-coffee-39564", states, state_abbreviations)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [79]:
data2 = rest_loc("https://www.menuism.com/restaurant-locations/dunkin-donuts-181624", states, state_abbreviations)

data1 = data1.merge(data2, on=["State", "ST"], how = 'left')

In [80]:
data3 = rest_loc("https://www.menuism.com/restaurant-locations/peets-coffee-tea-84051", states, state_abbreviations)

data1 = data1.merge(data3, on=["State", "ST"], how = 'left')

In [81]:
data4 = rest_loc("https://www.menuism.com/restaurant-locations/tim-hortons-190025", states, state_abbreviations)

data1 = data1.merge(data4, on=["State", "ST"], how = 'left')

In [82]:
data5 = rest_loc("https://www.menuism.com/restaurant-locations/panera-bread-4258", states, state_abbreviations)

data1 = data1.merge(data5, on=["State", "ST"], how = 'left')

In [90]:
data6 = rest_loc("https://www.menuism.com/restaurant-locations/caribou-coffee-164861", states, state_abbreviations)

data1 = data1.merge(data6, on=["State", "ST"], how = 'left')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [31]:
data7 = rest_loc("https://www.menuism.com/restaurant-locations/au-bon-pain-69342", states, state_abbreviations)

data1 = data1.merge(data7, on=["State", "ST"], how = 'left')

In [32]:
data8 = rest_loc("https://www.menuism.com/restaurant-locations/the-coffee-bean-tea-leaf-165988", states, state_abbreviations)

data1 = data1.merge(data8, on=["State", "ST"], how = 'left')

In [142]:
data9 = rest_loc("https://www.menuism.com/restaurant-locations/mcdonalds-21019", states, state_abbreviations)

data1 = data1.merge(data9, on=["State", "ST"], how = 'left')

data_temp = data1

AttributeError: 'NoneType' object has no attribute 'find_all'

In [127]:
# Use the function to scrape the wikipedia page
pop = state_pop("https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population")

# Merge the data
data_pop = data_temp.merge(pop, how='left', left_on='State', right_on='state')

# Move the pop column to after the abbreviation

col = data_pop["population"]
data_pop.drop(columns=["population"], inplace=True)
data_pop.insert(2, col.name,col)


In [128]:
print(data1)

            State population_x  population  ST Starbucks Coffee locations  \
0          Alaska      733,391     733,391  AK                         24   
1         Alabama    5,024,279   5,024,279  AL                         73   
2        Arkansas    3,011,524   3,011,524  AR                         33   
3         Arizona    7,151,502   7,151,502  AZ                        279   
4      California   39,538,223  39,538,223  CA                       2362   
5        Colorado    5,773,714   5,773,714  CO                        371   
6     Connecticut    3,605,944   3,605,944  CT                        107   
7        Delaware      989,948     989,948  DE                         20   
8         Florida   21,538,187  21,538,187  FL                        616   
9         Georgia   10,711,908  10,711,908  GA                        248   
10         Hawaii    1,455,271   1,455,271  HI                         72   
11           Iowa    3,190,369   3,190,369  IA                         49   

# Break

In [None]:

# Initialize an empty list
rows = []

# Iterate over all rows in the faculty table
for location in starbucks:
    
    #print(cells)

    # Find the name of the course in cell[0]
    state_tag = location.find("a")
    if state_tag is not None:
        state = state_tag.text
    else:
        None

    # Append this data.
    rows.append({
        "State": state
    })

# Create a DataFrame from the list of dictionaries
data = pd.DataFrame(rows)

#data["State"] = data["State"].apply(lambda x: x.replace("Starbucks Coffee locations ", "").replace("(", "").replace(")", ""))

#data[["State", "Starbucks Coffee Locations"]] = data["State"].str.rsplit(" ", 1, expand = True)

#data = data[data["State"].isin(states)]

#data["ST"] = data["State"].map(state_abbreviations)

#data = data[["State", "ST", "Starbucks Coffee Locations"]]

#print(data.head())

In [None]:
# Original code
# Initialize an empty list
rows = []

# Iterate over all rows in the faculty table
for location in starbucks:
    
    #print(cells)

    # Find the name of the course in cell[0]
    state_tag = location.find("a")
    if state_tag is not None:
        state = state_tag.text
    else:
        None

    # Append this data.
    rows.append({
        "State": state
    })

# Create a DataFrame from the list of dictionaries
data = pd.DataFrame(rows)

data["State"] = data["State"].apply(lambda x: x.replace("Starbucks Coffee locations ", "").replace("(", "").replace(")", ""))

data[["State", "Starbucks Coffee Locations"]] = data["State"].str.rsplit(" ", 1, expand = True)

data = data[data["State"].isin(states)]

data["ST"] = data["State"].map(state_abbreviations)

data = data[["State", "ST", "Starbucks Coffee Locations"]]

print(data.head())