# Scraping the site https://www.seasonalfoodguide.org/

The finished CSV will contain, for each row:
1. The state
2. A fruit that grows in that state
3. The months of the growing season for that fruit in that state

What my code will have to do:
1. Selenium clicks a state (start with a single state, without a loop)
2. A BeautifulSoup function collects all the href links into a list
3. A BeautifulSoup function opens each linked page in a loop and scrapes out the desired text, putting it in another list, which it returns
4. Python writes that returned list to a file
5. Selenium opens a new state (wrap the steps above in a loop once they work)

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import csv
from random import randint

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from selenium.webdriver.chrome.options import Options

In [4]:
#get list of states

def get_list_states(home_url):
    """Return the text of every <option> in the first <select> of the home page.

    Args:
        home_url: HTML source of the home page. Despite the parameter name,
            the call below passes ``driver.page_source`` rather than a URL.

    Returns:
        A list with one string per <option>, in document order.

    Raises:
        ValueError: if the HTML contains no <select> element.
    """
    #parse the HTML that was passed in instead of relying on a module-level soup
    home_soup = BeautifulSoup(home_url, 'html.parser')

    #the first <select> is the first dropdown menu on the page
    dropdown = home_soup.find('select')
    if dropdown is None:
        raise ValueError('no <select> element found in the home page HTML')

    return [option.text for option in dropdown.find_all('option')]


chrome_options = Options()
chrome_options.add_argument("--headless=new")
driver = webdriver.Chrome(options=chrome_options)

try:
    #open the home page
    driver.get('https://www.seasonalfoodguide.org/')
    page = driver.page_source
finally:
    #always shut the browser down, even if loading the page fails
    driver.quit()

#kept as a module-level name in case other cells read it
soup = BeautifulSoup(page, 'html.parser')

states_list = get_list_states(page)

In [5]:
#get rid of the first two dropdown entries
#(presumably placeholder options rather than state names -- verify against the site)
#NOTE: re-running this cell slices the already-trimmed list and drops two more entries

states_list = states_list[2:]

In [6]:
#get the links for each fruit of a state

def get_state_links(state):
    """Collect the link found in each card on one state's page.

    Args:
        state: state name, appended verbatim to
            'https://www.seasonalfoodguide.org/state/'.
            NOTE(review): the names come from the dropdown text; confirm the
            site accepts them unchanged (capitalisation, multi-word names).

    Returns:
        A list of href values, one per card that contains a link, in page
        order. Cards without an ``<a href=...>`` are skipped.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get('https://www.seasonalfoodguide.org/state/' + state)
        page = driver.page_source
    finally:
        #close the browser even when the page fails to load
        driver.quit()

    soup = BeautifulSoup(page, 'html.parser')

    link_list = []
    for card in soup.find_all('div', class_='card'):
        #a card with no linked anchor has nothing to follow, so skip it
        anchor = card.find('a', href=True)
        if anchor is not None:
            link_list.append(anchor['href'])

    return link_list
    
# test
# print(get_state_links('alabama'))

In [7]:
#get seasonality of the fruit

def get_seasonality(partial_link):
    """Scrape the state, fruit name and seasonal months from one fruit page.

    Args:
        partial_link: path appended to 'https://www.seasonalfoodguide.org',
            e.g. '/veg/apples/alabama'.

    Returns:
        The list ``[state_name, fruit_name, seasonal_months]``, each item
        being the text of the matching element.

    Raises:
        AttributeError: if one of the expected elements is missing from the
            page source.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get('https://www.seasonalfoodguide.org' + partial_link)
        page = driver.page_source
    finally:
        #the browser is no longer needed once the source is captured, and
        #must be closed even if loading fails
        driver.quit()

    soup = BeautifulSoup(page, 'html.parser')

    #second half of the card
    column2 = soup.find('div', class_='m7')

    fruit_name = soup.find('h3', class_='card_title').text  #on first half
    state_name = column2.find('h3', class_='card_title').text  #on second half
    seasonal_months = column2.find('p', class_='card-content').text  #on second half

    #return list of state, fruit, and seasonal months
    return [state_name, fruit_name, seasonal_months]

#test
#print(get_seasonality('/veg/apples/alabama'))

In [9]:
#calling everything into a csv

def write_csv(states_list):
    """Scrape every state in *states_list* and write the rows to 'fruit_stats.csv'.

    The file is created (or overwritten) in the current working directory with
    the header row ``State, Fruit, Seasonality`` followed by one row per link
    returned by ``get_state_links`` for each state.

    Args:
        states_list: iterable of state names passed one by one to
            ``get_state_links``.

    Returns:
        None.
    """
    filename = 'fruit_stats.csv'

    #newline='' is what the csv module expects for its own line endings;
    #the with-block closes the file even if a scrape raises part-way through
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        c = csv.writer(f)

        #column headings row
        c.writerow(['State', 'Fruit', 'Seasonality'])

        for state in states_list:
            for link in get_state_links(state):
                c.writerow(get_seasonality(link))

#run the full scrape; this launches a browser for every state and every link,
#and writes 'fruit_stats.csv' in the current working directory
write_csv(states_list)