# Converting TLC SQL Queries to Python DataFrames

### Capstone Question: Is there a correlation between the number and job titles of H-2A workers and crop yields, based on job titles including those crops?

In [None]:
from sqlalchemy import create_engine
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests as re

First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to a database, you can use the following connection string.

In [None]:
import os

# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to be edited into the notebook; the
# defaults are the local development values.
database_name = os.environ.get('AG_AID_DB_NAME', 'ag_aid')
db_user = os.environ.get('AG_AID_DB_USER', 'postgres')
db_password = os.environ.get('AG_AID_DB_PASSWORD', 'postgres')
db_host = os.environ.get('AG_AID_DB_HOST', 'localhost')
db_port = os.environ.get('AG_AID_DB_PORT', '5432')

# <dialect>://<username>:<password>@<hostname>:<port>/<database>
connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database_name}"

Now, we need to create an engine and use it to connect.

In [None]:
# Build the SQLAlchemy engine from the connection string; it is passed as
# `con` to every pd.read_sql call in this notebook.
engine = create_engine(connection_string)

Now, we can write each query as a SQL string and pass it, together with the engine, to `pd.read_sql()` to get a DataFrame back.

## Most common jobs for H-2A workers

In [None]:
# Count H-2A filings per job title. The CTE counts rows per raw job_title
# while reporting the trimmed, upper-cased title; the outer query then sums
# those counts per normalized title and orders by that sum.
# NOTE(review): the outer "GROUP BY title, ct" merges raw variants that share
# both the normalized title and the same count before the window sum runs, so
# such titles may be under-counted — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_jobs = '''
WITH cte AS (
SELECT TRIM(UPPER(job_title)) AS title,
    COUNT(TRIM(UPPER(job_title))) AS ct
FROM main
GROUP BY job_title
ORDER BY ct DESC)

SELECT DISTINCT title,
    ct,
    SUM(ct) OVER (PARTITION BY title) AS sum
FROM cte
GROUP BY title, ct
ORDER BY sum DESC;
'''

In [None]:
# Run the job-title query and load the result into a DataFrame.
jobs = pd.read_sql(query_jobs, con=engine)

# Number of rows returned
jobs.shape[0]

## Total workers - Citrus (grapefruits, lemons, oranges)

In [None]:
# H-2A requests whose job title contains "citrus", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_cit = '''
WITH cit AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%citrus%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM cit
'''

In [None]:
#result.fetchone() gets one record as tuple
#result.fetchall() gets all records as tuples
#people = pd.DataFrame(data, 'user')

In [None]:
# Load the citrus rows, coerce workers_req to a numeric dtype, and expand the
# two-digit year (10-20) to four digits; any other value maps to NaN.
cit = pd.read_sql(query_cit, con=engine)
cit["workers_req"] = pd.to_numeric(cit["workers_req"])
cit["year"] = cit["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
cit.shape[0]

In [None]:
# Yearly citrus totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `cit`
# (which triggers pandas' SettingWithCopyWarning).
annual_cit = cit[['year', 'wr_total', 'wr_total_year']].copy()
annual_cit['crop'] = 'Citrus'

# Drop dupe rows to get unique years
annual_cit = annual_cit.drop_duplicates()
annual_cit

In [None]:
# Average of wr_total_year across the rows of annual_cit.
annual_cit["wr_total_year"].mean()

In [None]:
# Line chart of yearly citrus worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_cit, x="year", y="wr_total_year", marker='o', color='orange')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(5241, color='silver')
plt.title('H-2A Workers in Citrus Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Strawberries

In [None]:
# H-2A requests whose job title contains "strawberr", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_straw = '''
WITH str AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%strawberr%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM str
'''

In [None]:
# Load the strawberry rows, coerce workers_req to a numeric dtype, and expand
# the two-digit year (10-20) to four digits; any other value maps to NaN.
straw = pd.read_sql(query_straw, con=engine)
straw["workers_req"] = pd.to_numeric(straw["workers_req"])
straw["year"] = straw["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
straw.shape[0]

In [None]:
# Yearly strawberry totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `straw`
# (which triggers pandas' SettingWithCopyWarning).
annual_straw = straw[['year', 'wr_total', 'wr_total_year']].copy()
annual_straw['crop'] = 'Strawberry'

# Drop dupe rows to get unique years
annual_straw = annual_straw.drop_duplicates()
annual_straw

In [None]:
# Average of wr_total_year across the rows of annual_straw.
annual_straw["wr_total_year"].mean()

In [None]:
# Line chart of yearly strawberry worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_straw, x="year", y="wr_total_year", marker='o', color='r')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(2692, color='silver')
plt.title('H-2A Workers in Strawberry Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Sheep

In [None]:
# H-2A requests whose job title contains "sheep", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_she = '''
WITH she AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%sheep%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM she
'''

In [None]:
# Load the sheep rows, coerce workers_req to a numeric dtype, and expand the
# two-digit year (10-20) to four digits; any other value maps to NaN.
she = pd.read_sql(query_she, con=engine)
she["workers_req"] = pd.to_numeric(she["workers_req"])
she["year"] = she["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
she.shape[0]

In [None]:
# Yearly sheep totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `she`
# (which triggers pandas' SettingWithCopyWarning).
annual_she = she[['year', 'wr_total', 'wr_total_year']].copy()
annual_she['crop'] = 'Sheep'

# Drop dupe rows to get unique years
annual_she = annual_she.drop_duplicates()
annual_she

In [None]:
# Average of wr_total_year across the rows of annual_she.
annual_she["wr_total_year"].mean()

In [None]:
# Line chart of yearly sheep worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_she, x="year", y="wr_total_year", marker='o', color='slategrey')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(1505, color='silver')
plt.title('H-2A Workers in Sheep Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Lettuce

In [None]:
# H-2A requests whose job title contains "lettuce", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_let = '''
WITH let AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%lettuce%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM let
'''

In [None]:
# Load the lettuce rows, coerce workers_req to a numeric dtype, and expand
# the two-digit year (10-20) to four digits; any other value maps to NaN.
let = pd.read_sql(query_let, con=engine)
let["workers_req"] = pd.to_numeric(let["workers_req"])
let["year"] = let["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
let.shape[0]

In [None]:
# Yearly lettuce totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `let`
# (which triggers pandas' SettingWithCopyWarning).
annual_let = let[['year', 'wr_total', 'wr_total_year']].copy()
annual_let['crop'] = 'Lettuce'

# Drop dupe rows to get unique years
annual_let = annual_let.drop_duplicates()
annual_let

In [None]:
# Average of wr_total_year across the rows of annual_let.
annual_let["wr_total_year"].mean()

In [None]:
# Line chart of yearly lettuce worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_let, x="year", y="wr_total_year", marker='o', color='g')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(474, color='silver')
plt.title('H-2A Workers in Lettuce Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Potatoes (potato, sweet potato)

In [None]:
# H-2A requests whose job title contains "potato" (which also matches
# "sweet potato"), limited to two-digit begin_date years >= 10. wr_total sums
# workers_req over every row the CTE returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_pot = '''
WITH pot AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%potato%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM pot
'''

In [None]:
# Load the potato rows, coerce workers_req to a numeric dtype, and expand the
# two-digit year (10-20) to four digits; any other value maps to NaN.
pot = pd.read_sql(query_pot, con=engine)
pot["workers_req"] = pd.to_numeric(pot["workers_req"])
pot["year"] = pot["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
pot.shape[0]

In [None]:
# Yearly potato totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `pot`
# (which triggers pandas' SettingWithCopyWarning).
annual_pot = pot[['year', 'wr_total', 'wr_total_year']].copy()
annual_pot['crop'] = 'Potato'

# Drop dupe rows to get unique years
annual_pot = annual_pot.drop_duplicates()
annual_pot

In [None]:
# Average of wr_total_year across the rows of annual_pot.
annual_pot["wr_total_year"].mean()

In [None]:
# Line chart of yearly potato worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_pot, x="year", y="wr_total_year", marker='o', color='brown')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(604, color='silver')
plt.title('H-2A Workers in Potato Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Tobacco

In [None]:
# H-2A requests whose job title contains "tobacco", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_tob = '''
WITH tob AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%tobacco%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM tob
'''

In [None]:
# Load the tobacco rows, coerce workers_req to a numeric dtype, and expand
# the two-digit year (10-20) to four digits; any other value maps to NaN.
tob = pd.read_sql(query_tob, con=engine)
tob["workers_req"] = pd.to_numeric(tob["workers_req"])
tob["year"] = tob["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
tob.shape[0]

In [None]:
# Yearly tobacco totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `tob`
# (which triggers pandas' SettingWithCopyWarning).
annual_tob = tob[['year', 'wr_total', 'wr_total_year']].copy()
annual_tob['crop'] = 'Tobacco'

# Drop dupe rows to get unique years
annual_tob = annual_tob.drop_duplicates()
annual_tob

In [None]:
# Average of wr_total_year across the rows of annual_tob.
annual_tob["wr_total_year"].mean()

In [None]:
# Line chart of yearly tobacco worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_tob, x="year", y="wr_total_year", marker='o', color='peru')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(360, color='silver')
plt.title('H-2A Workers in Tobacco Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Total workers - Tomatoes

In [None]:
# H-2A requests whose job title contains "tomato", limited to two-digit
# begin_date years >= 10. wr_total sums workers_req over every row the CTE
# returns; wr_total_year sums it per year.
# NOTE(review): the CTE's GROUP BY collapses rows that are identical on every
# selected column before the sums run, so repeat filings count once — confirm.
# The query is run by pd.read_sql; no separate engine.execute() call is
# needed (it would run the query twice and leave an unread result open).
query_tom = '''
WITH tom AS (
SELECT
    (RIGHT(begin_date,2)::int) AS year,
    job_title,
    workers_req,
    SUM(workers_req::FLOAT) OVER() as wr_total,
    employer_state,
    worksite_state
FROM main
WHERE job_title ILIKE '%%tomato%%'
    AND (RIGHT(begin_date,2)::int) >= 10
GROUP BY year, job_title, employer_state, workers_req, worksite_state
ORDER BY year)

SELECT year,
    job_title,
    workers_req,
    wr_total,
    SUM(workers_req::FLOAT) OVER(PARTITION BY year) AS wr_total_year,
    employer_state,
    worksite_state
FROM tom
'''

In [None]:
# Load the tomato rows, coerce workers_req to a numeric dtype, and expand the
# two-digit year (10-20) to four digits; any other value maps to NaN.
tom = pd.read_sql(query_tom, con=engine)
tom["workers_req"] = pd.to_numeric(tom["workers_req"])
tom["year"] = tom["year"].map({yy: 2000 + yy for yy in range(10, 21)})

# Number of rows
tom.shape[0]

In [None]:
# Yearly tomato totals. Take an explicit copy so that adding the 'crop'
# column writes to an independent frame rather than to a slice of `tom`
# (which triggers pandas' SettingWithCopyWarning).
annual_tom = tom[['year', 'wr_total', 'wr_total_year']].copy()
annual_tom['crop'] = 'Tomato'

# Drop dupe rows to get unique years
annual_tom = annual_tom.drop_duplicates()
annual_tom

In [None]:
# Average of wr_total_year across the rows of annual_tom.
annual_tom["wr_total_year"].mean()

In [None]:
# Line chart of yearly tomato worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=annual_tom, x="year", y="wr_total_year", marker='o', color='crimson')
# Horizontal reference line; presumably the rounded mean of wr_total_year — confirm.
ax.axhline(894, color='silver')
plt.title('H-2A Workers in Tomato Jobs', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Concat the 5 crops that have yield data (citrus, strawberry, lettuce, potato, tomato) into one df: crops

In [None]:
# Stack the citrus, strawberry, lettuce, potato and tomato yearly frames into
# one frame with a fresh 0..n-1 index, then report its row count.
crops = pd.concat(
    [annual_cit, annual_straw, annual_let, annual_pot, annual_tom],
    ignore_index=True,
)
crops.shape[0]

In [None]:
# One line per crop: yearly worker totals from the combined `crops` frame.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=crops, x="year", y="wr_total_year", hue='crop', linewidth=2.5)
plt.title('H-2A Workers across the decade', fontsize=22)
plt.xticks(range(2010, 2021))
plt.xlabel('Year')
plt.ylabel('Number of workers', fontsize=16);

## Normalizing the above graph in year over year percentage changes

In [None]:
# Year-over-year % change in citrus workers. pct_change() compares each row
# with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_cit keeps its row order.
annual_cit['pct_change'] = annual_cit.sort_values('year')['wr_total_year'].pct_change() * 100
annual_cit

In [None]:
# Year-over-year % change in strawberry workers. pct_change() compares each
# row with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_straw keeps its row order.
annual_straw['pct_change'] = annual_straw.sort_values('year')['wr_total_year'].pct_change() * 100
annual_straw

In [None]:
# Year-over-year % change in lettuce workers. pct_change() compares each row
# with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_let keeps its row order.
annual_let['pct_change'] = annual_let.sort_values('year')['wr_total_year'].pct_change() * 100
annual_let

In [None]:
# Year-over-year % change in potato workers. pct_change() compares each row
# with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_pot keeps its row order.
annual_pot['pct_change'] = annual_pot.sort_values('year')['wr_total_year'].pct_change() * 100
annual_pot

In [None]:
# Year-over-year % change in tobacco workers. pct_change() compares each row
# with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_tob keeps its row order.
annual_tob['pct_change'] = annual_tob.sort_values('year')['wr_total_year'].pct_change() * 100
annual_tob

In [None]:
# Year-over-year % change in tomato workers. pct_change() compares each row
# with the row before it, so it is computed over a year-sorted view; the
# assignment aligns on the index, so annual_tom keeps its row order.
annual_tom['pct_change'] = annual_tom.sort_values('year')['wr_total_year'].pct_change() * 100
annual_tom

In [None]:
# Stack the citrus, strawberry, lettuce, potato and tomato yearly frames
# (now carrying pct_change) into one frame with a fresh 0..n-1 index, then
# report its row count.
crops_pct = pd.concat(
    [annual_cit, annual_straw, annual_let, annual_pot, annual_tom],
    ignore_index=True,
)
crops_pct.shape[0]

In [None]:
# One line per crop: year-over-year % change in worker totals.
plt.figure(figsize=(16, 8))
# Style and font scale go in ONE call: a second sns.set(font_scale=...) call
# resets the style to seaborn's default and discards "whitegrid".
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.lineplot(data=crops_pct, x="year", y="pct_change", hue='crop', linewidth=2.5)
plt.title('Number of H-2A Workers - Year over Year Change (%)', fontsize=22)
# Ticks start at 2011 (2010 is not given a tick on this chart).
plt.xticks(range(2011, 2021))
plt.xlabel('Year')
plt.ylabel('% change in number of workers', fontsize=16);

# Combine worker plots with CROP YIELDS in a double y-axis graph

## Citrus (grapefruit, lemon, orange) workers vs. yield

In [None]:
# 1: Load the three citrus yield files (grapefruit, lemon, orange), one
# DataFrame per fruit.
grapefruit_df = pd.read_csv('../data_yields/yield_grapefruit.csv')
lemon_df = pd.read_csv('../data_yields/yield_lemon.csv')
orange_df = pd.read_csv('../data_yields/yield_orange.csv')

In [None]:
# Stack the three citrus frames row-wise (original row indexes are kept) and
# preview the first rows.
citrus_df = pd.concat([grapefruit_df, lemon_df, orange_df])
citrus_df.head()

In [None]:
# 2: Yield totals per year — sum every column within each Year group, then
# turn Year back into an ordinary column.
citrus_df_sub = citrus_df.groupby('Year').sum().reset_index()
citrus_df_sub

In [None]:
# Keep just the year and per-year worker total from annual_cit
annual_cit_sub = annual_cit.loc[:, ["year", "wr_total_year"]]
annual_cit_sub

In [None]:
# 3: Plot double y-axis graph

# Left axis: yearly citrus worker totals.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(annual_cit_sub["year"], annual_cit_sub["wr_total_year"],
        color="peru", marker="o", label='Workers', linewidth=2.5)
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of workers", fontsize=16)
ax.set_title('Citrus Workers vs. Yield', fontsize=22)
ax.legend(loc='lower right')

# Right axis: a twin axis on the same plot, sharing x, for the per-year
# yield sums.
ax2 = ax.twinx()
ax2.plot(citrus_df_sub["Year"], citrus_df_sub["Value"],
         color="dimgrey", marker="o", label='Yield', linewidth=2.5, linestyle=':')
ax2.legend(loc=(0.87, 0.08))
ax2.set_ylabel("Yield (boxes per acre)", color="black", fontsize=16);

Hurricane Irma (2017) devastated Florida's orange crop.
- https://www.ers.usda.gov/amber-waves/2018/januaryfebruary/hurricane-irma-hits-florida-s-agricultural-sector/

## Strawberry workers vs. yield

In [None]:
# Load the strawberry yield file and preview the first rows.
strawberry_df = pd.read_csv('../data_yields/yield_strawberry.csv')
strawberry_df.head()

In [None]:
# Yield totals per year — sum every column within each Year group, then turn
# Year back into an ordinary column.
strawberry_df_sub = strawberry_df.groupby('Year').sum().reset_index()
strawberry_df_sub

In [None]:
# Keep just the year and per-year worker total from annual_straw
annual_straw_sub = annual_straw.loc[:, ["year", "wr_total_year"]]
annual_straw_sub

In [None]:
# Left axis: yearly strawberry worker totals.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(annual_straw_sub["year"], annual_straw_sub["wr_total_year"],
        color="darkred", marker="o", label='Workers', linewidth=2.5)
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of workers", fontsize=16)
ax.set_title('Strawberry Workers vs. Yield', fontsize=22)
ax.legend(loc='upper left')

# Right axis: a twin axis on the same plot, sharing x, for the per-year
# yield sums.
ax2 = ax.twinx()
ax2.plot(strawberry_df_sub["Year"], strawberry_df_sub["Value"],
         color="dimgrey", marker="o", label='Yield', linewidth=2.5, linestyle=':')
ax2.set_ylabel("Yield (cwt per acre)", color="black", fontsize=16)
ax2.legend(loc=(0.01, 0.85));

## Lettuce workers vs. yield

In [None]:
# Load the lettuce yield file and preview the first rows.
lettuce_df = pd.read_csv('../data_yields/yield_lettuce.csv')
lettuce_df.head()

In [None]:
# Yield totals per year — sum every column within each Year group, then turn
# Year back into an ordinary column.
lettuce_df_sub = lettuce_df.groupby('Year').sum().reset_index()
lettuce_df_sub

In [None]:
# Keep just the year and per-year worker total from annual_let
annual_let_sub = annual_let.loc[:, ["year", "wr_total_year"]]
annual_let_sub

In [None]:
# Left axis: yearly lettuce worker totals.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(annual_let_sub["year"], annual_let_sub["wr_total_year"],
        color="green", marker="o", label='Workers', linewidth=2.5)
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of workers", fontsize=16)
ax.set_title('Lettuce Workers vs. Yield', fontsize=22)
ax.legend(loc='upper right')

# Right axis: a twin axis on the same plot, sharing x, for the per-year
# yield sums.
ax2 = ax.twinx()
ax2.plot(lettuce_df_sub["Year"], lettuce_df_sub["Value"],
         color="dimgrey", marker="o", label='Yield', linewidth=2.5, linestyle=':')
ax2.set_ylabel("Yield (cwt per acre)", color="black", fontsize=16)
ax2.legend(loc=(0.86, 0.85));

E. coli outbreak in 2018. CDC urged people not to eat romaine lettuce.
- https://www.fda.gov/food/outbreaks-foodborne-illness/outbreak-investigation-e-coli-romaine-november-2018
- https://downloads.usda.library.cornell.edu/usda-esmis/files/02870v86p/gm80j322z/5138jn50j/vegean19.pdf
- Page 103, lettuce 2nd highest decrease in production: https://www.cdfa.ca.gov/statistics/PDFs/2018-2019AgReportnass.pdf

## Potato (potato, sweet potato) workers vs. yield

In [None]:
# Load the potato and sweet potato yield files, one DataFrame each.
potato_df = pd.read_csv('../data_yields/yield_potato.csv')
sweetpotato_df = pd.read_csv('../data_yields/yield_sweetpotato.csv')

In [None]:
# Stack the potato and sweet potato frames row-wise (original row indexes are
# kept) and preview the first rows.
potatoes_df = pd.concat([potato_df, sweetpotato_df])
potatoes_df.head()

In [None]:
# Yield totals per year — sum every column within each Year group, then turn
# Year back into an ordinary column.
potatoes_df_sub = potatoes_df.groupby('Year').sum().reset_index()
potatoes_df_sub

In [None]:
# Keep just the year and per-year worker total from annual_pot
annual_pot_sub = annual_pot.loc[:, ["year", "wr_total_year"]]
annual_pot_sub

In [None]:
# Left axis: yearly potato worker totals.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(annual_pot_sub["year"], annual_pot_sub["wr_total_year"],
        color="sienna", marker="o", label='Workers', linewidth=2.5)
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of workers", fontsize=16)
ax.set_title('Potato Workers vs. Yield', fontsize=22)
ax.legend(loc='upper right')

# Right axis: a twin axis on the same plot, sharing x, for the per-year
# yield sums.
ax2 = ax.twinx()
ax2.plot(potatoes_df_sub["Year"], potatoes_df_sub["Value"],
         color="dimgrey", marker="o", label='Yield', linewidth=2.5, linestyle=':')
ax2.set_ylabel("Yield (cwt per acre)", color="black", fontsize=16)
ax2.legend(loc=(0.86, 0.85));

## Tomato workers vs. yield

In [None]:
# Load the tomato yield file and preview the first rows.
tomato_df = pd.read_csv('../data_yields/yield_tomato.csv')
tomato_df.head()

In [None]:
# Yield totals per year — sum every column within each Year group, then turn
# Year back into an ordinary column.
tomato_df_sub = tomato_df.groupby('Year').sum().reset_index()
tomato_df_sub

In [None]:
# Keep just the year and per-year worker total from annual_tom
annual_tom_sub = annual_tom.loc[:, ["year", "wr_total_year"]]
annual_tom_sub

In [None]:
# Left axis: yearly tomato worker totals.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(annual_tom_sub["year"], annual_tom_sub["wr_total_year"],
        color="red", marker="o", label='Workers', linewidth=2.5)
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of workers", fontsize=16)
ax.set_title('Tomato Workers vs. Yield', fontsize=22)
ax.legend(loc=(0.01, 0.88))

# Right axis: a twin axis on the same plot, sharing x, for the per-year
# yield sums.
ax2 = ax.twinx()
ax2.plot(tomato_df_sub["Year"], tomato_df_sub["Value"],
         color="dimgrey", marker="o", label='Yield', linewidth=2.5, linestyle=':')
ax2.set_ylabel("Yield (cwt per acre)", color="black", fontsize=16)
ax2.legend(loc=(0.01, 0.82));

2019 hail damage, high temperatures, disease.
- https://www.agalert.com/story/?id=13378
- https://www.nass.usda.gov/Statistics_by_State/California/Publications/Specialty_and_Other_Releases/Tomatoes/2019/201908ptom.pdf

## How does the number of TLC approvals compare to the number of H-2A visas issued?

In [None]:
# Load the annual TLC approvals / H-2A issuance file and display it. Later
# cells read its `year`, `tlc_approvals` and `H2As_issued` columns.
ann_app = pd.read_csv('../data_TLC/annual_approvals_tlc_H2A.csv')
ann_app

In [None]:
# Left axis: TLC approvals per year.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(ann_app["year"], ann_app["tlc_approvals"],
        color="indianred", marker="o", label='TLC approvals')
ax.grid(False)
ax.set_xlabel("Year", fontsize=16)
ax.set_xticks(range(2010, 2021))
ax.set_ylabel("Number of TLC approvals", fontsize=16)
ax.set_title('TLC Approvals vs H-2As Issued', fontsize=22)
ax.legend(loc='best')

# Right axis: a twin axis on the same plot, sharing x, for H-2A visas issued.
ax2 = ax.twinx()
ax2.plot(ann_app["year"], ann_app["H2As_issued"],
         color="indigo", marker="o", label='H-2As issued')
ax2.legend(loc=(0.01, 0.85))
ax2.set_ylabel("Number of H-2A visas issued", color="black", fontsize=16);

In [None]:
# Scatter of TLC approvals against H-2A visas issued, each point labelled
# with its year.
# https://towardsdatascience.com/how-to-add-text-labels-to-scatterplot-in-matplotlib-seaborn-ec5df6afed7a
plt.figure(figsize=(16, 8))
sns.scatterplot(data=ann_app, x="tlc_approvals", y="H2As_issued")
label_style = dict(color='black', size=15)
for approvals, issued, year in zip(ann_app.tlc_approvals, ann_app.H2As_issued, ann_app.year):
    # Offset the label by 100 on both axes so it does not sit on the marker.
    plt.text(x=approvals + 100, y=issued + 100, s=year, fontdict=label_style)
plt.title('TLC Approvals vs H-2As Issued', fontsize=22)
plt.xlabel('Number of TLC approvals', fontsize=16)
plt.ylabel('Number of H-2A visas issued', fontsize=16);

## What are the most common H-2A jobs?

In [None]:
# Load the cleaned job-title counts, keep the first 10 rows of the file, and
# order those rows by total_wkrs ascending.
# NOTE(review): [:10] takes rows in stored order — this is the "top 10" only
# if the CSV is already sorted by total_wkrs descending; confirm.
com_jobs = pd.read_csv('../data_TLC/mostcommonjobs_clean.csv')
cj_sorted = com_jobs[:10].sort_values(by='total_wkrs', ascending=True)

In [None]:
# Horizontal bar chart of the ten selected job titles against total_wkrs.
# NOTE(review): the x-axis label says "jobs" while the plotted column is
# total_wkrs — confirm which wording is intended.
plt.figure(figsize=(16, 10))
plt.barh(cj_sorted['job_title_clean'], cj_sorted['total_wkrs'])
plt.xlabel('Number of jobs', fontsize=18)
plt.ylabel('Job title', fontsize=18)
plt.xticks(rotation=0, fontsize=16)
plt.yticks(fontsize=16)
plt.title('Top 10 Most Common Job Titles, 2010-2020', fontsize=30);

## States requesting the most H-2A workers

In [None]:
# Five employer states with the largest total of requested workers. The
# window SUM has no alias, so it comes back under the column name `sum`,
# which both the ORDER BY and the plotting cell rely on.
q = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
ORDER BY sum DESC
LIMIT 5;
'''
# pd.read_sql runs the query itself; a separate engine.execute(q) would run
# it a second time and leave an unread result open.
top5states = pd.read_sql(q, con=engine)
#top5states.to_csv('../data_TLC/stateswithmostH2aworkers.csv')

In [None]:
# Bar chart: requested-worker totals for the five states in top5states.
plt.figure(figsize=(16, 10))
state_labels = top5states['employer_state']
worker_totals = top5states['sum']
plt.bar(state_labels, worker_totals)
plt.ylabel('Number of H-2A workers requested', fontsize=16)
#plt.xticks(rotation = 50, fontsize=16)
plt.title('States Requesting the Most H-2A Workers', fontsize=22);