# CROP YIELD DATA

## First I will combine all crop yield data into one DataFrame (yield_df) and import that table into Postgres to filter across all 7 data sets. Then I will pull those SQL queries back here to make visualizations.
- This can all be done in Python but I want to practice more SQL and get used to how SQL and Python connect to each other

In [None]:
import pandas as pd
from sqlalchemy import create_engine
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests as re
import glob
import os

## Import multiple CSVs and concatenate them into one DataFrame:
https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

In [None]:
# path = r'C:\Users\Jasmine\Documents\NSS\Projects\AgAidFromAbroad\data_yields' # use your path
# all_files = glob.glob(os.path.join(path, "*.csv")) # advisable to use os.path.join as this makes concatenation OS independent

# df_from_each_file = (pd.read_csv(f) for f in all_files)
# yield_df   = pd.concat(df_from_each_file, ignore_index=True)

In [None]:
# Check all files imported into one
# yield_df

In [None]:
# Save it to the same folder
# yield_df.to_csv(r'C:\Users\Jasmine\Documents\NSS\Projects\AgAidFromAbroad\data_yields\yield_all.csv', index=False)

I will analyze this master crop yield CSV in SQL and replicate the queries below.

## Which states request the most H-2A workers? How does this compare to states with the highest crop yields?
For each crop:
1. Get states with most workers
2. Get states with most yields
3. Change state names in 2 to match the two-letter abbreviations in 1
4. INNER JOIN these two DataFrames together because I only want states that have both H-2A workers and recorded yields

In [None]:
# Set up SQL connection
# Builds a SQLAlchemy engine for the Postgres database named below; every
# query cell that follows passes this engine to pandas.
# NOTE(review): user, password, host and port are hard-coded in the URL --
# consider reading them from the environment before sharing this notebook.
database_name = 'ag_aid'
connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"
engine = create_engine(connection_string)

### States with most citrus H-2A workers vs states with most citrus yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "citrus" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_cit1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%citrus%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
cit_wkr_states = pd.read_sql(query_cit1, con=engine).rename(
    columns={'employer_state': 'state'}
)
cit_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is CITRUS.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_cit2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'CITRUS'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
cit_yld_states = pd.read_sql(query_cit2, con=engine)

# States with most yields: Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
cit_yld_states['state'] = cit_yld_states['state'].map({
    'CALIFORNIA': 'CA',
    'FLORIDA': 'FL',
    'TEXAS': 'TX',
    'ARIZONA': 'AZ',
})
cit_yld_states

In [None]:
# Keep only states that appear in both tables. Both inputs carry a "sum"
# column, so the merge emits sum_x (workers) and sum_y (yield).
cit_merged = cit_wkr_states.merge(cit_yld_states, on='state', how='inner')
cit_merged

In [None]:
# Scatter of citrus workers (sum_x) against citrus yield (sum_y), one point per state.
plt.figure(figsize=(16, 8))
plt.scatter(cit_merged['sum_x'], cit_merged['sum_y']);

### States with most strawberry H-2A workers vs states with most strawberry yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "strawberr" (ILIKE, so case-insensitive). The unnamed
# SUM column is returned as "sum", which is what ORDER BY refers to.
query_straw1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%strawberr%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
straw_wkr_states = pd.read_sql(query_straw1, con=engine).rename(
    columns={'employer_state': 'state'}
)
straw_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is
# STRAWBERRIES.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_straw2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'STRAWBERRIES'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
straw_yld_states = pd.read_sql(query_straw2, con=engine)

# States with most yields: Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
straw_yld_states['state'] = straw_yld_states['state'].map({
    'CALIFORNIA': 'CA',
    'FLORIDA': 'FL',
    'OREGON': 'OR',
    'NORTH CAROLINA': 'NC',
    'WASHINGTON': 'WA',
    'MICHIGAN': 'MI',
    'NEW YORK': 'NY',
    'WISCONSIN': 'WI',
    'OHIO': 'OH',
    'PENNSYLVANIA': 'PA',
})
straw_yld_states

In [None]:
# Keep only states that appear in both tables. Both inputs carry a "sum"
# column, so the merge emits sum_x (workers) and sum_y (yield).
straw_merged = straw_wkr_states.merge(straw_yld_states, on='state', how='inner')
straw_merged

In [None]:
# Scatter of strawberry workers (sum_x) against strawberry yield (sum_y), one point per state.
plt.figure(figsize=(16, 8))
plt.scatter(straw_merged['sum_x'], straw_merged['sum_y']);

### States with most sheep H-2A workers vs states with most sheep yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "sheep" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_she1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%sheep%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
she_wkr_states = pd.read_sql(query_she1, con=engine).rename(
    columns={'employer_state': 'state'}
)
she_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is SHEEP.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_she2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'SHEEP'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
she_yld_states = pd.read_sql(query_she2, con=engine)

# States with most yields: Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
she_yld_states['state'] = she_yld_states['state'].map({
    'TEXAS': 'TX',
    'MONTANA': 'MT',
    'CALIFORNIA': 'CA',
    'WYOMING': 'WY',
    'IDAHO': 'ID',
    'IOWA': 'IA',
    'OREGON': 'OR',
    'COLORADO': 'CO',
    'SOUTH DAKOTA': 'SD',
    'UTAH': 'UT',
    'OHIO': 'OH',
    'OTHER STATES': 'OTHER',
    'MISSOURI': 'MO',
    'MINNESOTA': 'MN',
    'WISCONSIN': 'WI',
    'PENNSYLVANIA': 'PA',
    'NORTH DAKOTA': 'ND',
    'NEW YORK': 'NY',
    'ILLINOIS': 'IL',
    'MICHIGAN': 'MI',
    'VIRGINIA': 'VA',
    'ARIZONA': 'AZ',
    'WASHINGTON': 'WA',
    'NEBRASKA': 'NE',
    'NEW MEXICO': 'NM',
    'NEVADA': 'NV',
    'KENTUCKY': 'KY',
    'TENNESSEE': 'TN',
    'INDIANA': 'IN',
    'OKLAHOMA': 'OK',
    'KANSAS': 'KS',
    'WEST VIRGINIA': 'WV',
    'NORTH CAROLINA': 'NC',
})
she_yld_states

In [None]:
# Keep only states that appear in both tables. The merge emits sum_x
# (workers) and sum_y (yield); give those columns descriptive names.
she_merged = (
    she_wkr_states
    .merge(she_yld_states, on='state', how='inner')
    .rename(columns={'sum_x': 'Workers', 'sum_y': 'Yield'})
)
she_merged

In [None]:
# Scatter of sheep workers against sheep yield, one point per state.
plt.figure(figsize=(16, 8))
plt.scatter(she_merged['Workers'], she_merged['Yield']);

### States with most lettuce H-2A workers vs states with most lettuce yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "lettuce" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_let1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%lettuce%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
let_wkr_states = pd.read_sql(query_let1, con=engine).rename(
    columns={'employer_state': 'state'}
)
let_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is LETTUCE.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_let2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'LETTUCE'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
let_yld_states = pd.read_sql(query_let2, con=engine)

# Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
let_yld_states['state'] = let_yld_states['state'].map({
    'CALIFORNIA': 'CA',
    'ARIZONA': 'AZ',
})
let_yld_states

In [None]:
# Keep only states that appear in both tables. The merge emits sum_x
# (workers) and sum_y (yield); give those columns descriptive names.
let_merged = (
    let_wkr_states
    .merge(let_yld_states, on='state', how='inner')
    .rename(columns={'sum_x': 'Workers', 'sum_y': 'Yield'})
)
let_merged

In [None]:
# Grouped bar chart: one pair of bars (Workers, Yield) per state.
# NOTE(review): both series share one y-axis, yet the axis label names only
# the worker count -- confirm the label is what you want.
let_merged.plot(
    kind="bar",
    x="state",
    y=["Workers", "Yield"],
    figsize=(16, 10),
)
plt.title(
    'States with "Lettuce" Workers and Lettuce Yields, 2010-2020',
    fontsize=22,
)
plt.xlabel('')
plt.xticks(rotation=360, fontsize=16)
plt.ylabel('Number of H-2A workers', fontsize=16);

In [None]:
# Scatter of lettuce workers against lettuce yield, one point per state.
plt.figure(figsize=(16, 8))
plt.scatter(let_merged.Workers, let_merged.Yield)
# The title call previously ended in a dangling "label=" keyword with no
# value, which is a SyntaxError and stopped the whole cell from running.
plt.title('States by "Lettuce" Workers and Lettuce Yields, 2010-2020', fontsize=22)
plt.xlabel('Number of H-2A workers', fontsize=16)
plt.xticks(fontsize=16, rotation=360)
plt.ylabel('Lettuce yield (cwt)', fontsize=16);

### States with most potato H-2A workers vs states with most potato yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "potato" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_pot1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%potato%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
pot_wkr_states = pd.read_sql(query_pot1, con=engine).rename(
    columns={'employer_state': 'state'}
)
pot_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is POTATOES.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_pot2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'POTATOES'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
pot_yld_states = pd.read_sql(query_pot2, con=engine)

# States with most yields: Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
# Missouri maps to 'MO'; it was previously 'MI', which collided with Michigan
# and attached Missouri's yield to Michigan's workers in the join.
pot_yld_states['state'] = pot_yld_states['state'].map({
    'CALIFORNIA': 'CA',
    'WASHINGTON': 'WA',
    'TEXAS': 'TX',
    'OREGON': 'OR',
    'IDAHO': 'ID',
    'COLORADO': 'CO',
    'WISCONSIN': 'WI',
    'NEBRASKA': 'NE',
    'MINNESOTA': 'MN',
    'NORTH DAKOTA': 'ND',
    'FLORIDA': 'FL',
    'MICHIGAN': 'MI',
    'OTHER STATES': 'OTHER',
    'KANSAS': 'KS',
    'NEW JERSEY': 'NJ',
    'ILLINOIS': 'IL',
    'MAINE': 'ME',
    'MONTANA': 'MT',
    'MARYLAND': 'MD',
    'NEW YORK': 'NY',
    'ALASKA': 'AK',
    'MISSOURI': 'MO',
    'NORTH CAROLINA': 'NC',
    'VIRGINIA': 'VA',
    'PENNSYLVANIA': 'PA',
    'MASSACHUSETTS': 'MA',
    'OHIO': 'OH',
    'LOUISIANA': 'LA',
    'MISSISSIPPI': 'MS',
    'DELAWARE': 'DE',
    'RHODE ISLAND': 'RI',
    'NEVADA': 'NV',
    'ARIZONA': 'AZ',
    'ARKANSAS': 'AR',
    'ALABAMA': 'AL',
    'NEW MEXICO': 'NM',
})
pot_yld_states

In [None]:
# Keep only states that appear in both tables. Both inputs carry a "sum"
# column, so the merge emits sum_x (workers) and sum_y (yield).
pot_merged = pot_wkr_states.merge(pot_yld_states, on='state', how='inner')
pot_merged

In [None]:
# Scatter of potato workers (sum_x) against potato yield (sum_y), one point per state.
plt.figure(figsize=(16, 8))
plt.scatter(pot_merged['sum_x'], pot_merged['sum_y']);

### States with most tobacco H-2A workers vs states with most tobacco yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "tobacco" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_tob1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%tobacco%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
tob_wkr_states = pd.read_sql(query_tob1, con=engine).rename(
    columns={'employer_state': 'state'}
)
tob_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is TOBACCO.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_tob2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'TOBACCO'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
tob_yld_states = pd.read_sql(query_tob2, con=engine)

# Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
tob_yld_states['state'] = tob_yld_states['state'].map({
    'TENNESSEE': 'TN',
    'VIRGINIA': 'VA',
    'PENNSYLVANIA': 'PA',
    'KENTUCKY': 'KY',
    'NORTH CAROLINA': 'NC',
    'GEORGIA': 'GA',
    'SOUTH CAROLINA': 'SC',
    'OTHER STATES': 'OTHER',
    'OHIO': 'OH',
    'MASSACHUSETTS': 'MA',
    'CONNECTICUT': 'CT',
})
tob_yld_states

In [None]:
# Keep only states that appear in both tables. Both inputs carry a "sum"
# column, so the merge emits sum_x (workers) and sum_y (yield).
tob_merged = tob_wkr_states.merge(tob_yld_states, on='state', how='inner')
tob_merged

### States with most tomato H-2A workers vs states with most tomato yields

In [None]:
# States with most workers
# Sums workers_req per employer_state over rows of table "main" whose
# job_title contains "tomato" (ILIKE, so case-insensitive). The unnamed SUM
# column is returned as "sum", which is what ORDER BY refers to.
query_tom1 = '''
SELECT DISTINCT employer_state,
   SUM((workers_req::float)) OVER(PARTITION BY employer_state)
FROM main
WHERE job_title ILIKE '%%tomato%%'
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
# The key column is renamed to "state" so it lines up with the yield table.
tom_wkr_states = pd.read_sql(query_tom1, con=engine).rename(
    columns={'employer_state': 'state'}
)
tom_wkr_states

In [None]:
# States with most yields
# Sums "value" per state over rows of table "cy" whose commodity is TOMATOES.
# NOTE(review): GROUP BY state, value collapses rows sharing the same
# (state, value) pair before the window SUM runs, so a value that repeats
# within a state is counted only once -- confirm that is intended.
query_tom2 = '''
SELECT DISTINCT state,
    SUM(value) OVER(PARTITION BY state)
FROM cy
WHERE commodity LIKE 'TOMATOES'
GROUP BY state, value
ORDER BY sum DESC;
'''
# read_sql executes the statement itself, so the query runs exactly once and
# no separate engine.execute() call (removed in SQLAlchemy 2.0) is needed.
tom_yld_states = pd.read_sql(query_tom2, con=engine)

# Change states to two-letter abbreviations.
# Any state name missing from this mapping becomes NaN.
tom_yld_states['state'] = tom_yld_states['state'].map({
    'CALIFORNIA': 'CA',
    'MICHIGAN': 'MI',
    'OHIO': 'OH',
    'TENNESSEE': 'TN',
    'INDIANA': 'IN',
    'NORTH CAROLINA': 'NC',
    'SOUTH CAROLINA': 'SC',
    'VIRGINIA': 'VA',
    'FLORIDA': 'FL',
    'NEW JERSEY': 'NJ',
    'ALABAMA': 'AL',
    'ARKANSAS': 'AR',
    'NEW YORK': 'NY',
    'PENNSYLVANIA': 'PA',
    'GEORGIA': 'GA',
    'OTHER STATES': 'OTHER',
    'TEXAS': 'TX',
})
tom_yld_states

In [None]:
# Keep only states that appear in both tables. Both inputs carry a "sum"
# column, so the merge emits sum_x (workers) and sum_y (yield).
tom_merged = tom_wkr_states.merge(tom_yld_states, on='state', how='inner')
tom_merged