***
#### Springboard.com: Data Science Career Track: Capstone 1: Machine Learning Exercise
# Predicting Sugarcane Production in the United States
***
### Import the Required Python Packages

In [1]:
import pandas as pd
import numpy as np
import plotly as py
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Put plotly into offline notebook mode so its figures can be rendered inside this notebook.
py.offline.init_notebook_mode(connected=True)

***
### Import the Dataset(s)

In [2]:
# Load the sugarcane table produced by the data-wrangling step.
# The first two CSV rows form a two-level column header and the first column is the row index.
df_sugarcane = pd.read_csv(
    "../03 Data Wrangling/df_sugarcane.csv",
    index_col=0,
    header=[0, 1],
)
df_sugarcane.tail()

State,FL,FL,FL,HI,HI,HI,LA,LA,LA,TX,TX,TX
Data Item,PRIndex,Value,Weight,PRIndex,Value,Weight,PRIndex,Value,Weight,PRIndex,Value,Weight
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2013,35.1,481572000.0,13720000.0,57.5,77740000.0,1352000.0,25.9,323880000.0,12505000.0,18.8,27185000.0,1446000.0
2014,36.8,553950000.0,15053000.0,43.1,54349000.0,1261000.0,33.6,382603000.0,11387000.0,9.41,11236000.0,1194000.0
2015,35.6,602174000.0,16915000.0,40.7,46357000.0,1139000.0,24.7,281481000.0,11396000.0,21.1,23316000.0,1105000.0
2016,38.7,623844000.0,16120000.0,40.7,54375000.0,1336000.0,24.6,283392000.0,11520000.0,20.5,28598000.0,1395000.0
2017,,,16237000.0,,,,,,13455000.0,,,1490000.0


In [3]:
# Load the regional weather table produced by the data-storytelling step.
# The first two CSV rows form a two-level column header and the first column is the row index.
df_weather = pd.read_csv(
    "../04 Data Storytelling/df_weather.csv",
    index_col=0,
    header=[0, 1],
)
df_weather.tail()

Unnamed: 0_level_0,TMAX,TMAX,TMAX,TMAX,WDMV,WDMV,WDMV,WDMV,TMIN,TMIN,TMIN,TMIN,PRCP,PRCP,PRCP,PRCP,SNOW,SNOW,SNOW,SNOW
State,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2013,96.0,94.0,95.5,108.0,188.3,77.1,144.2,180.8,40.0,58.0,27.5,2.0,48.21,102.24,78.075,17.54,0.0,0.0,0.0,0.0
2014,97.0,93.0,95.0,105.0,170.3,92.0,139.8,188.9,35.0,58.0,19.0,0.0,50.05,115.24,58.12,26.49,0.0,0.0,0.0,0.0
2015,98.0,93.0,98.0,107.0,277.8,51.0,231.8,136.1,38.0,59.0,21.0,32.0,40.46,147.59,70.92,29.63,0.0,0.0,0.0,0.0
2016,97.0,90.0,95.5,105.0,275.3,,146.0,131.7,40.0,59.0,29.0,29.0,56.33,128.43,75.935,15.12,0.0,0.0,0.0,0.0
2017,97.0,89.0,95.5,110.0,359.2,,183.9,242.05,40.0,57.0,21.5,27.0,49.42,105.57,76.15,17.87,0.0,0.0,0.4,0.0


***
### Reshape the Dataset(s)

In [4]:
# Florida: pair each year's sugarcane "Weight" with that year's Florida weather readings.

# Production side: keep only the "Weight" column and drop the years where it is missing.
FL_s = df_sugarcane["FL"][["Weight"]]
FL_s = FL_s.reset_index().dropna()

# Weather side: take every column whose second header level is "FL", then discard that
# level so the columns are named by measurement only.
FL_w = df_weather.loc(axis=1)[:, "FL"].reset_index()
FL_w.columns = FL_w.columns.droplevel(1)

# Join the two sides on their shared "Year" column, then fill any remaining gaps with the
# mean of the column they occur in (the mean is taken over every joined row).
FL = FL_s.merge(FL_w, on="Year").pipe(lambda joined: joined.fillna(joined.mean()))
FL.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
85,2013,13720000.0,96.0,188.3,40.0,48.21,0.0
86,2014,15053000.0,97.0,170.3,35.0,50.05,0.0
87,2015,16915000.0,98.0,277.8,38.0,40.46,0.0
88,2016,16120000.0,97.0,275.3,40.0,56.33,0.0
89,2017,16237000.0,97.0,359.2,40.0,49.42,0.0


In [5]:
# Hawaii: pair each year's sugarcane "Weight" with that year's Hawaii weather readings.

# Production side: keep only the "Weight" column and drop the years where it is missing.
HI_s = df_sugarcane["HI"][["Weight"]]
HI_s = HI_s.reset_index().dropna()

# Weather side: take every column whose second header level is "HI", then discard that
# level so the columns are named by measurement only.
HI_w = df_weather.loc(axis=1)[:, "HI"].reset_index()
HI_w.columns = HI_w.columns.droplevel(1)

# Join the two sides on their shared "Year" column, then fill any remaining gaps with the
# mean of the column they occur in (the mean is taken over every joined row).
HI = HI_s.merge(HI_w, on="Year").pipe(lambda joined: joined.fillna(joined.mean()))
HI.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
78,2012,1262000.0,86.0,121.2,60.0,90.94,0.0
79,2013,1352000.0,94.0,77.1,58.0,102.24,0.0
80,2014,1261000.0,93.0,92.0,58.0,115.24,0.0
81,2015,1139000.0,93.0,51.0,59.0,147.59,0.0
82,2016,1336000.0,90.0,110.788889,59.0,128.43,0.0


In [6]:
# Louisiana: pair each year's sugarcane "Weight" with that year's Louisiana weather readings.

# Production side: keep only the "Weight" column and drop the years where it is missing.
LA_s = df_sugarcane["LA"][["Weight"]]
LA_s = LA_s.reset_index().dropna()

# Weather side: take every column whose second header level is "LA", then discard that
# level so the columns are named by measurement only.
LA_w = df_weather.loc(axis=1)[:, "LA"].reset_index()
LA_w.columns = LA_w.columns.droplevel(1)

# Join the two sides on their shared "Year" column, then fill any remaining gaps with the
# mean of the column they occur in (the mean is taken over every joined row).
LA = LA_s.merge(LA_w, on="Year").pipe(lambda joined: joined.fillna(joined.mean()))
LA.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
104,2013,12505000.0,95.5,144.2,27.5,78.075,0.0
105,2014,11387000.0,95.0,139.8,19.0,58.12,0.0
106,2015,11396000.0,98.0,231.8,21.0,70.92,0.0
107,2016,11520000.0,95.5,146.0,29.0,75.935,0.0
108,2017,13455000.0,95.5,183.9,21.5,76.15,0.4


In [7]:
# Texas: pair each year's sugarcane "Weight" with that year's Texas weather readings.

# Production side: keep only the "Weight" column and drop the years where it is missing.
TX_s = df_sugarcane["TX"][["Weight"]]
TX_s = TX_s.reset_index().dropna()

# Weather side: take every column whose second header level is "TX", then discard that
# level so the columns are named by measurement only.
TX_w = df_weather.loc(axis=1)[:, "TX"].reset_index()
TX_w.columns = TX_w.columns.droplevel(1)

# Join the two sides on their shared "Year" column, then fill any remaining gaps with the
# mean of the column they occur in (the mean is taken over every joined row).
TX = TX_s.merge(TX_w, on="Year").pipe(lambda joined: joined.fillna(joined.mean()))
TX.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
55,2013,1446000.0,108.0,180.8,2.0,17.54,0.0
56,2014,1194000.0,105.0,188.9,0.0,26.49,0.0
57,2015,1105000.0,107.0,136.1,32.0,29.63,0.0
58,2016,1395000.0,105.0,131.7,29.0,15.12,0.0
59,2017,1490000.0,110.0,242.05,27.0,17.87,0.0


***
### Model the Problem with Machine Learning Algorithm(s)

In [8]:
# Compare Linear Regression to Random Forest Regression for Florida's sugarcane production.
# Features are the year plus Florida's weather columns; the target is the "Weight" column.
X_FL = FL[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_FL = FL["Weight"]

# train_test_split shuffles the rows by default (shuffle=True), so the test set is a random
# 30% sample of the years rather than the last 30%; random_state=42 fixes that sample so it
# is the same on every run.
# NOTE(review): because the rows are shuffled, test years fall between training years. If the
# goal is forecasting future seasons, a chronological hold-out (shuffle=False) may be a
# stricter check -- confirm which evaluation is intended.
X_FL_train, X_FL_test, y_FL_train, y_FL_test = train_test_split(
    X_FL, y_FL, test_size=0.3, random_state=42
)

lr_model_FL = LinearRegression()
# The forest is a randomised estimator: seed it so the reported R^2 values are reproducible
# under Restart Kernel -> Run All.
rf_model_FL = RandomForestRegressor(random_state=42)

lr_model_FL.fit(X_FL_train, y_FL_train)
rf_model_FL.fit(X_FL_train, y_FL_train)

# Report R^2 on the training and held-out rows for both models.
print("FLORIDA SUGARCANE PRODUCTION VS FLORIDA WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_FL.score(X_FL_train, y_FL_train)))
print("    R^2  test: {0:.4f}".format(lr_model_FL.score(X_FL_test, y_FL_test)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_FL.score(X_FL_train, y_FL_train)))
print("    R^2  test: {0:.4f}".format(rf_model_FL.score(X_FL_test, y_FL_test)))

FLORIDA SUGARCANE PRODUCTION VS FLORIDA WEATHER:
  Linear Regression Model Results:
    R^2 train: 0.9235
    R^2  test: 0.8793

  Random Forest Regression Model Results:
    R^2 train: 0.9891
    R^2  test: 0.9678


In [9]:
# Compare Linear Regression to Random Forest Regression for Hawaii's sugarcane production.
# Features are the year plus Hawaii's weather columns; the target is the "Weight" column.
X_HI = HI[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_HI = HI["Weight"]

# train_test_split shuffles the rows by default (shuffle=True), so the test set is a random
# 30% sample of the years rather than the last 30%; random_state=42 fixes that sample so it
# is the same on every run.
# NOTE(review): because the rows are shuffled, test years fall between training years. If the
# goal is forecasting future seasons, a chronological hold-out (shuffle=False) may be a
# stricter check -- confirm which evaluation is intended.
X_HI_train, X_HI_test, y_HI_train, y_HI_test = train_test_split(
    X_HI, y_HI, test_size=0.3, random_state=42
)

lr_model_HI = LinearRegression()
# The forest is a randomised estimator: seed it so the reported R^2 values are reproducible
# under Restart Kernel -> Run All.
rf_model_HI = RandomForestRegressor(random_state=42)

lr_model_HI.fit(X_HI_train, y_HI_train)
rf_model_HI.fit(X_HI_train, y_HI_train)

# Report R^2 on the training and held-out rows for both models.
print("HAWAII SUGARCANE PRODUCTION VS HAWAII WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_HI.score(X_HI_train, y_HI_train)))
print("    R^2  test: {0:.4f}".format(lr_model_HI.score(X_HI_test, y_HI_test)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_HI.score(X_HI_train, y_HI_train)))
print("    R^2  test: {0:.4f}".format(rf_model_HI.score(X_HI_test, y_HI_test)))

HAWAII SUGARCANE PRODUCTION VS HAWAII WEATHER:
  Linear Regression Model Results:
    R^2 train: 0.6897
    R^2  test: 0.4619

  Random Forest Regression Model Results:
    R^2 train: 0.9934
    R^2  test: 0.9366


In [10]:
# Compare Linear Regression to Random Forest Regression for Louisiana's sugarcane production.
# Features are the year plus Louisiana's weather columns; the target is the "Weight" column.
X_LA = LA[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_LA = LA["Weight"]

# train_test_split shuffles the rows by default (shuffle=True), so the test set is a random
# 30% sample of the years rather than the last 30%; random_state=42 fixes that sample so it
# is the same on every run.
# NOTE(review): because the rows are shuffled, test years fall between training years. If the
# goal is forecasting future seasons, a chronological hold-out (shuffle=False) may be a
# stricter check -- confirm which evaluation is intended.
X_LA_train, X_LA_test, y_LA_train, y_LA_test = train_test_split(
    X_LA, y_LA, test_size=0.3, random_state=42
)

lr_model_LA = LinearRegression()
# The forest is a randomised estimator: seed it so the reported R^2 values are reproducible
# under Restart Kernel -> Run All.
rf_model_LA = RandomForestRegressor(random_state=42)

lr_model_LA.fit(X_LA_train, y_LA_train)
rf_model_LA.fit(X_LA_train, y_LA_train)

# Report R^2 on the training and held-out rows for both models.
print("LOUISIANA SUGARCANE PRODUCTION VS LOUISIANA WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_LA.score(X_LA_train, y_LA_train)))
print("    R^2  test: {0:.4f}".format(lr_model_LA.score(X_LA_test, y_LA_test)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_LA.score(X_LA_train, y_LA_train)))
print("    R^2  test: {0:.4f}".format(rf_model_LA.score(X_LA_test, y_LA_test)))

LOUISIANA SUGARCANE PRODUCTION VS LOUISIANA WEATHER:
  Linear Regression Model Results:
    R^2 train: 0.8027
    R^2  test: 0.6125

  Random Forest Regression Model Results:
    R^2 train: 0.9816
    R^2  test: 0.8239


In [11]:
# Compare Linear Regression to Random Forest Regression for Texas' sugarcane production.
# Features are the year plus Texas' weather columns; the target is the "Weight" column.
X_TX = TX[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_TX = TX["Weight"]

# train_test_split shuffles the rows by default (shuffle=True), so the test set is a random
# 30% sample of the years rather than the last 30%; random_state=42 fixes that sample so it
# is the same on every run.
# NOTE(review): because the rows are shuffled, test years fall between training years. If the
# goal is forecasting future seasons, a chronological hold-out (shuffle=False) may be a
# stricter check -- confirm which evaluation is intended.
X_TX_train, X_TX_test, y_TX_train, y_TX_test = train_test_split(
    X_TX, y_TX, test_size=0.3, random_state=42
)

lr_model_TX = LinearRegression()
# The forest is a randomised estimator: seed it so the reported R^2 values are reproducible
# under Restart Kernel -> Run All.
rf_model_TX = RandomForestRegressor(random_state=42)

lr_model_TX.fit(X_TX_train, y_TX_train)
rf_model_TX.fit(X_TX_train, y_TX_train)

# Report R^2 on the training and held-out rows for both models.
print("TEXAS SUGARCANE PRODUCTION VS TEXAS WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_TX.score(X_TX_train, y_TX_train)))
print("    R^2  test: {0:.4f}".format(lr_model_TX.score(X_TX_test, y_TX_test)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_TX.score(X_TX_train, y_TX_train)))
print("    R^2  test: {0:.4f}".format(rf_model_TX.score(X_TX_test, y_TX_test)))

TEXAS SUGARCANE PRODUCTION VS TEXAS WEATHER:
  Linear Regression Model Results:
    R^2 train: 0.8606
    R^2  test: 0.8158

  Random Forest Regression Model Results:
    R^2 train: 0.9775
    R^2  test: 0.9391
