***
#### Springboard.com: Data Science Career Track: Capstone 1: Machine Learning Exercise
# Predicting Sugarcane Production in the United States
***
### Import the Required Python Packages

In [1]:
import pandas as pd
import numpy as np
import plotly as py
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
import warnings

# Silence every warning category for the rest of the notebook session
warnings.simplefilter("ignore")

# Enable plotly's offline notebook rendering (connected=True is passed through unchanged)
py.offline.init_notebook_mode(connected=True)

***
### Import the Dataset(s)

In [2]:
# Load the sugarcane table: the first two CSV rows form a two-level column
# header and the first CSV column becomes the row index
df_sugarcane = pd.read_csv(
    "../03 Data Wrangling/df_sugarcane.csv",
    header=[0, 1],
    index_col=0,
)

# Preview the last rows
df_sugarcane.tail()

State,FL,FL,FL,HI,HI,HI,LA,LA,LA,TX,TX,TX
Data Item,PRIndex,Value,Weight,PRIndex,Value,Weight,PRIndex,Value,Weight,PRIndex,Value,Weight
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2013,35.1,481572000.0,13720000.0,57.5,77740000.0,1352000.0,25.9,323880000.0,12505000.0,18.8,27185000.0,1446000.0
2014,36.8,553950000.0,15053000.0,43.1,54349000.0,1261000.0,33.6,382603000.0,11387000.0,9.41,11236000.0,1194000.0
2015,35.6,602174000.0,16915000.0,40.7,46357000.0,1139000.0,24.7,281481000.0,11396000.0,21.1,23316000.0,1105000.0
2016,38.7,623844000.0,16120000.0,40.7,54375000.0,1336000.0,24.6,283392000.0,11520000.0,20.5,28598000.0,1395000.0
2017,,,16237000.0,,,,,,13455000.0,,,1490000.0


In [3]:
# Load the regional weather table: the first two CSV rows form a two-level
# column header and the first CSV column becomes the row index
df_weather = pd.read_csv(
    "../04 Data Storytelling/df_weather.csv",
    header=[0, 1],
    index_col=0,
)

# Preview the last rows
df_weather.tail()

Unnamed: 0_level_0,TMAX,TMAX,TMAX,TMAX,WDMV,WDMV,WDMV,WDMV,TMIN,TMIN,TMIN,TMIN,PRCP,PRCP,PRCP,PRCP,SNOW,SNOW,SNOW,SNOW
State,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX,FL,HI,LA,TX
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2013,96.0,94.0,95.5,108.0,188.3,77.1,144.2,180.8,40.0,58.0,27.5,2.0,48.21,102.24,78.075,17.54,0.0,0.0,0.0,0.0
2014,97.0,93.0,95.0,105.0,170.3,92.0,139.8,188.9,35.0,58.0,19.0,0.0,50.05,115.24,58.12,26.49,0.0,0.0,0.0,0.0
2015,98.0,93.0,98.0,107.0,277.8,51.0,231.8,136.1,38.0,59.0,21.0,32.0,40.46,147.59,70.92,29.63,0.0,0.0,0.0,0.0
2016,97.0,90.0,95.5,105.0,275.3,,146.0,131.7,40.0,59.0,29.0,29.0,56.33,128.43,75.935,15.12,0.0,0.0,0.0,0.0
2017,97.0,89.0,95.5,110.0,359.2,,183.9,242.05,40.0,57.0,21.5,27.0,49.42,105.57,76.15,17.87,0.0,0.0,0.4,0.0


***
### Reshape the Dataset(s)

In [4]:
# Florida: join yearly sugarcane production to yearly weather, then impute gaps

# Production side: keep only the "Weight" column and drop rows with missing values
FL_s = df_sugarcane["FL"].loc[:, ["Weight"]].reset_index().dropna()

# Weather side: select every "FL" column, then flatten the two-level header
FL_w = df_weather.loc[:, (slice(None), "FL")].reset_index()
FL_w.columns = FL_w.columns.droplevel(1)

# Join on the shared "Year" column, then fill remaining NaN with each column's mean
FL = FL_s.merge(FL_w, on="Year")
FL = FL.fillna(FL.mean())
FL.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
85,2013,13720000.0,96.0,188.3,40.0,48.21,0.0
86,2014,15053000.0,97.0,170.3,35.0,50.05,0.0
87,2015,16915000.0,98.0,277.8,38.0,40.46,0.0
88,2016,16120000.0,97.0,275.3,40.0,56.33,0.0
89,2017,16237000.0,97.0,359.2,40.0,49.42,0.0


In [5]:
# Hawaii: join yearly sugarcane production to yearly weather, then impute gaps

# Production side: keep only the "Weight" column and drop rows with missing values
HI_s = df_sugarcane["HI"].loc[:, ["Weight"]].reset_index().dropna()

# Weather side: select every "HI" column, then flatten the two-level header
HI_w = df_weather.loc[:, (slice(None), "HI")].reset_index()
HI_w.columns = HI_w.columns.droplevel(1)

# Join on the shared "Year" column, then fill remaining NaN with each column's mean
HI = HI_s.merge(HI_w, on="Year")
HI = HI.fillna(HI.mean())
HI.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
78,2012,1262000.0,86.0,121.2,60.0,90.94,0.0
79,2013,1352000.0,94.0,77.1,58.0,102.24,0.0
80,2014,1261000.0,93.0,92.0,58.0,115.24,0.0
81,2015,1139000.0,93.0,51.0,59.0,147.59,0.0
82,2016,1336000.0,90.0,110.788889,59.0,128.43,0.0


In [6]:
# Louisiana: join yearly sugarcane production to yearly weather, then impute gaps

# Production side: keep only the "Weight" column and drop rows with missing values
LA_s = df_sugarcane["LA"].loc[:, ["Weight"]].reset_index().dropna()

# Weather side: select every "LA" column, then flatten the two-level header
LA_w = df_weather.loc[:, (slice(None), "LA")].reset_index()
LA_w.columns = LA_w.columns.droplevel(1)

# Join on the shared "Year" column, then fill remaining NaN with each column's mean
LA = LA_s.merge(LA_w, on="Year")
LA = LA.fillna(LA.mean())
LA.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
104,2013,12505000.0,95.5,144.2,27.5,78.075,0.0
105,2014,11387000.0,95.0,139.8,19.0,58.12,0.0
106,2015,11396000.0,98.0,231.8,21.0,70.92,0.0
107,2016,11520000.0,95.5,146.0,29.0,75.935,0.0
108,2017,13455000.0,95.5,183.9,21.5,76.15,0.4


In [7]:
# Texas: join yearly sugarcane production to yearly weather, then impute gaps

# Production side: keep only the "Weight" column and drop rows with missing values
TX_s = df_sugarcane["TX"].loc[:, ["Weight"]].reset_index().dropna()

# Weather side: select every "TX" column, then flatten the two-level header
TX_w = df_weather.loc[:, (slice(None), "TX")].reset_index()
TX_w.columns = TX_w.columns.droplevel(1)

# Join on the shared "Year" column, then fill remaining NaN with each column's mean
TX = TX_s.merge(TX_w, on="Year")
TX = TX.fillna(TX.mean())
TX.tail()

Unnamed: 0,Year,Weight,TMAX,WDMV,TMIN,PRCP,SNOW
55,2013,1446000.0,108.0,180.8,2.0,17.54,0.0
56,2014,1194000.0,105.0,188.9,0.0,26.49,0.0
57,2015,1105000.0,107.0,136.1,32.0,29.63,0.0
58,2016,1395000.0,105.0,131.7,29.0,15.12,0.0
59,2017,1490000.0,110.0,242.05,27.0,17.87,0.0


In [8]:
# Build the United States' sugarcane production series from the state-level data.
# US_s is a one-column ("Weight") DataFrame indexed like df_sugarcane, holding the
# row-wise sum of every state's "Weight" column.
# NOTE(review): a state with no value for a year is skipped rather than making the
# whole year NaN -- confirm that this is the intended national figure.
US_s = (
    df_sugarcane
    .xs("Weight", axis=1, level=1)   # one "Weight" column per state
    .sum(axis=1, min_count=1)        # NaN only when every state is missing that year
    .to_frame("Weight")
)

# TODO: decide how the state-level weather columns should be aggregated into
# national features, then merge them with US_s on "Year" and impute missing
# values the same way the per-state tables are built.

***
### Model the Problem with Machine Learning Algorithm(s)

In [9]:
# Compare Linear Regression to Random Forest Regression for Florida's sugarcane production
# Features: the year plus Florida's weather columns; target: the "Weight" column.
X_FL = FL[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_FL = FL["Weight"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_FL_train, X_FL_test, y_FL_train, y_FL_test = train_test_split(X_FL, y_FL, test_size=0.3, random_state=42)

lr_model_FL = LinearRegression()
# Seed the forest so its scores and feature importances are the same on every run
rf_model_FL = RandomForestRegressor(random_state=42)

lr_model_FL.fit(X_FL_train, y_FL_train)
rf_model_FL.fit(X_FL_train, y_FL_train)

# Fit the same linear formula with statsmodels on the training rows to get the OLS summary table
FL_train = X_FL_train.join(y_FL_train)
m_FL = ols("Weight ~ Year + TMAX + TMIN + WDMV + PRCP + SNOW", FL_train).fit()

print(m_FL.summary())
print()
print("FLORIDA SUGARCANE PRODUCTION VS FLORIDA WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_FL.score(X_FL_train, y_FL_train)))
print("    R^2  test: {0:.4f}".format(lr_model_FL.score(X_FL_test, y_FL_test)))
print("    y-intercept: {}".format(round(lr_model_FL.intercept_, 1)))
print("    {0}-coefficients: {1}".format(u"\u03B2", [round(c, 1) for c in lr_model_FL.coef_]))
print("    Feature variable order: {}".format(list(X_FL.columns)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_FL.score(X_FL_train, y_FL_train)))
print("    R^2  test: {0:.4f}".format(rf_model_FL.score(X_FL_test, y_FL_test)))
print("    Feature importances: {}".format([round(p, 4) for p in rf_model_FL.feature_importances_]))
print("    Feature variable order: {}".format(list(X_FL.columns)))

# Predictions over the full table (train and test rows together) and their residuals.
# The residual series are not plotted below; they are kept as notebook variables.
lr_y_FL_pred = lr_model_FL.predict(X_FL)
lr_FL_residuals = y_FL - lr_y_FL_pred

rf_y_FL_pred = rf_model_FL.predict(X_FL)
rf_FL_residuals = y_FL - rf_y_FL_pred

# Despite the "Residuals" trace names, each scatter plots predicted (x) against
# actual (y) values; the red "Actual" line is y = x.
trace0 = py.graph_objs.Scatter(name="LR Residuals", x=lr_y_FL_pred, y=y_FL, mode="markers")
trace1 = py.graph_objs.Scatter(name="RF Residuals", x=rf_y_FL_pred, y=y_FL, mode="markers")
trace2 = py.graph_objs.Scatter(name="Actual", x=y_FL, y=y_FL, mode="lines", line=dict(color="red"))

print()
# Side-by-side panels sharing one y axis: linear regression on the left, random forest on the right
fig = py.tools.make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.01)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace2, 1, 2)

fig["layout"].update(
    plot_bgcolor="rgb(247,247,247)",
    legend=dict(font=dict(family="serif", size=12)),
    title="<b>Residuals Comparison between Linear Regression and Random Forest Models</b>",
    titlefont=dict(family="serif", size=24),
    yaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis2=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)))

py.offline.iplot(fig)

                            OLS Regression Results                            
Dep. Variable:                 Weight   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     137.6
Date:                Sat, 14 Jul 2018   Prob (F-statistic):           1.67e-30
Time:                        14:07:15   Log-Likelihood:                -991.27
No. Observations:                  63   AIC:                             1995.
Df Residuals:                      57   BIC:                             2007.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -4.244e+08   1.92e+07    -22.114      0.0

In [10]:
# Compare Linear Regression to Random Forest Regression for Hawaii's sugarcane production
# Features: the year plus Hawaii's weather columns; target: the "Weight" column.
X_HI = HI[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_HI = HI["Weight"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_HI_train, X_HI_test, y_HI_train, y_HI_test = train_test_split(X_HI, y_HI, test_size=0.3, random_state=42)

lr_model_HI = LinearRegression()
# Seed the forest so its scores and feature importances are the same on every run
rf_model_HI = RandomForestRegressor(random_state=42)

lr_model_HI.fit(X_HI_train, y_HI_train)
rf_model_HI.fit(X_HI_train, y_HI_train)

# Fit the same linear formula with statsmodels on the training rows to get the OLS summary table
HI_train = X_HI_train.join(y_HI_train)
m_HI = ols("Weight ~ Year + TMAX + TMIN + WDMV + PRCP + SNOW", HI_train).fit()

print(m_HI.summary())
print()
print("HAWAII SUGARCANE PRODUCTION VS HAWAII WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_HI.score(X_HI_train, y_HI_train)))
print("    R^2  test: {0:.4f}".format(lr_model_HI.score(X_HI_test, y_HI_test)))
print("    y-intercept: {}".format(round(lr_model_HI.intercept_, 1)))
print("    {0}-coefficients: {1}".format(u"\u03B2", [round(c, 1) for c in lr_model_HI.coef_]))
print("    Feature variable order: {}".format(list(X_HI.columns)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_HI.score(X_HI_train, y_HI_train)))
print("    R^2  test: {0:.4f}".format(rf_model_HI.score(X_HI_test, y_HI_test)))
print("    Feature importances: {}".format([round(p, 4) for p in rf_model_HI.feature_importances_]))
print("    Feature variable order: {}".format(list(X_HI.columns)))

# Predictions over the full table (train and test rows together) and their residuals.
# The residual series are not plotted below; they are kept as notebook variables.
lr_y_HI_pred = lr_model_HI.predict(X_HI)
lr_HI_residuals = y_HI - lr_y_HI_pred

rf_y_HI_pred = rf_model_HI.predict(X_HI)
rf_HI_residuals = y_HI - rf_y_HI_pred

# Despite the "Residuals" trace names, each scatter plots predicted (x) against
# actual (y) values; the red "Actual" line is y = x.
trace0 = py.graph_objs.Scatter(name="LR Residuals", x=lr_y_HI_pred, y=y_HI, mode="markers")
trace1 = py.graph_objs.Scatter(name="RF Residuals", x=rf_y_HI_pred, y=y_HI, mode="markers")
trace2 = py.graph_objs.Scatter(name="Actual", x=y_HI, y=y_HI, mode="lines", line=dict(color="red"))

print()
# Side-by-side panels sharing one y axis: linear regression on the left, random forest on the right
fig = py.tools.make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.01)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace2, 1, 2)

fig["layout"].update(
    plot_bgcolor="rgb(247,247,247)",
    legend=dict(font=dict(family="serif", size=12)),
    title="<b>Residuals Comparison between Linear Regression and Random Forest Models</b>",
    titlefont=dict(family="serif", size=24),
    yaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis2=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)))

py.offline.iplot(fig)

                            OLS Regression Results                            
Dep. Variable:                 Weight   R-squared:                       0.690
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     23.12
Date:                Sat, 14 Jul 2018   Prob (F-statistic):           3.83e-12
Time:                        14:07:15   Log-Likelihood:                -918.26
No. Observations:                  58   AIC:                             1849.
Df Residuals:                      52   BIC:                             1861.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.122e+08   2.33e+07      9.098      0.0

In [11]:
# Compare Linear Regression to Random Forest Regression for Louisiana's sugarcane production
# Features: the year plus Louisiana's weather columns; target: the "Weight" column.
X_LA = LA[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_LA = LA["Weight"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_LA_train, X_LA_test, y_LA_train, y_LA_test = train_test_split(X_LA, y_LA, test_size=0.3, random_state=42)

lr_model_LA = LinearRegression()
# Seed the forest so its scores and feature importances are the same on every run
rf_model_LA = RandomForestRegressor(random_state=42)

lr_model_LA.fit(X_LA_train, y_LA_train)
rf_model_LA.fit(X_LA_train, y_LA_train)

# Fit the same linear formula with statsmodels on the training rows to get the OLS summary table
LA_train = X_LA_train.join(y_LA_train)
m_LA = ols("Weight ~ Year + TMAX + TMIN + WDMV + PRCP + SNOW", LA_train).fit()

print(m_LA.summary())
print()
print("LOUISIANA SUGARCANE PRODUCTION VS LOUISIANA WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_LA.score(X_LA_train, y_LA_train)))
print("    R^2  test: {0:.4f}".format(lr_model_LA.score(X_LA_test, y_LA_test)))
print("    y-intercept: {}".format(round(lr_model_LA.intercept_, 1)))
print("    {0}-coefficients: {1}".format(u"\u03B2", [round(c, 1) for c in lr_model_LA.coef_]))
print("    Feature variable order: {}".format(list(X_LA.columns)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_LA.score(X_LA_train, y_LA_train)))
print("    R^2  test: {0:.4f}".format(rf_model_LA.score(X_LA_test, y_LA_test)))
print("    Feature importances: {}".format([round(p, 4) for p in rf_model_LA.feature_importances_]))
print("    Feature variable order: {}".format(list(X_LA.columns)))

# Predictions over the full table (train and test rows together) and their residuals.
# The residual series are not plotted below; they are kept as notebook variables.
lr_y_LA_pred = lr_model_LA.predict(X_LA)
lr_LA_residuals = y_LA - lr_y_LA_pred

rf_y_LA_pred = rf_model_LA.predict(X_LA)
rf_LA_residuals = y_LA - rf_y_LA_pred

# Despite the "Residuals" trace names, each scatter plots predicted (x) against
# actual (y) values; the red "Actual" line is y = x.
trace0 = py.graph_objs.Scatter(name="LR Residuals", x=lr_y_LA_pred, y=y_LA, mode="markers")
trace1 = py.graph_objs.Scatter(name="RF Residuals", x=rf_y_LA_pred, y=y_LA, mode="markers")
trace2 = py.graph_objs.Scatter(name="Actual", x=y_LA, y=y_LA, mode="lines", line=dict(color="red"))

print()
# Side-by-side panels sharing one y axis: linear regression on the left, random forest on the right
fig = py.tools.make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.01)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace2, 1, 2)

fig["layout"].update(
    plot_bgcolor="rgb(247,247,247)",
    legend=dict(font=dict(family="serif", size=12)),
    title="<b>Residuals Comparison between Linear Regression and Random Forest Models</b>",
    titlefont=dict(family="serif", size=24),
    yaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis2=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)))

py.offline.iplot(fig)

                            OLS Regression Results                            
Dep. Variable:                 Weight   R-squared:                       0.803
Model:                            OLS   Adj. R-squared:                  0.786
Method:                 Least Squares   F-statistic:                     46.78
Date:                Sat, 14 Jul 2018   Prob (F-statistic):           2.04e-22
Time:                        14:07:15   Log-Likelihood:                -1187.0
No. Observations:                  76   AIC:                             2388.
Df Residuals:                      69   BIC:                             2404.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.869e+08   1.58e+07    -11.820      0.0

In [12]:
# Compare Linear Regression to Random Forest Regression for Texas' sugarcane production
# Features: the year plus Texas' weather columns; target: the "Weight" column.
X_TX = TX[["Year", "TMAX", "TMIN", "WDMV", "PRCP", "SNOW"]]
y_TX = TX["Weight"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_TX_train, X_TX_test, y_TX_train, y_TX_test = train_test_split(X_TX, y_TX, test_size=0.3, random_state=42)

lr_model_TX = LinearRegression()
# Seed the forest so its scores and feature importances are the same on every run
rf_model_TX = RandomForestRegressor(random_state=42)

lr_model_TX.fit(X_TX_train, y_TX_train)
rf_model_TX.fit(X_TX_train, y_TX_train)

# Fit the same linear formula with statsmodels on the training rows to get the OLS summary table
TX_train = X_TX_train.join(y_TX_train)
m_TX = ols("Weight ~ Year + TMAX + TMIN + WDMV + PRCP + SNOW", TX_train).fit()

print(m_TX.summary())
print()
print("TEXAS SUGARCANE PRODUCTION VS TEXAS WEATHER:")
print("  Linear Regression Model Results:")
print("    R^2 train: {0:.4f}".format(lr_model_TX.score(X_TX_train, y_TX_train)))
print("    R^2  test: {0:.4f}".format(lr_model_TX.score(X_TX_test, y_TX_test)))
print("    y-intercept: {}".format(round(lr_model_TX.intercept_, 1)))
print("    {0}-coefficients: {1}".format(u"\u03B2", [round(c, 1) for c in lr_model_TX.coef_]))
print("    Feature variable order: {}".format(list(X_TX.columns)))
print()
print("  Random Forest Regression Model Results:")
print("    R^2 train: {0:.4f}".format(rf_model_TX.score(X_TX_train, y_TX_train)))
print("    R^2  test: {0:.4f}".format(rf_model_TX.score(X_TX_test, y_TX_test)))
print("    Feature importances: {}".format([round(p, 4) for p in rf_model_TX.feature_importances_]))
print("    Feature variable order: {}".format(list(X_TX.columns)))

# Predictions over the full table (train and test rows together) and their residuals.
# The residual series are not plotted below; they are kept as notebook variables.
lr_y_TX_pred = lr_model_TX.predict(X_TX)
lr_TX_residuals = y_TX - lr_y_TX_pred

rf_y_TX_pred = rf_model_TX.predict(X_TX)
rf_TX_residuals = y_TX - rf_y_TX_pred

# Despite the "Residuals" trace names, each scatter plots predicted (x) against
# actual (y) values; the red "Actual" line is y = x.
trace0 = py.graph_objs.Scatter(name="LR Residuals", x=lr_y_TX_pred, y=y_TX, mode="markers")
trace1 = py.graph_objs.Scatter(name="RF Residuals", x=rf_y_TX_pred, y=y_TX, mode="markers")
trace2 = py.graph_objs.Scatter(name="Actual", x=y_TX, y=y_TX, mode="lines", line=dict(color="red"))

print()
# Side-by-side panels sharing one y axis: linear regression on the left, random forest on the right
fig = py.tools.make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0.01)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace2, 1, 2)

fig["layout"].update(
    plot_bgcolor="rgb(247,247,247)",
    legend=dict(font=dict(family="serif", size=12)),
    title="<b>Residuals Comparison between Linear Regression and Random Forest Models</b>",
    titlefont=dict(family="serif", size=24),
    yaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis1=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)),
    xaxis2=dict(title="<b>Production, US ton</b>", titlefont=dict(family="serif", size=14), tickfont=dict(family="serif", size=14)))

py.offline.iplot(fig)

                            OLS Regression Results                            
Dep. Variable:                 Weight   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.837
Method:                 Least Squares   F-statistic:                     36.03
Date:                Sat, 14 Jul 2018   Prob (F-statistic):           1.43e-13
Time:                        14:07:15   Log-Likelihood:                -569.92
No. Observations:                  42   AIC:                             1154.
Df Residuals:                      35   BIC:                             1166.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -2.689e+07   2.44e+06    -11.005      0.0