# Part I: Libraries:

In [168]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
import matplotlib.pyplot as plt 
import statsmodels.api as sm
import pandas as pd 
import numpy as np
import pickle
%matplotlib inline 

# Part II: Functions: 

In [258]:
#Function 1:
def fit_model_sklearn(df, commodity, model_name):
    """
    INPUT: df (master dataframe); commodity (the type of commodity that one whishes to predict); mode_name (type of model)
    OUT: model (the fitted modle); X_train; X_test; y_train; y_test 
    """
    com_df = df[df['COMMODITY']==commodity]
    y, X = com_df['YIELD/ACRE'], com_df.drop(['COMMODITY', 'YIELD/ACRE'], axis=1)    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
                                     
    if model_name == "Linear Regression":
        model = LinearRegression()
    elif model_name == 'RandomForestRegressor':
        model = RandomForestRegressor(n_estimators=50)
    elif model_name == 'ExtraTreesRegressor':
        model = ExtraTreesRegressor(n_estimators=50)
    elif model_name == 'GradientBoostingRegressor':
        params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,'learning_rate': 0.01, 'loss': 'ls'}
        model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train) 
    return model, X_train, X_test, y_train, y_test

#Function 2: 
def generate_sub_csv(df, commodity):
    """
    INPUT: df (master dataframe); commodities_list (list of commodities)
    """
    path = '/Users/Hsieh/Desktop/persephone/Data/Models/model_yield_{}.csv'
    df[df['commodity_desc']==commodity].to_csv(path.format(commodity))
        
#Function 3: 
def join_dfs(weather_df, commodity):
    path = '/Users/Hsieh/Desktop/persephone/Data/Models/model_yield_{}.csv'
    yield_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/model_yield_{}.csv'.format(commodity))
    
#Function 4: 
def fit_model_sm(df, commodity, model_name):
    """
    INPUT: df (master dataframe); commodity (the type of commodity that one whishes to predict); mode_name (type of model)
    OUT: model (the fitted modle); X_train; X_test; y_train; y_test 
    """
    com_df = df[df["COMMODITY"]==commodity]
    y, X = com_df['YIELD/ACRE'], com_df.drop(['COMMODITY', 'YIELD/ACRE'], axis=1)    
                                     
    if model_name == "Linear Regression":
        X = sm.add_constant(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
        model = sm.OLS(y_train, X_train)
        
    results = model.fit()            
    return model, results, X_train, X_test, y_train, y_test

#Function 5: 
def fit_all_commodities(df, commodities_list, filepath, model_name):
    """
    INPUT: df (dataframe), commodity_list (list of respective commodities for which one whishes to build regressio models)
    OUTPUT: print result; write pickled models to file path
    """
    sklearn_models = {'RandomForestRegressor','ExtraTreesRegressor','GradientBoostingRegressor'}
    sm_models = {"Linear Regression"}
    for commodity in commodities_list:
        if model_name in sklearn_models:
            model, X_train, X_test, y_train, y_test = fit_model_sklearn(df, commodity, model_name)
            predict = model.predict(X_test)
            print "****************************************************************************"
            print "{}'s adjusted r^2 score with {} is:".format(commodity, model_name)
            print r2_score(y_test, predict)
        elif model_name in sm_models: 
            model, results, X_train, X_test, y_train, y_test = fit_model_sm(df, commodity, model_name)
        #pickle model: 
        joblib.dump(model,'{}/{}_with_{}.pkl'.format(filepath, commodity, model_name)) 
        
#Function 6: 
def calculate_yield_lost(states_stats, degree_increase, commodity_list, states_list):
    """
    INPUT: states_stats (df), degree_increase (float)
    OUTPUT: lost (float)
    """
    aggregate_lost = 0 
    for state in states_list: 
        for commodity_list

# Part III: Reorganizing Dataframes: 

In [3]:
#declaring variables: 
targeted_states = pd.Series(["California", "Iowa", "Texas", "Nebraska", "Illinois",\
                  "Minnesota", "Kansas", "Indiana", "North Carolina", "Wisconsin"])
targeted_states = targeted_states.apply(lambda x: x.upper())

## a) yield:

In [29]:
#load yield_csv:
yield_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/cleaned_master_yield.csv')

In [30]:
target_units = ['TONS / ACRE','LB / ACRE']
yield_df = yield_df[yield_df["unit_desc"].apply(lambda x: True if (x in target_units) else (False))]
yield_df = yield_df[yield_df["unit_desc"].notnull()==True]

In [31]:
out_put = ['value']
group_columns = ['year','state_name','county_name','commodity_desc','unit_desc']
dummy_columns = ['prodn_practice_desc','util_practice_desc','class_desc']
drop_columns = ['Unnamed: 0','data_item','state_alpha','statisticcat_desc','asd_code','asd_desc',\
               'congr_district_code','county_ansi','county_code','location_desc']
drop_extra = ['state_name','county_name','year']
#drop unneeded columsn for yield/acre prediction: 
yield_df.sort("year", ascending=True, inplace=True)
yield_df.drop(drop_columns,axis=1,inplace=True)
#yield_df.reset_index(inplace=True)



In [32]:
yield_df['STATE'] = yield_df['state_name']
yield_df['COUNTY'] = yield_df['county_name']
yield_df['YEAR'] = yield_df['year']
yield_df.drop(drop_extra,axis=1,inplace=True)

In [33]:
commodities_list = yield_df.commodity_desc.unique()

In [34]:
#Create  dummy variables:
yield_df= pd.get_dummies(yield_df, columns=['prodn_practice_desc','util_practice_desc'],drop_first=True)
#df_prodn = pd.get_dummies(yield_df['prodn_practice_desc'])
#df_util_practice = pd.get_dummies(yield_df['util_practice_desc'])
#df_class = pd.get_dummies(yield_df['class_desc'])

In [35]:
yield_df.columns

Index([u'commodity_desc', u'unit_desc', u'value', u'class_desc', u'STATE',
       u'COUNTY', u'YEAR', u'prodn_practice_desc_IRRIGATED',
       u'prodn_practice_desc_NON-IRRIGATED',
       u'prodn_practice_desc_NON-IRRIGATED, CONTINUOUS CROP',
       u'prodn_practice_desc_NON-IRRIGATED, FOLLOWING SUMMER FALLOW',
       u'prodn_practice_desc_NOT FOLLOWING ANOTHER CROP',
       u'util_practice_desc_GRAIN', u'util_practice_desc_SILAGE',
       u'util_practice_desc_SUGAR'],
      dtype='object')

In [36]:
#drop variables that have already been dummified: 
yield_df.drop('class_desc',axis=1,inplace=True)

In [37]:
yield_df['unit_desc'].value_counts()

TONS / ACRE    980303
LB / ACRE       95807
Name: unit_desc, dtype: int64

In [38]:
yield_df['COMMODITY'] = yield_df['commodity_desc']
yield_df['UNIT'] = yield_df['unit_desc']
yield_df['YIELD/ACRE'] = yield_df['value']
yield_df.drop(['commodity_desc','unit_desc','value'],axis=1,inplace=True)

In [None]:
yield_df.to_csv('/Users/Hsieh/Desktop/persephone/Data/model_yield_df_dummies.csv')

In [41]:
commodities_list = yield_df['COMMODITY'].unique()

In [None]:
#generate_sub_csv(yield_df, commodities_list)

## b) weather: 

In [4]:
#load weather csv:
#weather_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/cleaned_master_weather.csv')
weather_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/cleaned_master_weather_complete.csv')

In [5]:
#declaring variables: 
abnormal = -9999.00
w_group_columns = ['STATE','COUNTY','YEAR','MONTH']

features = ['CLDD','DPNP','DPNT','HTDD','DT90', 'DX32', 'DT00', 'DT32', 'DP01', 'DP05', 'DP10', 'MMXP',\
    'MMNP','TEVP','HO51A0','HO51P0','HO52A0','HO52P0','HO53A0','HO53P0','HO54A0','HO54P0','HO55A0','HO55P0',\
    'HO56A0','HO56P0','HO01A0','HO03A0','LO51A0','LO51P0','LO52A0','LO52P0','LO53A0','LO53P0','LO54A0','LO54P0',\
    'LO55A0','LO55P0','LO56A0','LO56P0','LO01A0','LO03A0','MO51A0','MO51P0','MO52A0','MO52P0','MO53A0','MO53P0',\
    'MO54A0','MO54P0', 'MO55A0','MO55P0','MO56A0','MO56P0','MO01A0','MO03A0','EMXP','MXSD','DSNW','TPCP','TSNW','EMXT',\
    'EMNT','MMXT','MMNT','MNTM','TWND']

#order_columns = ['STATE','COUNTY', 'MONTH','DSNW','EMNT','EMXP','EMXT','MMNT','MMXT','MNTM','MXSD','TPCP','TSNW','YEAR']
model_group_columns = ['STATE','COUNTY','YEAR']

In [6]:
#drop unneeded columns:
weather_df.drop(['Unnamed: 0','LATITUDE','LONGITUDE'],axis=1,inplace=True)
#grab monthly value:
weather_df["MONTH"] = weather_df["DATE"].apply(lambda x: int(str(x)[4:6]))
#turn -9999.00 (the way the data record Nan values) into Nan values:
for feature in features: 
    weather_df[feature] = weather_df[feature].apply(lambda x: np.nan if (x == abnormal) else (x))
#filter out a row where COUNTY value is missing: 
weather_df = weather_df[weather_df["COUNTY"].notnull()==True]
#capitalized 'STATE' and 'COUNTY':
weather_df['STATE'] = weather_df['STATE'].apply(lambda x: x.upper())
weather_df['COUNTY'] = weather_df['COUNTY'].apply(lambda x: x.upper())
#drop unneeded columns: 
weather_df.drop('DATE',axis=1,inplace=True)
#check total null values:
#for c in weather_df:
#    print c, np.mean(pd.isnull(weather_df[c]))
#median weather data in respect to coutny/year:
#median_weather_df = weather_df.groupby(w_group_columns).median().reset_index()
#back fill na for values that is missing in median df:
#median_weather_df.fillna(method="bfill",inplace=True)

In [7]:
#average annual weather day: 
weather_df = weather_df.groupby(['STATE','COUNTY','YEAR']).mean().reset_index()

In [8]:
weather_df.drop(['MONTH'], axis=1, inplace=True)

In [9]:
#drop columns with more 30% na values:
drop_features = []
for c in weather_df:
    if np.mean(pd.isnull(weather_df[c])) > 0.3:
        drop_features.append(c)

In [10]:
weather_df.drop(drop_features, axis=1, inplace=True)

In [11]:
print drop_features

['MMXP', 'MMNP', 'TEVP', 'HO51A0', 'HO51P0', 'HO52A0', 'HO52P0', 'HO53A0', 'HO53P0', 'HO54A0', 'HO54P0', 'HO55A0', 'HO55P0', 'HO56A0', 'HO56P0', 'HO01A0', 'HO03A0', 'LO51A0', 'LO51P0', 'LO52A0', 'LO52P0', 'LO53A0', 'LO53P0', 'LO54A0', 'LO54P0', 'LO55A0', 'LO55P0', 'LO56A0', 'LO56P0', 'LO01A0', 'LO03A0', 'MO51A0', 'MO51P0', 'MO52A0', 'MO52P0', 'MO53A0', 'MO53P0', 'MO54A0', 'MO54P0', 'MO55A0', 'MO55P0', 'MO56A0', 'MO56P0', 'MO01A0', 'MO03A0', 'TWND']


In [12]:
#since the df is order based on time and location this will fill the na with the values of next year. 
weather_df.fillna(method="bfill", inplace=True)

In [13]:
weather_df.to_csv('/Users/Hsieh/Desktop/persephone/Data/model_weather_df_complete.csv')

# Part IV: Combining Dataframes:

In [54]:
#weather_model_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/model_weather_df.csv')
weather_model_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/model_weather_df_complete.csv')
yield_model_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/model_yield_df_dummies.csv')

In [55]:
model_df = pd.merge(left=yield_model_df, right=weather_model_df, left_on=['STATE','COUNTY','YEAR'],\
                   right_on=['STATE','COUNTY','YEAR'], how="inner")

In [56]:
model_df.columns

Index([u'Unnamed: 0_x', u'STATE', u'COUNTY', u'YEAR',
       u'prodn_practice_desc_IRRIGATED', u'prodn_practice_desc_NON-IRRIGATED',
       u'prodn_practice_desc_NON-IRRIGATED, CONTINUOUS CROP',
       u'prodn_practice_desc_NON-IRRIGATED, FOLLOWING SUMMER FALLOW',
       u'prodn_practice_desc_NOT FOLLOWING ANOTHER CROP',
       u'util_practice_desc_GRAIN', u'util_practice_desc_SILAGE',
       u'util_practice_desc_SUGAR', u'COMMODITY', u'UNIT', u'YIELD/ACRE',
       u'Unnamed: 0_y', u'CLDD', u'DPNP', u'DPNT', u'HTDD', u'DT90', u'DX32',
       u'DT00', u'DT32', u'DP01', u'DP05', u'DP10', u'EMXP', u'MXSD', u'DSNW',
       u'TPCP', u'TSNW', u'EMXT', u'EMNT', u'MMXT', u'MMNT', u'MNTM'],
      dtype='object')

In [57]:
model_df.drop(['Unnamed: 0_x','Unnamed: 0_y','UNIT'], axis=1, inplace=True)

In [59]:
#grouping column names: 
categorical_features = ['STATE','COUNTY',]
dependent_variable = ['YIELD/ACRE']
#get dummy variables for categorical variables:
model_df= pd.get_dummies(model_df, columns=categorical_features,drop_first=True)
model_df.to_csv('/Users/Hsieh/Desktop/persephone/Data/model_df_complete.csv')

# Part V: Models: 

In [89]:
#after cleanning, some commodities has no values (removing these commodities)
commodities_list = list(commodities_list)
commodities_list.remove('SAFFLOWER')
commodities_list.remove('MUSTARD')
commodities_list.remove('LENTILS')
commodities_list.remove('PEAS')

In [195]:
print commodities_list

['WHEAT', 'HAY', 'CORN', 'OATS', 'BARLEY', 'TOBACCO', 'SOYBEANS', 'COTTON', 'SORGHUM', 'RICE', 'PEANUTS', 'BEANS', 'SUGARBEETS', 'RYE', 'FLAXSEED', 'SUNFLOWER', 'SUGARCANE', 'CANOLA']


## a: RandomForestRegressor: 

In [189]:
#get results/models for all commodities:
model_name = "RandomForestRegressor"
filepath = "/Users/Hsieh/Desktop/persephone/Model Results/{}".format(model_name)
fit_all_commodities(model_df, commodities_list, filepath, model_name)

****************************************************************************
WHEAT's adjusted r^2 score with RandomForestRegressor is:
0.911699733918
****************************************************************************
HAY's adjusted r^2 score with RandomForestRegressor is:
0.289489581754
****************************************************************************
CORN's adjusted r^2 score with RandomForestRegressor is:
0.877705649764
****************************************************************************
OATS's adjusted r^2 score with RandomForestRegressor is:
0.63165792489
****************************************************************************
BARLEY's adjusted r^2 score with RandomForestRegressor is:
0.591906736817
****************************************************************************
TOBACCO's adjusted r^2 score with RandomForestRegressor is:
0.332622726471
****************************************************************************
SOYBEANS's adjusted r^2 s

## b: Linear Regression: 

In [192]:
#get results/models for all commodities:
model_name = "Linear Regression"
filepath = "/Users/Hsieh/Desktop/persephone/Model Results/{}".format(model_name)
fit_all_commodities(model_df, commodities_list, filepath, model_name)

## c: ExtraTreesRegressor:

In [197]:
#get results/models for all commodities:
model_name = "ExtraTreesRegressor"
filepath = "/Users/Hsieh/Desktop/persephone/Model Results/{}".format(model_name)
fit_all_commodities(model_df, commodities_list, filepath, model_name)

****************************************************************************
WHEAT's adjusted r^2 score with ExtraTreesRegressor is:
0.924257819216
****************************************************************************
HAY's adjusted r^2 score with ExtraTreesRegressor is:
0.151594521911
****************************************************************************
CORN's adjusted r^2 score with ExtraTreesRegressor is:
0.898872333698
****************************************************************************
OATS's adjusted r^2 score with ExtraTreesRegressor is:
0.679362107888
****************************************************************************
BARLEY's adjusted r^2 score with ExtraTreesRegressor is:
0.632962389045
****************************************************************************
TOBACCO's adjusted r^2 score with ExtraTreesRegressor is:
0.383272210121
****************************************************************************
SOYBEANS's adjusted r^2 score with E

## d: Gradient Boosting: 

In [113]:
#get results/models for all commodities:
model_name = "GradientBoostingRegressor"
filepath = "/Users/Hsieh/Desktop/persephone/Model Results/{}".format(model_name)
fit_allmodels(model_df, commodities_list, filepath, model_name)

****************************************************************************
WHEAT's adjusted r^2 score with GradientBoostingRegressor is:
0.63218196811
****************************************************************************
HAY's adjusted r^2 score with GradientBoostingRegressor is:
0.340117176932
****************************************************************************
CORN's adjusted r^2 score with GradientBoostingRegressor is:
0.778768335245
****************************************************************************
OATS's adjusted r^2 score with GradientBoostingRegressor is:
0.460199686498
****************************************************************************
BARLEY's adjusted r^2 score with GradientBoostingRegressor is:
0.483636497025
****************************************************************************
TOBACCO's adjusted r^2 score with GradientBoostingRegressor is:
0.221823977219
****************************************************************************
S

## e: for loading pickled models: 

In [170]:
# model = joblib.load('filename.pkl') 

# Part VI: Conclusion: 

In [190]:
#Focus on top three field crops of U.S.:corn, soybeans, wheat 
#Source: http://www.ers.usda.gov/faqs.aspx#leading
focos_crops = ["CORN", "SOYBEANS", "WHEAT"]

## a) Features Importance:

In [260]:
model, X_train, X_test, y_train, y_test = fit_model_sklearn(model_df, 'CORN', 'ExtraTreesRegressor')

In [261]:
importances = model.feature_importances_

In [268]:
std = np.std([model.feature_importances_ for tree in model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [275]:
model_df.columns[indices][:20]

Index([u'util_practice_desc_GRAIN', u'YEAR',
       u'prodn_practice_desc_NON-IRRIGATED', u'STATE_NEBRASKA', u'COMMODITY',
       u'prodn_practice_desc_IRRIGATED', u'EMNT', u'DPNT', u'TSNW',
       u'STATE_INDIANA', u'MMXT', u'MXSD', u'STATE_SOUTH CAROLINA', u'DX32',
       u'EMXT', u'HTDD', u'DT00', u'YIELD/ACRE', u'DT32', u'TPCP'],
      dtype='object')

## b) Results:

### i. what will happen to total yield production (measured in money) of top 10 U.S. states, if average temperature +2? 

In [200]:
#load data with aggregated state yields (top 10 states):
path = "/Users/Hsieh/Desktop/persephone/Data/top10_state_yield.csv"
states_yied_df = pd.read_csv(path)

In [211]:
states_yied_df.columns

Index([u'source_desc', u'sector_desc', u'group_desc', u'commodity_desc',
       u'class_desc', u'prodn_practice_desc', u'util_practice_desc',
       u'statisticcat_desc', u'unit_desc', u'domain_desc', u'domaincat_desc',
       u'data_item', u'agg_level_desc', u'state_ansi', u'state_fips_code',
       u'state_alpha', u'state_name', u'asd_code', u'asd_desc', u'county_ansi',
       u'county_code', u'county_name', u'region_desc', u'zip_5',
       u'watershed_code', u'watershed_desc', u'congr_district_code',
       u'country_code', u'country_name', u'location_desc', u'year',
       u'freq_desc', u'begin_code', u'end_code', u'reference_period_desc',
       u'week_ending', u'load_time', u'value', u'CV'],
      dtype='object')

In [228]:
stats_values_needed = ['PRODUCTION']
columns_needed = ['statisticcat_desc','commodity_desc','state_name','value','year','unit_desc']

In [215]:
states_yied_df[states_yied_df['unit_desc']=='$']

Unnamed: 0,source_desc,sector_desc,group_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,domain_desc,...,location_desc,year,freq_desc,begin_code,end_code,reference_period_desc,week_ending,load_time,value,CV
62,SURVEY,CROPS,FIELD CROPS,SUNFLOWER,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,MINNESOTA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,27481000,
63,SURVEY,CROPS,FIELD CROPS,BEANS,DRY EDIBLE,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,KANSAS,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,5688000,
64,SURVEY,CROPS,FIELD CROPS,BEANS,DRY EDIBLE,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,NEBRASKA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,123569000,
65,SURVEY,CROPS,FIELD CROPS,HAY,ALFALFA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,CALIFORNIA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,1266300000,
66,SURVEY,CROPS,FIELD CROPS,HAY,(EXCL ALFALFA),ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,MINNESOTA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,195840000,
240,SURVEY,CROPS,FIELD CROPS,WHEAT,WINTER,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,WISCONSIN,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,138731000,
241,SURVEY,CROPS,FIELD CROPS,WHEAT,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,ILLINOIS,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,292045000,
242,SURVEY,CROPS,FIELD CROPS,OATS,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,IOWA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,15231000,
243,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,CALIFORNIA,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,26268000,
244,SURVEY,CROPS,FIELD CROPS,COTTON,PIMA,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,PRODUCTION,$,TOTAL,...,TEXAS,2012,ANNUAL,0,0,YEAR,,2014-11-06 10:31:04,8978000,


In [236]:
filtered_df = states_yied_df[columns_needed]
filtered_df = filtered_df[filtered_df["unit_desc"]=='$']
filtered_df.drop(['statisticcat_desc','unit_desc'],axis=1,inplace=True)

In [242]:
filtered_df['value'] = filtered_df['value'].apply(lambda x: np.nan if (x == '(D)') else x) 

In [249]:
filtered_df['value'] = filtered_df['value'].apply(lambda x: x if type(x)==float else float(x.replace(',','')))

In [255]:
filtered_df = filtered_df.groupby(by=['commodity_desc','state_name']).mean().reset_index()
filtered_df.drop(['year'],axis=1,inplace=True)

In [256]:
filtered_df.head(10)

Unnamed: 0,commodity_desc,state_name,value
0,BARLEY,CALIFORNIA,17959330.0
1,BARLEY,KANSAS,2203667.0
2,BARLEY,MINNESOTA,29368000.0
3,BARLEY,NORTH CAROLINA,4017000.0
4,BARLEY,WISCONSIN,4378333.0
5,BEANS,CALIFORNIA,67576670.0
6,BEANS,KANSAS,4430500.0
7,BEANS,MINNESOTA,130162300.0
8,BEANS,NEBRASKA,121366300.0
9,BEANS,TEXAS,11577330.0


In [259]:
increase_2_celcius = 3.6
lost =  calculate_yield_lost(filtered_df, increase_2_celcius, commodities_list, targeted_states)

TypeError: calculate_yield_lost() takes exactly 2 arguments (4 given)

# Part VII: Archive Code:

In [179]:
#Function 1: 
"""
def merge_cols(x):
    return x['COUNTY'] + x['STATE'] + str(x['MONTH'])
"""

#Function 2: 
"""
def fill_na(x, median_df):
    INPUT: x (pd series; row of a df); median_df (pd df; dataframe with the needed average values)
    OUTPUT: new_x (pd series; new row of df with filled_na values)
    OVERVIEW: fill in na values of a df with historical average of the monthly value of that month and region 
    
    state, county, month = x[0], x[1], x[2]  
    
    row = median_df.loc[(median_df['STATE']==state)&(median_df['COUNTY']==county)&(median_df['MONTH']==month)]
    s_row = row.iloc[0][1:]  
    
    x.loc[np.where(x=="NAN")] = s_row.loc[np.where(x=="NAN")]
    return x 
"""
#model_df.unit_desc.unique()

#grouping column names: 
#categorical_features = ['STATE','COUNTY']
#numeric_features = ['YEAR','DSNW','EMNT','EMXP','EMXT','MMNT','MMXT','MNTM','MXSD','TPCP','TSNW']
#dependent_variable = ['value']

'\ndef fill_na(x, median_df):\n    INPUT: x (pd series; row of a df); median_df (pd df; dataframe with the needed average values)\n    OUTPUT: new_x (pd series; new row of df with filled_na values)\n    OVERVIEW: fill in na values of a df with historical average of the monthly value of that month and region \n    \n    state, county, month = x[0], x[1], x[2]  \n    \n    row = median_df.loc[(median_df[\'STATE\']==state)&(median_df[\'COUNTY\']==county)&(median_df[\'MONTH\']==month)]\n    s_row = row.iloc[0][1:]  \n    \n    x.loc[np.where(x=="NAN")] = s_row.loc[np.where(x=="NAN")]\n    return x \n'

# Part VIII: Features Key: 

In [178]:
features_key = {'CLDD':'Cooling degree days','DPNP':'Departure from normal monthly precipitation',\
                'DPNT':'Departure from normal monthly temperature','HTDD':'Heating degree days',\
                'DT90':'Number days with maximum temperature greater than or equal 90.0 F',\
                'DX32':'Number days with maximum temperature less than or equal to 32.0 F',\
                'DT00':'Number days with minimum temperature less than or equal to 0.0 F',\
                'DT32':'Number days with minimum temperature less than or equal to 32.0 F',\
                'DP01':'Number of days with greater than or equal to 0.1 inch of precipitation',\
                'DP05':'Number of days with greater than or equal to 0.5 inch of precipitation',\
                'DP10':'Number of days with greater than or equal to 1.0 inch of precipitation',\
                'MMXP':'Monthly mean maximum temperature of evaporation pan water',\
                'MMNP':'Monthly mean maximum temperature of evaporation pan water',\
                'TEVP':'Total monthly evaporation',\
                'EMXP':'Extreme maximum daily precipitation','MXSD':'Maximum snow depth',\
                'DSNW':'Number days with snow depth > 1 inch','TPCP':'Total precipitation',\
                'TPCP':'Total precipitation','TSNW':'Total snow fall',\
                'EMXT':'Extreme maximum daily temperature','EMNT':'Extreme maximum daily temperature',\
                'MMNT':'Monthly Mean minimum temperature','MNTM':' Monthly mean temperature',\
                'TWND':'Total monthly wind movement over evaporation pan'}