# Part 0: Libraries:

In [1]:
from collections import OrderedDict
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline



# Part I: Functions:

In [2]:
#Function 1: 
def parse_data_item(s):
    """
    INPUT: s (string; one value of the "Data Item" column)
    OUTPUT: attributes (set of strings; every ", "-separated token except the first,
        taken from the text left of the yield marker),
        unit (string; the text right of the yield marker)
    OVERVIEW: Split a "Data Item" description into its attribute set and its unit.
        Side effect: the attributes are also merged into the module-level set
        `master_attributes`, which must exist before this function is called.
        Raises ValueError when the marker does not occur exactly once in s.
    """
    description, unit = s.split(" - YIELD, MEASURED IN ")
    tokens = description.split(", ")
    # the first token is skipped; only the remaining ones count as attributes
    attributes = set(tokens[1:])
    master_attributes.update(attributes)
    return attributes, unit

#Function 2: 
def add_columns(df, s):
    """
    INPUT: df (data frame with a "Crop Attributes" column holding one collection
        of attribute names per row), s (iterable of attribute names)
    OUTPUT: N/A (df is modified in place)
    OVERVIEW: For every name in s, add a column of that name to df holding 1 where
        the row's "Crop Attributes" contains the name and 0 otherwise.
    """
    for name in s:
        # bind the current name as a default so each flag function is self-contained
        df[name] = df["Crop Attributes"].apply(
            lambda attrs, wanted=name: int(wanted in attrs)
        )
        
#Function 3: 
def count_irrigation(df, time_period):
    """
    INPUT: df (data frame with 0/1 dummy columns "IRRIGATED" and "NON-IRRIGATED"),
        time_period (string; label of the period, used only in the printed header)
    OUTPUT: N/A (results are printed)
    OVERVIEW: count/print of irrigated vs. non-irrigated crops and their ratio.
        The ratio is printed as nan when there are no non-irrigated rows.
    """
    irrigated = int((df["IRRIGATED"] == 1).sum())
    # a row is non-irrigated when its dummy column is 1 (comparing against 0
    # would count every row that is NOT flagged as non-irrigated)
    non_irrigated = int((df["NON-IRRIGATED"] == 1).sum())
    if non_irrigated:
        ratio = irrigated / float(non_irrigated)
    else:
        # avoid ZeroDivisionError on frames without non-irrigated rows
        ratio = float("nan")
    print("__Time period:{}__".format(time_period))
    print("Total irrigated crops: {}".format(irrigated))
    print("Total non-irrigated crops: {}".format(non_irrigated))
    print("Ratio of irrigated/non-irrigated: {}".format(ratio))
    
#Function 4: 
def count_sub_type(df, time_period): 
    """
    INPUT: df (data frame with "Year" and "Commodity" columns),
        time_period (value compared against the "Year" column; also printed in the header)
    OUTPUT: count (total number of rows, i.e. sub_types, over all commodities of that year)
    OVERVIEW: count/print the number of sub_types of every commodity during a period,
        followed by the number of distinct commodities and the overall total.
    """
    df = df[df["Year"]==time_period]
    all_commodities = df["Commodity"].unique()
    total_commodities, total_sub_domains = len(all_commodities), 0
    print("__Time period:{}__".format(time_period))
    for commodity in all_commodities:
        # every row of a commodity within the year is counted as one sub_type
        sub_domains_count = len(df[df["Commodity"]==commodity])
        total_sub_domains += sub_domains_count
        print("{} has {} sub_types".format(commodity, sub_domains_count))
    print("Total general commodities: {}".format(total_commodities))
    print("Total commodities: {}".format(total_sub_domains))
    # the docstring always promised this count; existing callers that ignore the
    # return value are unaffected
    return total_sub_domains
    
#Function 5: 
def graph_yield(df, year, comparing_variable):
    """
    INPUT: df (data frame with "Year", "Commodity", "Unit", "Value" columns);
        year (value compared against the "Year" column);
        comparing_variable (string; name of the column the yields are grouped by)
    OUTPUT: N/A
    OVERVIEW: for every commodity of the given year, sum "Value" per
        comparing_variable and draw the sums as a bar graph via graph_bar.
        Stops (returning None) at the first commodity that has more than one
        unit, after printing that commodity and its units.
    """
    #i.grabbing values: 
    df = df[df["Year"]==year]
    for commodity in df["Commodity"].unique():
        df_commodity = df[df["Commodity"]==commodity]
        units = set(df_commodity["Unit"].values)
        if len(units) != 1:
            print("ERROR: MULTIPLE TYPES OF UNIT FOR COMMODITY")
            print("DETAILS ABOUT ERROR:")
            print(commodity)
            # report only the offending commodity's units, not those of the whole year
            print(units)
            return None 
        unit = df_commodity["Unit"].values[0]
        totals = df_commodity.groupby(by=[comparing_variable])["Value"].sum()
        #ii. graph:
        graph_bar(list(totals.index), list(totals.values), comparing_variable, unit, commodity, year)

#Function 6:
def convert_unit(x, conversion_dict):
    """
    INPUT: x (row of dataframe; read through its 'Value', 'Unit' and 'Commodity' keys);
        conversion_dict (dict; key = commodity, value = conversion rate)
    OUPUT: new_val (float)
    OVERVIEW: when the part of 'Unit' before the first ' / ' is "BU", return 'Value'
        multiplied by the commodity's rate, rounded to 2 decimals; otherwise return
        'Value' untouched. A BU commodity missing from conversion_dict raises KeyError.
    """
    value = x['Value']
    numerator = x['Unit'].split(' / ')[0]
    if numerator != "BU":
        return value
    rate = conversion_dict[x['Commodity']]
    return round(value * rate, 2)

#Function 7:
def change_unit_name(x):
    """
    INPUT: x (row of dataframe; read through its 'Unit' key)
    OUPUT: new_name (str)
    OVERVIEW: when the part of 'Unit' before the first ' / ' is "BU", return
        "TONS / <second part>"; otherwise return 'Unit' unchanged.
    """
    parts = x['Unit'].split(' / ')
    if parts[0] == "BU":
        # only the second ' / '-separated part is carried over
        return "TONS" + ' / ' + parts[1]
    return x['Unit']

#Function 8:
def standardrize_unit(df, conversion_dict):
    """
    INPUT: df (data frame with 'Value', 'Unit' and 'Commodity' columns),
        conversion_dict (dict; key = commodity, value = conversion rate)
    OUTPUT: df (the same data frame object, modified in place)
    OVERVIEW: rewrite 'Value' row by row with convert_unit, then rewrite 'Unit'
        row by row with change_unit_name.
    """
    # values first: convert_unit still needs to see the original "BU" unit names
    converted_values = df.apply(lambda row: convert_unit(row, conversion_dict), axis=1)
    df['Value'] = converted_values
    renamed_units = df.apply(change_unit_name, axis=1)
    df['Unit'] = renamed_units
    return df 

#Function 9:
def graph_bar(x, y, x_label, y_label, title, year):
    """
    INPUT: x (list of category labels, one per bar), y (list of bar heights)
        ,x_label (str; x-axis label), y_label (str; y-axis label)
        ,title (str; title of graph), year (appended to the title in parentheses)
    OUTPUT: N/A (the figure is shown)
    OVERVIEW: draw one vertical bar chart using the 'fivethirtyeight' style
    """
    with plt.style.context('fivethirtyeight'):
        positions = range(len(x))
        # bars are centred on integer positions; the labels from x go on the ticks
        plt.bar(positions, y, align='center')
        plt.xticks(positions, x, rotation='vertical')
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        heading = title + " ({})".format(year)
        plt.title(heading)
        plt.show()

# Part II: Load/Clean Data: 

In [None]:
#0: Declaring Variables:
# accumulator that parse_data_item() updates as a side effect with every
# attribute it parses; must be defined before the "Data Item" column is parsed
master_attributes = set()

In [None]:
#1: Load csv files: 
# single place to edit when the yield CSVs live in another directory
DATA_DIR = "/Users/Hsieh/desktop/project/Data/Yield"
df_2000s = pd.read_csv(DATA_DIR + "/CACropYield2000s.csv")
df_90s = pd.read_csv(DATA_DIR + "/CACropYield90s.csv")
df_80s = pd.read_csv(DATA_DIR + "/CACropYield80s.csv")
df_70s = pd.read_csv(DATA_DIR + "/CACropYield70s.csv")

In [None]:
#2: Combine into master dataframe: 
# pd.concat / sort_values replace DataFrame.append / DataFrame.sort, which no
# longer exist in current pandas.
# reset_index() deliberately keeps the old index as an "index" column: the
# column-dropping step further down expects that column to exist.
master_df = (
    pd.concat([df_70s, df_80s, df_90s, df_2000s])
    .sort_values("Year", ascending=True)
    .reset_index()
)
#back up copy:
master_df_copy = master_df.copy()

In [None]:
#3: Refine dataframe: 
#a) keep rows whose Domain is TOTAL and Period is YEAR, excluding the combined-counties bucket:
rows_to_keep = (
    (master_df['Domain'] == 'TOTAL')
    & (master_df['Period'] == 'YEAR')
    & (master_df["County"] != 'OTHER (COMBINED) COUNTIES')
)
#b) drop unnecessary columns:
unused_columns = [
    'Program', 'Week Ending', 'Geo Level', 'State', 'State ANSI', 'Ag District Code',
    'County ANSI', 'Zip Code', 'Region', 'watershed_code', 'Watershed', 'Domain',
    'Domain Category', 'CV (%)', 'Period', 'index', 'Ag District',
]
master_df = master_df[rows_to_keep].drop(unused_columns, axis=1)

In [None]:
#4: Parse "Data Item" column and add new columns: 
#a) keep only the rows whose "Data Item" contains the word "YIELD":
mentions_yield = master_df["Data Item"].apply(lambda item: "YIELD" in item)
# .copy() gives an independent frame, so the column assignments below do not
# write onto a slice of the previous frame (SettingWithCopyWarning)
master_df = master_df[mentions_yield].copy()
#b) parse String in "Data Item":
#add new column: "Crop Attributes"(set; attributes of associated crops), "Unit"(string; yield unit):
parsed_items = master_df["Data Item"].apply(parse_data_item)
master_df["Crop Attributes"], master_df["Unit"] = zip(*parsed_items)

In [None]:
#5: Add one 0/1 dummy column per attribute collected in master_attributes
#   (add_columns modifies master_df in place):
add_columns(master_df, master_attributes)

In [None]:
#6: "Data Item" values: 
#master_df['Data Item'].unique()

In [None]:
#7: convert values in column "Value" to float (thousands separators removed):
# entries that are already numeric are passed straight to float(), so the cell
# can be re-run and does not depend on how read_csv typed the column
master_df["Value"] = master_df["Value"].apply(
    lambda x: float(x.replace(",", "")) if hasattr(x, "replace") else float(x)
)

# Part III: EDA: 

## a) Overview: 

In [None]:
#1: column names of master_df (before the model columns are selected):
master_df.columns

In [None]:
#2: make df for model: 
#columns left out of the model frame (master_df itself is not modified): 
archive_columns = [
    "Data Item",
    "Crop Attributes",
    "PROCESSING",
    "DRY EDIBLE",
    "OTHER",
    "IN THE OPEN",
]
model_df = master_df.drop(archive_columns, axis=1)

In [None]:
#3: column names of model_df:
model_df.columns

In [None]:
#4: general commodity types (distinct values of the "Commodity" column):
model_df["Commodity"].unique()

In [None]:
#5: split model_df at the 1989/1990 boundary:
years = model_df["Year"]
#a: 1989 and earlier:
pre_1990_df = model_df[years <= 1989]
#b: 1990 and later:
post_1990_df = model_df[years > 1989]

In [None]:
#6: count of net acre harvested v.s. count of acre harvested:
#a: pre 1990:
print("PRE-1990:")
print(pre_1990_df["Unit"].value_counts())
#b: post 1990 (post_1990_df is reused as built in step #5, not recomputed):
print("POST-1989")
print(post_1990_df["Unit"].value_counts())
#c: conclusion: 
#i. after 1989, almost only unit/acre data available 
#ii. before 1989 (1989 included): about 50/5
#    NOTE(review): "50/5" is presumably meant to read "50/50" -- confirm against the output

In [None]:
#7: count irrigated vs non-irrigated:
#a) all time: 
count_irrigation(model_df, "all time")
#b) pre-1990 (the frames are split at 1989/1990, so the printed labels say 1990):
count_irrigation(pre_1990_df, "pre 1990")
#c) post-1990:
count_irrigation(post_1990_df, "post 1990")
#d) conclusion: 
#ratio about constant over the years (3 times more non-irrigated crops than irrigated crops )
#NOTE(review): re-check this conclusion against the printed counts

In [None]:
#8: total # crop types over the years (output for every 5th year: 1990, 1995, ..., 2015):
# range() iterates identically on Python 2 and 3, unlike xrange()
for year in range(1990, 2020, 5):
    count_sub_type(model_df, year)
    print("************************")

## b) Feature Engineering: 

In [None]:
#1: creating conversion rates for diff units (for crops measured in bu)
#resources: 
#https://www.agric.gov.ab.ca/app19/calc/crop/bushel2tonne.jsp
#http://www.grains.org/buyingselling/conversion-factors

# metric tonnes contained in one (short) ton
tonne_per_ton = 0.907185
#tonne_per_bu for each commodities: 
# corn and sorghum: a 56 lb bushel is about 0.0254 tonne (the previous 0.25 was
# ten times too large). Built from a list of pairs so the order is kept on
# every Python version.
tonne_bu_dict = OrderedDict([
    ("CORN", 0.0254),
    ("BARLEY", 0.021),
    ("WHEAT", 0.027),
    ("SORGHUM", 0.0254),
    ("OATS", 0.015),
])
#ton_per_bu for each commodities:
# (tonne / bu) / (tonne / ton) = ton / bu
ton_bu_dict = {}
for commodity in tonne_bu_dict:
    ton_bu_dict[commodity] = tonne_bu_dict[commodity]/tonne_per_ton

In [None]:
# show the ton-per-bushel rate computed for each commodity
print(ton_bu_dict)

In [None]:
#2: convert BU-based yields with ton_bu_dict and relabel their unit as 'TONS / ...'
#   (standardrize_unit modifies model_df in place and returns the same frame):
model_df = standardrize_unit(model_df, ton_bu_dict)

## c) Graphs (one per crop type, for the year selected below):

In [None]:
# year whose yields are graphed in the cells below
year = 2011

## i: Bar Graphs: "yield/acre" v.s. "counties" 

In [None]:
# one bar graph per commodity: summed "Value" per county for the selected year
graph_yield(model_df, year, 'County')

## d) Multicollinearity:

In [None]:
#N/A

# Part IV: Features Selection: 

In [None]:
#N/A

# Part V: Models:

## a) 1st Model (timeseries on future yield):

# Part VI: Assessment: 

# Part VII: Conclusions:

# Part VIII: Archive Code:

In [None]:
#1: 

#a) Handpick column names that specifies the type of crop (commodity): 
#NOTED: only beans have "DRY EDIBLE", 
#sub_domain = ["DURUM","GRAIN","SMALL","UPLAND","LIMA","WHITE","KIDNEY","SILAGE","CHICKPEAS"]

#b): Add these sub_domain name to "Commodity" name: 
#add_sub_domain(master_df, sub_domain)
#Function 3: Add sub_domain name to "Commodity" name, then remove the sub_domain names from columns:
#NOTE: unfinished draft (the final `if` has no body); it is wrapped in a string
#literal so that it is never executed.
"""
def add_sub_domain(df, l):
    #INPUT: df (data frame), l (list of sub_domain names )
    #OUTPUT: N/A
    #OVERVIEW: FILL IN 
    for name in l:
        if df.ix[:, name] == 1:
"""

#2: 

#Pivot data frame so it is right for time series: 
#this line has the basic features:
#index_list = ["County","Commodity","Unit",\
#this line has the season harvested:
#              "SPRING","FOLLOWING SUMMER FALLOW","WINTER","CONTINUOUS CROP",\
#this line has the type of farming techniques used: 
#              "IRRIGATED","NON-IRRIGATED",\
#this line has the sub_domains:
#              "DURUM","GRAIN","SMALL","UPLAND","LIMA","WHITE",\
#              "KIDNEY","SILAGE","RED","PIMA","CHICKPEAS",\
#this line has the usage types:
#              "OIL TYPE","NON-OIL TYPE"]

#p_model_df = pd.pivot_table(model_df, values="Value", index=index_list, columns=["Year"])