# Cleaning Soybean Data

#### Import Line

In [2]:
import pandas as pd
import glob
import numpy as np
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",200)
import re
import matplotlib.pyplot as plt

%matplotlib inline

In [13]:
data_dir = "../data/Soybean_Trial_Data/UIUC_SAES/conventional/entries"

In [16]:
glob.glob(data_dir+"conventional/entries/*")

[]

### Reading in files

In [14]:
cv_file_lists ={"entries": glob.glob(data_dir+"conventional/entries/*"),
             "region1": glob.glob(data_dir+"conventional/Region1/*"),
             "region2": glob.glob(data_dir+"conventional/Region2/*"),
             "region3": glob.glob(data_dir+"conventional/Region3/*"),
             "region4": glob.glob(data_dir+"conventional/Region4/*"),
             "region5": glob.glob(data_dir+"conventional/Region5/*"),
             "region6": glob.glob(data_dir+"conventional/Region6/*"),
             "urbana": glob.glob(data_dir+"conventional/Urbana/*")
            }

In [15]:
cv_file_lists

{'entries': [],
 'region1': [],
 'region2': [],
 'region3': [],
 'region4': [],
 'region5': [],
 'region6': [],
 'urbana': []}

In [6]:
roundup_file_lists ={"entries": glob.glob(data_dir+"roundup/entries/*"),
             "region1": glob.glob(data_dir+"roundup/Region1/*"),
             "region2": glob.glob(data_dir+"roundup/Region2/*"),
             "region3": glob.glob(data_dir+"roundup/Region3/*"),
             "region4": glob.glob(data_dir+"roundup/Region4/*"),
             "region5": glob.glob(data_dir+"roundup/Region5/*"),
             "region6": glob.glob(data_dir+"roundup/Region6/*"),
             "urbana": glob.glob(data_dir+"roundup/Urbana/*")
            }

In [7]:
liberty_file_lists ={"entries": glob.glob(data_dir+"liberty/entries/*"),
             "region1": glob.glob(data_dir+"liberty/Region1/*"),
             "region2": glob.glob(data_dir+"liberty/Region2/*"),
             "region3": glob.glob(data_dir+"liberty/Region3/*"),
             "region4": glob.glob(data_dir+"liberty/Region4/*"),
             "region5": glob.glob(data_dir+"liberty/Region5/*")
            }

This is a loooong function for cleaning the different column names

In [8]:
replace_dict = {"regional":"region",
                "dixonsprings":"dixon_springs"}

def clean_column(x):
    x = unidec(x)
    x = x.strip()
    x = x.lower()
    x = x.translate(remv_punc)
    for k,v in replace_dict.items():
        x = x.replace(k, v)
    x = x.replace(" ","_")
    if ("results" in x):
        x = x.replace("results","").replace("_results_","")
    if ("company" in x)|("brand" in x):
        x = "company"
    if ("variety" in x)|("cultivar" in x)|("name" in x):
        x = "variety"
    if ("yieldbua" in x):
        x = x.split("yield")[0] + "_yield"
    if ("yldbua" in x):
        x = x.split("yldbua")[0] + "_yield"
    if ("oil" in x):
        x = x.split("oil")[0] + "_oil"
    if ("protein" in x):
        x = x.split("protein")[0] + "_protein" 
    if ("yld" in x):
        x = x.split("yld")[0] + "_yield"
    if ("mat" in x):
        x = "maturity"
    if ("hgt" in x)|("htin" in x)|("ht" == x):
        x = "height"
    if (x == "ave2_yrbua")|(x == "ave2_yr")|("9697" in x)|(x=='2_yravg_yield')|(x=='2yravebua')|("2yr" in x):
        x = "region_yield_2yr_avg"
    if (x == "ave3_yrbua")|(x == "ave3_yr")|("9597" in x)|(x=='3_yravg_yield')|(x=='3yravebua')|("3yr" in x):
        x = "region_yield_3yr_avg"
    if ("2_yield" in x):
        x=x.replace("2_yield","_yield")
    if ("3_yield" in x):
        x=x.replace("3_yield","_yield")
    if (x=="_yield")|(x=="reg_yield")|(x=='2_yield')|(x=="regional_yield"):
        x="region_yield"
    if (x=="_oil")|(x=="region_oil")|(x=='regional_oil'):
        x="region_oil"
    if (x=="_protein")|(x=="region_protein")|(x=='regional_protein'):
        x="region_protein"
    if (x=="lodg")|(x=='1997lodg')|(x=='resultslodg')|(x=='region_lodging'):
        x="lodging"
    if (x=="shat")|(x=="shatter"):
        x="shattering"
    if ("herbicide_trait" in x):
        x= "herbicide_trait"
    if ("__" in x):
        x = x.replace("__","_")
    return x

In [9]:
def clean_entries_column(x):
    x = unidec(str(x))
    x = x.replace("*","")
    x = x.lower()
    x = x.translate(remv_punc)
    x = x.replace("brand","")
    x = x.strip()
    
    if x=="i":
        x="1"
    if x=="ii":
        x="2"
    if x=="iii":
        x="3"
    if x=="iv":
        x="4"
    if x=="v":
        x="5"
    if x=="vi":
        x="6"
    
    if (x=="mg")|(x=="m")|(x=="group"):
        x= "maturity"
    
    if ("hc" in x)|("color" in x):
        x="color"
    if ("herb" in x):
        x="herb_trait" 
    if x=="seed trt":
        x="st"
        
    if x=="scn":
        x="sn"
    if "info" in x:
        x="entry_info"
    if x=="nem":
        x="sn"
    
    return x

Here are functions for removing punctuation from the string as well as unicode.

In [10]:
import string
from unidecode import unidecode as unidec

# Taking out the "_" character
punc_list = string.punctuation[:-6] + string.punctuation[-5:]

# This maketrans function can be used to remove a list of characters from a string.
remv_punc = str.maketrans('','',punc_list)

# Remove unicode
def unicode_remove(x):
    return str(x).replace(u'\xa0', u'')

Read in files

In [11]:
# Read in the excel files
cv = {}
roundup = {}
liberty = {}

for region in cv_file_lists.keys():
    cv[region] = {}
    for file in cv_file_lists[region]:
        year = file.lower().split("\\")[1].split(".xl")[0]
        cv[region][year] = pd.read_excel(file)

for region in roundup_file_lists.keys():
    roundup[region] = {}
    for file in roundup_file_lists[region]:
        year = file.lower().split("\\")[1].split(".xl")[0]
        roundup[region][year] = pd.read_excel(file)
        
for region in liberty_file_lists.keys():
    liberty[region] = {}
    for file in liberty_file_lists[region]:
        year = file.lower().split("\\")[1].split(".xl")[0]
        liberty[region][year] = pd.read_excel(file)    

Nested for loops to read in and clean the columns.

In [12]:
# For everything but "entries"
clean_tables = {}
for region in cv.keys():
    clean_tables[region] = {}
    if region!="entries":
        for year in cv[region].keys():
            print(year,region)
            temp = cv[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # These rows contain column names
            temp_cols = list(temp.iloc[:5,:].fillna("").sum(axis=0))
            
            # Take them out and assign them as columns.
            temp = temp.iloc[5:,:]
            
            temp.columns = temp_cols
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            # Find the number non-null entries per row
            entry_counts = temp.count(axis=1)
            
            # Find the ixs which equal 1, these ones separate tables.
            ixs = list(temp[(entry_counts==1)&(~pd.isnull(temp.iloc[:,0]))].index) + [list(temp.index)[-1]+2]
            
            # Cycle through and extract info.
            res = []
            for x in range(1,len(ixs)):
                temp_ = temp.loc[ixs[x-1]:ixs[x]-1,:]
                temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
                res += [temp_]
            
            # Concat results together
            temp = pd.concat(res,axis=0)
            
            # Take out rows where either first or second column are missing, as these are label rows.
            temp = temp[(~pd.isnull(temp.iloc[:,0]))&(~pd.isnull(temp.iloc[:,1]))]
            
            
            
            # assign year and region
            temp['year'] = year
            temp['region'] = region
            
            # Drop these too after parsing unicode.
            temp = temp[(temp.iloc[:,0].apply(unicode_remove)!="")&\
                        (temp.iloc[:,1].apply(unicode_remove)!="")]
            
            # Apply the clean column function.
            temp.columns = [clean_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_tables[region][year] = temp

clean_cv_entries = {}
for region in cv.keys():
    if region=="entries":
        for year in cv["entries"].keys():
            print(year,region)
            temp = cv[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            temp['year'] = year

            temp.columns = [clean_entries_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_cv_entries[year] = temp

1995 region1
1996 region1
1997 region1
1998 region1
1999 region1
2000 region1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2001 region1
2002 region1
2003 region1
2004 region1
2005 region1
2006 region1
2007 region1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2008 region1
2009 region1
2010 region1
2011 region1
2012 region1
2013 region1
2014 region1
2015 region1
2016 region1
2017 region1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2018 region1
2019 region1
2020 region1
1995 region2
1996 region2
1997 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1998 region2
1999 region2
2000 region2
2001 region2
2002 region2
2003 region2
2004 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2005 region2
2006 region2
2007 region2
2008 region2
2009 region2
2010 region2
2011 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2012 region2
2013 region2
2014 region2
2015 region2
2016 region2
2017 region2
2018 region2
2019 region2
2020 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1995 region3
1996 region3
1997 region3
1998 region3
1999 region3
2000 region3
2001 region3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2002 region3
2003 region3
2004 region3
2005 region3
2006 region3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2007 region3
2008 region3
2009 region3
2010 region3
2011 region3
2012 region3
2013 region3
2014 region3
2015 region3
2016 region3
2017 region3
2018 region3
2019 region3
2020 region3
1995 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1996 region4
1997 region4
1998 region4
1999 region4
2000 region4
2001 region4
2002 region4
2003 region4
2004 region4
2005 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2006 region4
2007 region4
2008 region4
2009 region4
2010 region4
2011 region4
2012 region4
2013 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2014 region4
2015 region4
2016 region4
2017 region4
2018 region4
2019 region4
2020 region4
1995 region5
1996 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1997 region5
1998 region5
1999 region5
2000 region5
2001 region5
2002 region5
2003 region5
2004 region5
2005 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2006 region5
2007 region5
2008 region5
2009 region5
2010 region5
2011 region5
2012 region5
2013 region5
2014 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2015 region5
2016 region5
2017 region5
2018 region5
2019 region5
2020 region5
1995 region6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1996 region6
1997 region6
1998 region6
1999 region6
1995 urbana
1996 urbana
1997 urbana


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1998 urbana
1999 urbana
2000 urbana
2001 urbana
2002 urbana
2003 urbana
2004 urbana
2005 urbana


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2006 urbana
2007 urbana
2008 urbana
2009 urbana
2010 urbana
2011 urbana
1997 entries
1998 entries
1999 entries
2000 entries
2001 entries
2002 entries
2003 entries
2004 entries
2005 entries
2006 entries
2007 entries
2008 entries
2009 entries
2010 entries
2011 entries
2012 entries
2013 entries
2014 entries
2015 entries
2016 entries
2017 entries
2018 entries
2019 entries
2020 entries


In [13]:
# Roundup
clean_tables_roundup = {}
for region in roundup.keys():
    clean_tables_roundup[region] = {}
    if region!="entries":
        for year in roundup[region].keys():
            print(year,region)
            temp = roundup[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # These rows contain column names
            temp_cols = list(temp.iloc[:5,:].fillna("").sum(axis=0))
            
            # Take them out and assign them as columns.
            temp = temp.iloc[5:,:]
            
            temp.columns = temp_cols
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            # Find the number non-null entries per row
            entry_counts = temp.count(axis=1)
            
            # Find the ixs which equal 1, these ones separate tables.
            ixs = list(temp[(entry_counts==1)&(~pd.isnull(temp.iloc[:,0]))].index) + [list(temp.index)[-1]+2]
            
            # Cycle through and extract info.
            res = []
            for x in range(1,len(ixs)):
                temp_ = temp.loc[ixs[x-1]:ixs[x]-1,:]
                temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
                res += [temp_]
            
            # Concat results together
            temp = pd.concat(res,axis=0)
            
            # Take out rows where either first or second column are missing, as these are label rows.
            temp = temp[(~pd.isnull(temp.iloc[:,0]))&(~pd.isnull(temp.iloc[:,1]))]
            
            
            
            # assign year and region
            temp['year'] = year
            temp['region'] = region
            
            # Drop these too after parsing unicode.
            temp = temp[(temp.iloc[:,0].apply(unicode_remove)!="")&\
                        (temp.iloc[:,1].apply(unicode_remove)!="")]
            
            # Apply the clean column function.
            temp.columns = [clean_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_tables_roundup[region][year] = temp
            
clean_roundup_entries = {}
for region in roundup.keys():
    if region=="entries":
        for year in roundup["entries"].keys():
            print(year,region)
            temp = roundup[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            temp['year'] = year

            temp.columns = [clean_entries_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_roundup_entries[year] = temp

1998 region1
1999 region1
2000 region1
2001 region1
2002 region1
2003 region1
2004 region1
2005 region1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2006 region1
2007 region1
2008 region1
2009 region1
2010 region1
2011 region1
2012 region1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2013 region1
2014 region1
2015 region1
2016 region1
1998 region2
1999 region2
2000 region2
2001 region2
2002 region2
2003 region2
2004 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2005 region2
2006 region2
2007 region2
2008 region2
2009 region2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2010 region2
2011 region2
2012 region2
2013 region2
2014 region2
2015 region2
2016 region2
1998 region3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1999 region3
2000 region3
2001 region3
2002 region3
2003 region3
2004 region3
2005 region3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2006 region3
2007 region3
2008 region3
2009 region3
2010 region3
2011 region3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2012 region3
2013 region3
2014 region3
2015 region3
2016 region3
1998 region4
1999 region4
2000 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2001 region4
2002 region4
2003 region4
2004 region4
2005 region4
2006 region4
2007 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2008 region4
2009 region4
2010 region4
2011 region4
2012 region4
2013 region4
2014 region4
2015 region4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2016 region4
1998 region5
1999 region5
2000 region5
2001 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2002 region5
2003 region5
2004 region5
2005 region5
2006 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2007 region5
2008 region5
2009 region5
2010 region5
2011 region5
2012 region5
2013 region5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2014 region5
2015 region5
2016 region5
1999 region6
1999 urbana
2000 urbana
2001 urbana
2002 urbana


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


2003 urbana
2004 urbana
2005 urbana
2006 urbana
2007 urbana
2008 urbana
2009 urbana
2010 urbana
2011 urbana


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_['mgroup'] = list(temp.loc[ixs[x-1]])[0]
A value is trying to be set on a copy of a slice from a DataFrame.


1998 entries
1999 entries
2000 entries
2001 entries
2002 entries
2003 entries
2004 entries
2005 entries
2006 entries
2007 entries
2008 entries
2009 entries
2010 entries
2011 entries
2012 entries
2013 entries
2014 entries
2015 entries
2016 entries


In [14]:
# Liberty
clean_tables_liberty = {}
for region in liberty.keys():
    clean_tables_liberty[region] = {}
    if region!="entries":
        for year in liberty[region].keys():
            print(year,region)
            temp = liberty[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # These rows contain column names
            temp_cols = list(temp.iloc[:5,:].fillna("").sum(axis=0))
            
            # Take them out and assign them as columns.
            temp = temp.iloc[5:,:]
            
            temp.columns = temp_cols
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            
            # Take out rows where either first or second column are missing, as these are label rows.
            temp = temp[(~pd.isnull(temp.iloc[:,0]))&(~pd.isnull(temp.iloc[:,1]))&(~pd.isnull(temp.iloc[:,2]))]
            
            
            
            # assign year and region
            temp['year'] = year
            temp['region'] = region
            
            # Drop these too after parsing unicode.
            temp = temp[(temp.iloc[:,0].apply(unicode_remove)!="")&\
                        (temp.iloc[:,1].apply(unicode_remove)!="")]
            
            # Apply the clean column function.
            temp.columns = [clean_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_tables_liberty[region][year] = temp
            
clean_liberty_entries = {}
for region in liberty.keys():
    if region=="entries":
        for year in liberty["entries"].keys():
            print(year,region)
            temp = liberty[region][year]
            
            # Remove unicode characters
            temp = temp.applymap(unicode_remove).replace("",np.nan).replace("nan",np.nan)
            
            # Get rid of all null rows
            temp = temp[temp.count(axis=1)!=0]
            
            temp['year'] = year

            temp.columns = [clean_entries_column(x) for x in temp.columns]
            
            temp = temp.drop(list(temp.count()[temp.count()==0].index),axis=1)
            
            clean_liberty_entries[year] = temp

2015 region1
2016 region1
2015 region2
2016 region2
2015 region3
2016 region3
2015 region4
2016 region4
2015 region5
2016 region5
2015 entries
2016 entries


#### Yield Data

In [15]:
df_lib = pd.concat([pd.concat(clean_tables_liberty[reg],axis=0) for reg in \
                   ["region1","region2","region3","region4","region5"]],axis=0)

df_lib['type'] = "liberty"

df_rp = pd.concat([pd.concat(clean_tables_roundup[reg],axis=0) for reg in \
                   ["region1","region2","region3","region4","region5","region6","urbana"]],axis=0)

df_rp['type'] = "roundup"

df_cv = pd.concat([pd.concat(clean_tables[reg],axis=0) for reg in \
                   ["region1","region2","region3","region4","region5","region6","urbana"]],axis=0)

df_cv['type'] = "conventional"

df = pd.concat([df_cv,df_rp,df_lib],axis=0)

#### Entries

In [16]:
entries_cv = pd.concat([x for k,x in clean_cv_entries.items()],axis=0)
entries_cv = entries_cv[~pd.isnull(entries_cv['variety'])]
entries_cv = entries_cv[~pd.isnull(entries_cv['company'])]
entries_cv = entries_cv[~pd.isnull(entries_cv['maturity'])]
entries_cv['type'] = "conventional"

entries_rp = pd.concat([x for k,x in clean_roundup_entries.items()],axis=0)
entries_rp = entries_rp[~pd.isnull(entries_rp['variety'])]
entries_rp = entries_rp[~pd.isnull(entries_rp['company'])]
entries_rp = entries_rp[~pd.isnull(entries_rp['maturity'])]
entries_rp['type'] = "roundup"

entries_lib = pd.concat([x for k,x in clean_liberty_entries.items()],axis=0)
entries_lib = entries_lib[~pd.isnull(entries_lib['variety'])]
entries_lib = entries_lib[~pd.isnull(entries_lib['company'])]
entries_lib = entries_lib[~pd.isnull(entries_lib['maturity'])]
entries_lib['type'] = "liberty"

entries = pd.concat([entries_cv,entries_rp,entries_lib],axis=0)

## Data cleaning operations

Omitt these rows, as they have info on the names of the herbicide traits (but not any other info).

In [17]:
df = df[~pd.isnull(df['maturity'])]

### Cleaning the maturity date

In [18]:
df['maturity_date'] = df['maturity'].apply(lambda x: str(x).split(" ")[0])

def date_clean(x):
    if len(x['maturity_date'])<6:
        x['maturity_date'] = str(x['year']) +"-"+ str(x['maturity_date']).replace("/","-")
        return x
    else:
        return x
    
df = df.apply(lambda x: date_clean(x),axis=1)

df['maturity_date'] = df['year'] + df['maturity_date'].apply(lambda x: x[4:])

df['maturity_date'] = pd.to_datetime(df['maturity_date'],errors='coerce')

In [19]:
df.drop("maturity",axis=1,inplace=True)

### Wide to long conversion

In [20]:
id_vars = ['year','region','type','company','variety','mgroup']

df = df.reset_index().set_index(id_vars)

Yield data

In [21]:
y = df.filter(regex="_yield",axis=1)

y.columns = ["yield" + "".join(x.split("_yield")[1:]).replace("_","")+ "_" + x.split("_yield")[0] for x in y.columns]

y = y.apply(pd.to_numeric,axis=1,errors="coerce")

y = pd.wide_to_long(y.reset_index(),\
                stubnames = ["yield",'yield2yravg','yield3yravg'],\
                j='location',\
                sep="_",\
                i=id_vars,\
                suffix='\\w+')

y = y[y.count(axis=1)!=0]

Protein data

In [22]:
P = df.filter(regex="_protein",axis=1)

P.columns = ["protein"+ "_" + x.split("_protein")[0] for x in P.columns]

P = P.apply(pd.to_numeric,axis=1,errors="coerce")

P = pd.wide_to_long(P.reset_index(),\
                stubnames = ["protein"],\
                j='location',\
                sep="_",\
                i=id_vars,\
                suffix='\\w+')

P = P[P.count(axis=1)!=0]

Oil data

In [23]:
O = df.filter(regex="_oil",axis=1)

O.columns = ["oil"+ "_" + x.split("_oil")[0] for x in O.columns]

O = O.apply(pd.to_numeric,axis=1,errors="coerce")

O = pd.wide_to_long(O.reset_index(),\
                stubnames = ["oil"],\
                j='location',\
                sep="_",\
                i=id_vars,\
                suffix='\\w+')

O = O[O.count(axis=1)!=0]

Other columns

In [24]:
other_columns = \
list(df.filter(regex="_yield",axis=1).columns) + \
list(df.filter(regex="_protein",axis=1).columns) + \
list(df.filter(regex="_oil",axis=1).columns)

other_columns = list(set(df.columns) - set(other_columns))

other_columns.sort()

other = df[other_columns]

other = other.reset_index().set_index(id_vars)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,height,herbicide_trait,ist1,level_0,level_1,lodging,lodging05,lodging15,maturity_date,shattering,st1,st2
year,region,type,company,variety,mgroup,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1995,region1,conventional,CALLAHAN SEEDS,4195,MATURITY GROUP I,26,,,1995,8,1.1,,,1995-09-17,1,,
1995,region1,conventional,COLE GROWER SERVICE,DYNA-GRO 3180,MATURITY GROUP I,31,,,1995,9,1.9,,,1995-09-16,1,,
1995,region1,conventional,COLE GROWER SERVICE,DYNA-GRO 3195,MATURITY GROUP I,32,,,1995,10,2,,,1995-09-14,1,,
1995,region1,conventional,COLE GROWER SERVICE,DYNA-GRO UAPX 150,MATURITY GROUP I,28,,,1995,11,1.1,,,1995-09-13,1,,
1995,region1,conventional,MUIRHEAD FARMS,MF 198,MATURITY GROUP I,33,,,1995,12,1.2,,,1995-09-17,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,region5,liberty,Dyna-Gro,S45LL97,,43.834,,,2016,34,1.667,,,2016-10-03,,CC,
2016,region5,liberty,Hisoy,HS 47L50,,43.5,,,2016,35,1.667,,,2016-09-27,,ACC,
2016,region5,liberty,Hisoy,HS 49L50,,45.334,,,2016,36,1.667,,,2016-10-06,,CC,
2016,region5,liberty,Hoblit,457 LL,,40,,,2016,37,1,,,2016-10-04,,PRSLD,


Join them all together.

In [25]:
loc_data = y.join(P).join(O)

df_w = loc_data.reset_index().merge(other.reset_index(),how='outer',on=id_vars)

In [26]:
df_w.drop(["level_0","level_1"],axis=1,inplace=True)

### Cleaning the location names

In [27]:
loc_names = {
    "br":"brownstown",
    "be":"belleville",
    "de":"dekalb",
    "dw":"dwight",
    "mo":"monmouth",
    "ur":"urbana",
    "pe":"perry",
    "ds":"dixon_springs",
    "ca":"carbondale",
    "dixonsprings":"dixon_springs"
}

df_w['location'] = df_w['location'].replace(loc_names)

In [28]:
df_w['location'].value_counts().sort_index()

belleville        4411
brownstown        2342
carbondale        1537
dekalb            4479
dixon_springs     1381
dwight            5271
elkville          1666
erie              2927
fenton             381
goodfield         4275
harrisburg        1876
monmouth          6611
mt_morris         2152
mtmorris           582
new_berlin        4522
perry             6517
region           26106
st_peter          1838
urbana            6867
Name: location, dtype: int64

### Cleaning Variety Names

In [29]:
entries['variety'] = entries['variety'].str.replace("*","").str.upper().str.strip()

  entries['variety'] = entries['variety'].str.replace("*","").str.upper().str.strip()


### Cleaning the company names

In [30]:
punc_list2 = "'®\"/."
remv_punc2 = str.maketrans('','',punc_list2)

df_w['company_clean'] = df_w['company'].str.lower().str.strip()
df_w['company_clean'] = df_w['company_clean'].apply(lambda x: x.translate(remv_punc2))
df_w['company_clean'] = df_w['company_clean'].str.replace("&","and")
df_w['company_clean'] = df_w['company_clean'].str.replace("-","_")
df_w['company_clean'] = df_w['company_clean'].str.replace(" ","_")
df_w['company_clean'] = df_w['company_clean'].str.replace("___","_")

In [31]:
entries['company_clean'] = entries['company'].str.lower().str.strip()
entries['company_clean'] = entries['company_clean'].apply(lambda x: x.translate(remv_punc2))
entries['company_clean'] = entries['company_clean'].str.replace("&","and")
entries['company_clean'] = entries['company_clean'].str.replace("-","_")
entries['company_clean'] = entries['company_clean'].str.replace(" ","_")
entries['company_clean'] = entries['company_clean'].str.replace("___","_")

Company name replacement

In [32]:
company_names = {
    "public":"public_variety",
    "public_isu":"public_variety",
    "kruger":"kruger_seed_co",
    "adler":"adler_seeds",
    "ag_alumni":"ag_alumni_seed",
    "agratech_seed":"agratech",
    "agratech_seeds":"agratech",
    "agripro":"agripro_seeds",
    "asgrow":"asgrow_seed_co",
    "baird_seed_farm":"baird_seed_farms",
    "baker":"baker_seed_co",
    "beck":"becks_hybrids",
    "becks":"becks_hybrids",
    "becks_superior_hybrids":"becks_hybrids",
    "belleville_sd_hse":"belleville_seed_house",
    "bell_sd_hse":"belleville_seed_house",
    "brown_seed_enter":"brown_seed_enterprises",
    "brown":"brown_seed_enterprises",
    "cargill_hybrid_seeds":"cargill",
    "callahan":"callahan_seeds",
    "crows":"crows_hybrid_corn_co",
    "dairyland":"dairyland_seed_co",
    "dairyland_seed":"dairyland_seed_co",
    "dekalb":'dekalb_genetics_corp',
    "delta_and_pine_land":"delta_and_pine_land_co",
    "deltapine_seed":"deltapine",
    "delta_king":"delta_king_seed_co",
    "deraedt":"deraedt_seed_co",
    "dereadt":"deraedt_seed_co",
    "diener":"diener_seeds",
    "diener_bros":"diener_seeds",
    "dyna-gro":"dyna_gro",
    'emerge':"emerge_genetics",
    "excel":'excel_brand',
    "freeman":"freeman_seed_co", 
    "freedom":"freedom_seed_co",
    "garst":"garst_seed_co",
    "gateway":"gateway_seed_co",
    "good_buddy":"good_buddy_seeds",
    "gma":"gma_seed_co",
    "great_heart":"great_heart_seed",
    "great_lakes":"great_lakes_hybrids",
    "green_valley":"green_valley_seed",
    "green_valley_sd":"green_valley_seed",
    "griffith":"griffith_seed_co",
    "griffith_pure_line_seeds":"griffith_seed_co",
    "gutwein":"gutwein_seeds",
    "henkel":"henkel_seeds",
    "henkel_seed":"henkel_seeds",
    "hoblit":"hoblit_seed_co",
    "ici_garst":"ici_seeds",
    "illinois_pride":"illinois_pride_genetics",
    "jg_limited":"jgl",
    'kaltenberg':'kaltenberg_seeds',
    'kaltenberg_seed_farms':'kaltenberg_seeds',
    "kitchen":"kitchen_seed_co",
    "kruger":"kruger_seed_co",
    "laprairie_chatton":"laprairie_chatton_elevator",
    "laprairie":"laprairie_chatton_elevator",
    "latham":"latham_seed_co",
    "lewis":"lewis_hybrids",
    "mark":"mark_seed_co",
    "martin":"martin_seeds",
    'merschman':'merschman_seeds',
    "midwest_seed":"midwest_seed_genetics",
    "mw_seed_genetics":"midwest_seed_genetics",
    "midwest_seed_gen":"midwest_seed_genetics",
    "miles":"miles_farm_supply",
    "muirhead":"muirhead_farms",
    "missouri_premium":"missouri",
    "missouri_premium_var":"missouri",
    "msi_a":"msia",
    "mw_premium_gen":"midwest_premium_genetics",
    "midwest_premium_gen":"midwest_premium_genetics",
    'mycogen':"mycogen_seeds",
    'mycogen_plant_sciences':"mycogen_seeds",
    "nextgene":"nextgene_seed",
    "nk":"nk_brand",
    "p3":"p3_genetics",
    'patriot':'patriot_seed_co',
    'pioneer_hi_bred_intern':'pioneer',
    'pioneer_brand':'pioneer',
    'prairie':'prairie_brand_seed_co',
    'prairie_brand':'prairie_brand_seed_co',
    'prairie_heritage':'prairie_heritage_seeds',
    'prairie_stream':'prairie_stream_farms',
    'renk':'renk_seed_co',
    'renk_seed':'renk_seed_co',
    'roeschley':'roeschley_hybrids',
    'schultz':'schultz_seed_co',
    'sieben':'sieben_hybrids',
    'southern_states':'southern_states_coop',
    'stewart':'stewart_hybrids',
    'stine':'stine_seed_co',
    'stone':'stone_seed_group',
    "stone_seed":'stone_seed_group',
    'stone_seed_farms':'stone_seed_group',
    'sun_praire':'sun_prairie_seeds',
    'sun_prairie':'sun_prairie_seeds',
    'sunstar':'sunstar_hybrids',
    'terra':'terra_international',
    'trisler':'trisler_seed_farms',
    'trisler_trisoy':'trisler_seed_farms',
    'trisoy':'trisler_seed_farms',
    'twin_states':'twin_states_seed_co',
    'uap':'uap_seeds',
    'uap_seed':'uap_seeds',
    'viking':'viking_seed',
    'wilken':'wilken_seed_grains',
    "wildy":"wildy_seed"
}

In [33]:
df_w['company_name'] = df_w['company_clean'].replace(company_names)
entries['company_name'] = entries['company_clean'].replace(company_names)

In [34]:
df_w.drop("company_clean",axis=1,inplace=True)
entries.drop("company_clean",axis=1,inplace=True)

### Variety Matching

First find the ones that are not matched.

In [35]:
test = df_w.merge(entries[['company_name','variety','year','type']].drop_duplicates(),how='left',indicator="i")
test = test[test.year>"1996"]
ex = test[test.i=="left_only"]

In [37]:
df_w['variety_old'] = df_w['variety']

In [38]:
D = {}
for y in ex.year.unique():
    D[y] = {}
    for c in ex[(ex.year==y)]['company_name'].unique():
        D[y][c] = {}
        d1 = ex[(ex.company_name==c)&(ex.year==y)]
        d2 = entries[(entries.company_name==c)&(entries['year']==y)][['variety']]
        
        for var in d1['variety']:
            d2['score'] = [fuzz.partial_ratio(var,x) for x in d2['variety']]
            match = d2.sort_values("score")['variety'].iloc[-1]
        
            D[y][c][var]=match
            
        df_w.loc[(df_w.company_name==c)&(df_w.year==y),"variety"] = df_w['variety'].replace(D[y][c])

In [39]:
df_w['producer_nominated'] = df_w['variety_old'].str.contains("\*")

In [40]:
test = df_w.merge(entries[['company_name','variety','year','type']].drop_duplicates(),how='left',indicator="i")
test = test[test.year>"1996"]
ex = test[test.i=="left_only"]

In [41]:
ex1 = entries[(entries.year=="2011")&(entries.company_name=="public_variety")]

It made one mistake:

In [43]:
D['2011']['public_variety']['IA2097RR2Y*'] = 'IA2097RR2Y'

Correcting it:

In [44]:
df_w.loc[(df_w.year=="2011")&(df_w.company_name=="public_variety"),"variety"] = df_w['variety_old'].replace(D['2011']['public_variety'])

Making sure it worked:

In [45]:
test = df_w.merge(entries[['company_name','variety','year','type']].drop_duplicates(),how='left',indicator="i")
test = test[test.year>"1996"]
ex = test[test.i=="left_only"]

In [46]:
len(ex)

0

^ Make sure this is zero

##### Create a public variety id.

In [47]:
df_w['public_variety'] = df_w['company_name'] =="public_variety"

In [48]:
entries['public_variety'] = entries['company_name'] =="public_variety"

### Cleaning Maturity Group

In [49]:
df_w['mgroup'] = \
df_w.mgroup.str.replace(" III"," 3")\
           .str.replace(" II"," 2")\
            .str.replace(" IV"," 4")\
            .str.replace(" V"," 5")\
            .str.replace(" I"," 1")

In [50]:
mrange = df_w.mgroup.str.extract("(\d).\d-(\d).\d")

mrange_cat = mrange[0].astype(str)+mrange[1].astype(str)

df_w['maturity_range'] = mrange_cat.apply(lambda x: "".join(set(x))).replace("an",np.nan)

df_w['mlabel'] = df_w['mgroup'].str.extract("(\d)")

df_w.loc[pd.isnull(df_w.maturity_range),"maturity_range"] = df_w['mlabel']

In [51]:
df_w.drop("mlabel",axis=1,inplace=True)

In [52]:
entries['maturity'] = \
entries['maturity'].replace("III",3)\
                .replace("II",2)\
                .replace("I",1)\
                .replace("IV",4)\
                .replace("V",5)

### Lodging

In [57]:
df_w.loc[(pd.isnull(df_w.lodging))&(~pd.isnull(df_w.lodging15)),"lodging"] = df_w["lodging15"]

In [58]:
df_w['lodging'] = df_w.lodging.astype(float)

In [59]:
df_w = df_w.drop("lodging15",axis=1)

### Traits and Treatment

Combining the seed trait columns into one column.

In [60]:
df_w['seed_treat'] = df_w['st1']

df_w.loc[~pd.isnull(df_w.st2),"seed_treat"] = df_w.st2

df_w.loc[~pd.isnull(df_w.ist1),"seed_treat"] = df_w['ist1']

df_w['seed_treat'] = df_w['seed_treat'].replace("b","B").replace("na",np.nan)

In [61]:
df_w.drop(['st1','st2','ist1'],axis=1,inplace=True)

In [62]:
st1_replace =\
{"na":"No information available",
"U":"Untreated",
"ACC":"Acceleron®",
"ACCN":"Acceleron® + NitroShield®",
"ACCQ":"Acceleron + Cue",
"AMX":"ApronMaxx®",
"AMXV":"ApronMaxx® with Vibrance",
"AST":"Agrishield™ ST System Fungicide+Insecticide",
"AST+":"Agrishield™ ST System Fungicide+Insecticide+Nematicide",
"CC":"Clariva™ Complete Beans",
"CCM":"Clariva™ Complete Beans+Mertect",
"CMX":"CruiserMaxx® Beans",
"CMXO":"CruiserMaxx® Beans with Optimize®",
"CMXV":"CruiserMaxx® Beans with Vibrance®",
"CMXVI":"CruiserMaxx® Beans with Vibrance® plus Illevo®",
"EE":"EverGol™ Energy",
"EEG":"EverGol™ Energy plus Gaucho® 600",
"EEGI":"EverGol™ Energy plus Gaucho® 600 plus Illevo®",
"GIA":"Gaucho® 600 + Illevo®+ Allegiance® FL",
"INTS":"Intego™ Suite",
"PGP":"Profit Guard Plus",
"PV":"Poncho® Votivo®",
"PVI":"Poncho® Votivo® plus Illevo®",
"PVIEE":"Poncho® Votivo® plus Illevo® plus Evergol™ Energy",
"PRSLD":"PowerShield SDS",
"RAN":"Rancona®",
"SS":"SureStand™",
"B":"Insecticide + Fungicide",
"U":"Untreated",
"F":"Fungicide",
"A":"Acceleron",
"Be":"Fungicide + Insecticide + Illevo"}

herbtrait_replace = \
{"CV": "No Trait",  
"EN":"2,4-D, glufosinate and glyphosate", 
"LL": "glufosinate", 
"RF": "dicamba, glufosinate and glyphosate", 
"RL": "glufosinate and glyphosate", 
"RR": "glyphosate", 
"RX": "dicamba and glyphosate", 
"ST": "STS", 
"O":"Other"}

In [63]:
entries['seed_treat'] = entries['st'].str.strip()

entries = entries.reset_index(drop=True)

entries.loc[~pd.isnull(entries['ist']),"seed_treat"] = entries['ist'].str.strip()

entries['seed_treat'] = entries['seed_treat'].replace("b","B").replace("na",np.nan)

entries.loc[entries['entry_info'].fillna("").str.contains(",T"),"seed_treat"] = "T"
entries.loc[entries['entry_info'].fillna("").str.contains(",U"),"seed_treat"] = "U"

entries.drop(['st','ist'],axis=1,inplace=True)

### Conventional versus GE

Labeling which seeds are GE and which are conventional

In [65]:
# First label True if its in Liberty or Roundup trials
entries['GE'] = entries['type'] != "conventional"

# Second label true if the "herb trait" is not "CV" for the years after the trials combined
entries.loc[(entries.herb_trait!="CV")&(~pd.isnull(entries.herb_trait)),"GE"] = True

# Third label true if certain letters are at the end of the variety.
# Identified using this regular expression:
ex = "RR$|RR2Y$|RR$|R2$|NRR$|LL$|L[:digit:]{2}$|\\ L$| CA$|^AG"

entries['regex_findall'] = [True if len(re.findall(ex,x))>0 
                                 else False 
                                    for x in list(entries['variety'])]

entries.loc[entries['regex_findall'],"GE"] = True

### Weather Data

In [67]:
loc_to_county = \
      {"dixon_springs" : 17151,
        "dekalb"       : 17037,
        "dwight"       : 17105,
        "erie"         : 17195,
        "mt_morris"    : 17141,
        "mtmorris"     : 17141,
        "monmouth"     : 17187,
        "urbana"       : 17019,
        "goodfield"    : 17203,
        "perry"        : 17149,
        "new_berlin"   : 17167,
        "belleville"   : 17163,
        "brownstown"   : 17051,
        "st_peter"     : 17051,
        "carbondale"   : 17077,
        "harrisburg"   : 17165,
        "elkville"     : 17077,
        "region"       : np.nan}

gdd = pd.read_csv("../data/PRISM/gdd_uiuc.csv")
ppt = pd.read_csv("../data/PRISM/precip_uiuc.csv")

df_w['year'] = df_w['year'].astype(int)

df_w['loc_fips'] = df_w['location'].replace(loc_to_county)

df_w = df_w.merge(gdd.rename(columns={"stco":"loc_fips"}),how='left')
df_w = df_w.merge(ppt.rename(columns={"stco":"loc_fips"}),how='left')

## Merging entries to results

In [69]:
E = entries[['company_name','variety','type','year','maturity','sn','color','sb','prr','herb_trait','seed_treat',"GE"]]

In [70]:
E['year'] = E['year'].astype(int)

df_w['year'] = df_w['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  E['year'] = E['year'].astype(int)


In [71]:
df_w = df_w.merge(E,how='left',on=['company_name','variety','type','year'],suffixes=["","_entry"])

## Data to csv

In [72]:
df_w.to_csv("../data/output_data/soybeans_95_20.csv",index=False)

In [73]:
entries.to_csv("../data/output_data/entries_97_20.csv",index=False)