# Part 0: Libraries:

In [1]:
from collections import OrderedDict
from matplotlib import pyplot as plt
import shapefile
import geopandas as gpd
import pandas as pd
import numpy as np
import requests
import us
import json
import re
%matplotlib inline

# Part 1: Functions:

In [40]:
#Function #1:
def get_county(row):
    """
    INPUT: row (series; row of data)
    OUTPUT: county (str), state (str)
    OVERVIEW: FROM FCC 
    """ 
    lat, lon = row['LATITUDE'],row['LONGITUDE']
    loc_point = gpd.geoseries.Point(lon, lat)
    return loc_point

#Function #2: 
def generates_county_into(state_alphabets): 
    return us.states.lookup(state_alphabets).shapefile_urls('county')

# Part 2: Load/Clean Data:

In [41]:
#1. Descriptions of weather data:

#Historical weather (precipitation/air temperature)
#data (1970~2014) of top 10 agriculture states from NOAA.

#features: 
#EMXP (Extreme maximum daily precipitation)
#MXSD (Maximum snow depth)
#DSNW (Number days with snow depth > 1 inch) 
#TPCP (Total precipitation)
#TSNW (Total snow fall)
#EMXT (Extreme maximum daily temperature)
#EMNT (Extreme minimum daily temperature)
#MMXT (Monthly mean maximum temperature)
#MMNT (Montly mean minimum temperature) 
#MNTM (Monthly mean temperature)

In [4]:
#2: Declaring Variables:
start_year, end_year = 1970, 2014
missing_yield_years = [1982, 1984, 1985]
#Targeting 10 top agriculture states (based on economic output)
#Source: http://www.ers.usda.gov/faqs.aspx (top 10 agriculture states)
targest_states_lower = ["California", "Iowa", "Texas", "Nebraska", "Illinois",\
                  "Minnesota", "Kansas", "Indiana", "North Carolina", "Wisconsin"]
#dataframe to hold all info: 
#yield_master_df = pd.DataFrame()
targest_states_upper = [state.upper() for state in targest_states_lower]
weather_master_df = pd.DataFrame()
#dict of all the maximum number of files a state have on weather: 
weather_data_dict = {'California':16, 'Iowa':5, 'Texas':16, 'Nebraska':6, 'Illinois':6,\
                    'Minnesota':5, 'Kansas':6, 'Indiana':6, 'North Carolina':5, 'Wisconsin':4}

#Google API key for looking up zipcode from long and lat: 
google_api_key = r'AIzaSyBzA6v0m3Jizdxmu--wQH49WHH0UH17RR0'

#state alphabets abbreviations: 
state_alphabet = ['NC', 'NE', 'TX', 'MN', 'IN', 'WI', 'IA', 'KS', 'IL', 'CA']
county_into_urls = []

In [5]:
#3: Load csv files and combine them into master dataframe:  
#for state in targest_states_lower: 
#    state_lower = state.lower()
#    for file_num in xrange(1, weather_data_dict[state]+1):
#        filename = 'weather_{state}_{file_num}.csv'.format(state=state_lower, file_num=file_num)
#        year_df = pd.read_csv("/Users/Hsieh/Desktop/persephone/Data/Weather/{state}/{filename}".format(state=state, filename=filename))     
#        weather_master_df = weather_master_df.append([year_df])

In [6]:
#4: reordering master_df:

#weather_master_df.reset_index(inplace=True)
#back up copy:
#master_df_copy = master_df.copy()
#weather_master_df.to_csv('/Users/Hsieh/Desktop/persephone/Data/raw_master_weather.csv')
weather_master_df = pd.read_csv('/Users/Hsieh/Desktop/persephone/Data/raw_master_weather.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [42]:
#weather_master_df.columns 

In [8]:
#3: Refine dataframe: 
#i: declare list of columns type:
excess_columns = ["Unnamed: 0","index",'Measurement Flag',"Measurement Flag.1","Measurement Flag.2",\
                  "Measurement Flag.3","Measurement Flag.4","Measurement Flag.5","Measurement Flag.6",\
                  "Measurement Flag.7","Measurement Flag.8","Measurement Flag.9","Number of Days","Number of Days.1",\
                  "Number of Days.2","Number of Days.3","Number of Days.4","Number of Days.5","Number of Days.6","Number of Days.7",\
                  "Number of Days.8","Number of Days.9","Quality Flag","Quality Flag.1","Quality Flag.2","Quality Flag.3","Quality Flag.4",\
                  "Quality Flag.5","Quality Flag.6","Quality Flag.7","Quality Flag.8","Quality Flag.9","Units",'Units.1',\
                  "Units.2","Units.3","Units.4","Units.5","Units.6","Units.7","Units.8","Units.9"]
key_columns = ['DATE','LATITUDE','LONGITUDE',]
feature_columns = ['DSNW','EMNT','EMXP','EMXT','MMNT','MMXT','MNTM','MXSD','TPCP','TSNW']
other_columns = ['ELEVATION','STATION','STATION_NAME']
#ii) filter out unnecessary columns:
weather_model_df = weather_master_df.filter(key_columns+feature_columns,axis=1)

In [9]:
#4: Add year column: 
weather_model_df['YEAR'] = weather_model_df['DATE'].apply(lambda x: int(str(x)[0:4]))

In [46]:
#5: clean dataframe: 

#weather_model_df.shape
#weather_model_df_copy = weather_model_df.copy()
#weather_model_df = weather_model_df_copyv
#weather_model_df = weather_model_df_copy
#weather_model_df.info()
#weather_model_df = weather_model_df[weather_model_df["LATITUDE"]!="unknown"]
#weather_model_df = weather_model_df[weather_model_df["LONGITUDE"]!="unknown"]
#weather_model_df = weather_model_df[weather_model_df["LONGITUDE"].notnull()==True]
#check = weather_model_df['LONGITUDE'].apply(lambda x: isinstance(x, float))
#check.sum()
#weather_model_df['LATITUDE'] = weather_model_df['LATITUDE'].apply(lambda x: float(x))
#weather_model_df['LONGITUDE'] = weather_model_df['LONGITUDE'].apply(lambda x: float(x))

In [50]:
#6: map latitude, longtitude to county: 

#gpd_file = gpd.read_file("/Users/Hsieh/Desktop/persephone/Data/uscounties.geojson")
#geo_series = weather_model_df.apply(get_county, axis=1)
#gpd_df = gpd.GeoDataFrame(geometry=geo_series)
#counties_df = gpd.sjoin(gpd_df, gpd_file, op="within")
#counties_df.columns
#weather_model_df['COUNTY'] = counties_df['name']
#weather_model_df['STATE'] = counties_df['state_name']
#weather_master_df.to_csv('/Users/Hsieh/Desktop/persephone/Data/clean_master_weather.csv')
weather_master_df = pd.read_csv("/Users/Hsieh/Desktop/persephone/Data/clean_master_weather.csv")

  interactivity=interactivity, compiler=compiler, result=result)


# Archive Code: 

In [None]:
#1: function: 
"""
def get_county(lat, lon, api_key):
    INPUT: lat (float; lattitude); lon (float; longtitude); api_key (str)
    OUTPUT: county (str; county name), state (str; state name)
    
    GOOGLE_API_GEO_FORMAT = r'https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}'
    query = GOOGLE_API_GEO_FORMAT.format(lat, lon, api_key)
    response = requests.get(query)

    address = response.json()['results'][0]['formatted_address']
    state = re.findall(r'\w*, \w{2} ', address)[0].split(', ')[1]
    
    return state 
"""

#2: function:
"""
def iterate_rows(lat, lon, key):
    print lat, lon
    return get_county(lat, lon, key)
"""
""""
#3: function:
def check_lat_column(x, options):
    if type(x) != str:
        pass
    elif x[0].isdigit():
        pass
    else:
        options.add(x)
options = set([])
weather_model_df['LATITUDE'].apply(check_lat_column, args = (options,))
"""

#3: 
"""
def get_county(row, api_key):
    INPUT: row (series; row of data); api_key (str)
    OUTPUT: county_address (str; a
    ddresss of county )
     
    GOOGLE_API_GEO_FORMAT = r'https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}'
    lat, lon = row['LATITUDE'],row['LONGITUDE']
    query = GOOGLE_API_GEO_FORMAT.format(lat, lon, api_key)
    response = requests.get(query)
    print response
    print response.json()
    
    address = response.json()['results'][0]['formatted_address']
    zip_code = re.findall(r'\d{5}', address)[0]
    state = re.findall(r'\w*, \w{2} ', address)[0].split(', ')[1]
    zip_code+","+state
"""

# Testing: 