### Import USGS use data & Create usage table
Here we download the raw usage data for years 2000, 2005, and 2010 from the USGS usage site and synthesize all data into a flat table listing: year, FIPS code, and total annual withdrawals by sector in MGal/day (*so these need to be ajdusted for yearly sums!)

The steps required include:
* Pulling the raw data file, in tab-delimted format, from the USGS server into a pandas dataframe.
* Adjusting field names to a standard nomenclature:
    * Adding year columns to 2000 and 2005 datasets.
    * For year 2000, remapping 'IT', 'LA', 'LS', and 'PE' fields to 'IC', 'AQ', 'LI' and 'PC', respectively([reference]( https://water.usgs.gov/watuse/data/2000/datadict.html)).
        * IT -> IC (Irrigated cropland)
        * LA -> AQ (Aquaculture)
        * LS -> LI (Livestock)
        * PE -> PC (Thermoelectric power closed-loop)
    * For year 2005, remapping 'LA' and 'LS' fields to 'AQ' and 'LI', respectively([reference]( https://water.usgs.gov/watuse/data/2005/datadict.html)). 
        * LA -> AQ (Aquaculture)
        * LS -> LI (Livestock)
* Re-arranging data into a tidy format to facilitate additional analyses.
    * Care must be taken that the FIPS codes are preserved as text, not numbers
* Appending all tables into a single dataframe, with records tagged by the year of the dataset.
* Removing extraneous fields, keeping only the sector total columns:
    
The resulting table will be formatted as follows:

| YEAR | FIPS | UseClass | SourceClass | SourceType | Amount |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | 
| 2000 | 01001 | PublicSupply | Surface  | Fresh |0.00 |
| 2000 | 01001 | Industrial | Ground  | Saline |0.00 |

This table can then be summarized and joined, by YEAR and FIPS code, to other accounting data tables and summarized by state. 


In [1]:
#Import modules required for analysis
import os
import pandas as pd
import numpy as np

In [2]:
#Create prefix remapping dictionaries to standardize column names for all years to 2010
remapDict = {'IT-':'IC-', #Irrigated cropland
             'LA-':'AQ-', #Aquaculture
             'LS-':'LI-', #Livestock
             'PE-':'PC-', #Closed-loop thermo electric
             'Wtotl':'WTotl' #Capitalization mismatch
            }

In [7]:
#Create a function for importing a table and standardizing all field names
def importTable(year, remapDict):
    
    #Set the download URL using with the year
    theURL = 'http://water.usgs.gov/watuse/data/{0}/usco{0}.txt'.format(year) 
    
    #Download the dataset to a data frame (keeping the FIPS attribute as a string)
    df = pd.read_table(theURL.format(year),dtype={'FIPS':str})
    
    #Remove the STATE, STATEFIPS and COUNTYFIPS columns (Not needed)
    df.drop("STATE",axis=1,inplace=True)
    df.drop("STATEFIPS",axis=1,inplace=True)
    df.drop("COUNTYFIPS",axis=1,inplace=True)
    
    #Use the remap dictionary to rename columns
    
    #Get the current column names as a list
    colNames = df.columns.values.tolist()
    
    for inFld,outFld in remapDict.items():
        #This loops through each item in colNames and replaces it with a revised one
        colNames_update = [x.replace(inFld,outFld) for x in colNames]
        colNames = colNames_update

    #Update the column names in the data frame
    df.columns = colNames
    
    #Add year field, if not present
    if "YEAR" not in df.columns.values: 
        df.insert(1,"YEAR",year)
        
    #Remove unnamed columns
    if "Unnamed" in df.columns.values[-1]:
        df.drop(df.columns.values[-1],axis=1,inplace=True)
        
    #Return the data frame
    return df

In [25]:
#Get the tables
df2000 = importTable(2000,remapDict)
df2005 = importTable(2005,remapDict)
df2010 = importTable(2010,remapDict)
print df2000.shape, df2005.shape, df2010.shape

(3222, 68) (3224, 105) (3224, 114)


In [26]:
#"Melt" data so each use/class/type becomes a new row
df2000m = pd.melt(df2000,id_vars=['FIPS','YEAR'],var_name='Class',value_name='Amount')
df2005m = pd.melt(df2005,id_vars=['FIPS','YEAR'],var_name='Class',value_name='Amount')
df2010m = pd.melt(df2010,id_vars=['FIPS','YEAR'],var_name='Class',value_name='Amount')

In [27]:
df2000m.shape[0], df2005.shape[0],df2010.shape[0]

(212652, 3224, 3224)

In [44]:
print df2000m.size, (df2000m.shape[0] )
print df2005m.size, df2005m.shape[0] 
print df2010m.size, df2010m.shape[0] 

850608 212652
1328288 332072
1444352 361088


In [46]:
#Merge all records into a single table
dfUse = pd.concat([df2000m, df2005m, df2010m],ignore_index=True)

0

In [None]:
df2000.columns.values

In [None]:
#Create columns to hold use class, source class, and source type
df2000x = df2000.assign(UseClass='',SourceClass='',SourceType='')
df2000x.columns.values

In [None]:
#

In [None]:
#Set the output location and filename
dataDir = '../../Data'
outFN = 'AllUsageData.csv'

In [None]:
#Define the function to retrieve data and fix field name issues
def getData(year):
    print "Retrieving data for {}".format(year)
    
    #Create a dictionary of proper dtypes (avoids converting FIPS to numeric types)
    formatDict = {'STATEFIPS':str,'COUNTYFIPS':str,'FIPS':str}

    #Set the base URL to be used to get any annual dataset
    theBaseURL = 'http://water.usgs.gov/watuse/data/{0}/usco{0}.txt' 
    
    #Retrieve the data
    df = pd.read_table(theBaseURL.format(year),dtype=formatDict)
    
    #Remap the IT-, LA-, and LS- columns to IC-, AQ-, and LI-
    fldsIn = ('IT-','LA-','LS-','PE','Wtotl')
    fldsOut = ('IC-','AQ-','LI-','PC','WTotl')

    #Get the current column names as a list
    colNames = df.columns.values.tolist()

    #Create a revised column name list
    for inFld,outFld in zip(fldsIn,fldsOut):
        #This loops through each item in colNames and replaces it with a revised one
        colNames_update = [x.replace(inFld,outFld) for x in colNames]
        colNames = colNames_update

    #Update the column names in the data frame
    df.columns = colNames
    
    #Add year field, if not present
    if "YEAR" not in df.columns.values: 
        df.insert(4,"YEAR",year)
        
    #Remove unnamed columns
    if "Unnamed" in df.columns.values[-1]:
        df.drop(df.columns.values[-1],axis=1,inplace=True)
        
    #Return the updated dataframe
    return df

In [None]:
#Retrieve the data into a dataframe
df2000 = getData(2000)
df2005 = getData(2005)
df2010 = getData(2010)

In [None]:
#Merge the tables into one
df = pd.concat([df2000, df2005,df2010])

In [None]:
#Check that the final merge contains all yearly columns
s00 = set(df2000.columns.values.tolist())
s05 = set(df2005.columns.values.tolist())
s10 = set(df2010.columns.values.tolist())
sXX = set(df.columns.values.tolist())
print s00.issubset(sXX)
print s05.issubset(sXX)
print s10.issubset(sXX)

In [None]:
#Subset and rename columns (FRESH WATER ONLY)

#Create lists of columns to keep and names to assign
useFields = ['YEAR','FIPS','PS-WFrTo','DO-WFrTo','IN-WTotl','IR-WFrTo','LI-WFrTo','AQ-WFrTo','MI-WFrTo','PT-WFrTo','TO-WFrTo']
useNames = ['YEAR','FIPS','Public','Domestic','Industry','Irrigation','Livestock','Aquaculture','Mining','Thermoelectic','TOTAL']

#Subset columns in the list
dfUsage = df[useFields]

#Rename the columns
dfUsage.columns = useNames

In [None]:
#Fix FIPS column
dfUsage['FIPS'] = df['FIPS'].apply(lambda x: str(x).zfill(5))

In [None]:
#Save as a CSV file
dfUsage.to_csv(outFN,index_label="KEY")

In [None]:
#Example of extracting state records, NC = FIPS 37
fips = '37'
fipsFilter = dfUsage['FIPS'].str.startswith(fips,na=False)