# Example use of ctlookup to clean CT place names

ctlookup translates Connecticut village names to their proper town names.

### Start with some sample data

In [1]:
import pandas as pd
from ctlookup import lookup

def sample_df():
    """Build the small demo table used by the examples in this notebook.

    Returns:
        A newly constructed pandas DataFrame on every call, with the columns
        TOWN, POP and GDP (in that order) and four rows.
    """
    column_names = ["TOWN", "POP", "GDP"]
    rows = [
        ["New Preston", 1, 2],
        ["Marbledale", 110, 23],
        ["Bridgeport", 100, 97],
        ["B_water", 12, 44],
    ]
    # Rows are passed as a list of lists (not a dict) so the column order is
    # fixed by `columns` regardless of Python version.
    return pd.DataFrame(rows, columns=column_names)

# Build the sample table; the bare name on the last line makes the notebook
# display it as the cell's output.
tmpdf = sample_df()
tmpdf

Unnamed: 0,TOWN,POP,GDP
0,New Preston,1,2
1,Marbledale,110,23
2,Bridgeport,100,97
3,B_water,12,44


### Create a lookup instance

No parameters are required unless you want to force a re-download of the raw lookup table, or you want to use a completely different spreadsheet to clean some type of content other than CT place names.

In [2]:
# Force re-download of lookup table. Slower and requires internet connection.
# Shown only to demonstrate the option: this instance is discarded as soon as
# `l` is rebound by the standard constructor call below.
l = lookup.Lookup(use_inet_csv=True)

# Standard usage: no arguments. This is the instance used by the cells that follow.
l = lookup.Lookup()

### Use clean() to clean a single place name

In [3]:
# This one works: a Connecticut village name is resolved to its town.
# print(...) with one parenthesised argument runs on both Python 2 and Python 3.
print("'New Preston' comes out as: " + str(l.clean("New Preston")))
# This one doesn't: no match is found, so clean() gives back None.
# The argument now matches the name shown in the printed label.
print("'New Orleans' comes out as: " + str(l.clean("New Orleans")))

'New Preston' comes out as: WASHINGTON
'New Orleans' comes out as: None


### Use clean_col() to clean a pandas Series of place names


In [4]:
# Pass the whole TOWN column (a pandas Series) through the lookup. The result
# is the cell's last expression, so the notebook displays it.
l.clean_col(tmpdf["TOWN"])

0    WASHINGTON
1    WASHINGTON
2    BRIDGEPORT
3          None
Name: TOWN, dtype: object

### DEPRECATED - Use clean_dataframe() to add a new column of real place names

This function is deprecated in favor of clean_col, but I'm leaving it in for compatibility.

In [5]:
# clean_dataframe() hands back the cleaned values for the named column; store
# them in a new TOWN_REAL column so the original TOWN values stay alongside.
# (A single call is enough: a call whose result is discarded only repeats the
# lookup work.)
tmpdf["TOWN_REAL"] = l.clean_dataframe(tmpdf, "TOWN")
tmpdf

Unnamed: 0,TOWN,POP,GDP,TOWN_REAL
0,New Preston,1,2,WASHINGTON
1,Marbledale,110,23,WASHINGTON
2,Bridgeport,100,97,BRIDGEPORT
3,B_water,12,44,


### Or you can just overwrite the original column

In [6]:
# Replace in place: start again from fresh sample data, then overwrite the
# TOWN column with the cleaned values (one call; the result is assigned directly).
tmpdf = sample_df()

tmpdf["TOWN"] = l.clean_dataframe(tmpdf, "TOWN")
tmpdf

Unnamed: 0,TOWN,POP,GDP
0,WASHINGTON,1,2
1,WASHINGTON,110,23
2,BRIDGEPORT,100,97
3,,12,44


### Use the error parameter to set the value used when no match is found

In [7]:
# Custom error value.
# Start from fresh sample data so this cell does not depend on earlier column edits.
tmpdf = sample_df()

# `error` is the value stored for names that have no match in the lookup.
tmpdf["TOWN_REAL"] = l.clean_dataframe(tmpdf, "TOWN", error="NOT FOUND")
tmpdf

Unnamed: 0,TOWN,POP,GDP,TOWN_REAL
0,New Preston,1,2,WASHINGTON
1,Marbledale,110,23,WASHINGTON
2,Bridgeport,100,97,BRIDGEPORT
3,B_water,12,44,NOT FOUND
