In [28]:
import re
import pandas as pd
import numpy as np

In [35]:
def extract_city_and_state(city_state_str):
    """Split a ``"CITY, STATE"`` label into its city and state parts.

    Parameters
    ----------
    city_state_str : str
        Port description, e.g. ``"ALCAN, AK"``.

    Returns
    -------
    tuple
        ``(city, state)`` with surrounding whitespace removed from each part.
        ``state`` is ``np.nan`` when the label contains no comma. When the
        label contains more than one comma, only the first two
        comma-separated fields are used.
    """
    if "," in city_state_str:
        fields = city_state_str.split(",")
        return fields[0].strip(), fields[1].strip()
    # Strip here as well, so labels without a state are cleaned the same way
    # as the city part of a "CITY, STATE" label.
    return city_state_str.strip(), np.nan

def extract_ports_and_cities(file):
    """Parse the I94PORT section of a labels file into three parallel lists.

    The section starts at the first line containing ``"I94PORT"`` and ends at
    the first line, from there on, that contains ``";"``. Within the section,
    every line containing ``"="`` is read as ``<port code> = <city[, state]>``.

    Parameters
    ----------
    file : iterable of str
        Lines of the labels file, e.g. an open text file handle.

    Returns
    -------
    tuple of list
        ``(i94port_list, city_list, state_list)``, index-aligned. A state is
        ``np.nan`` when the label has no comma (see ``extract_city_and_state``).
    """
    # Keep only letters, digits, parentheses, commas and spaces.
    unwanted_chars = r'[^a-zA-Z0-9(), ]'

    i94port_list = []
    city_list = []
    state_list = []
    processing_i94port = False

    # Iterate over the argument itself; reading a notebook-level variable here
    # would make the function depend on hidden kernel state.
    for line in file:
        if "I94PORT" in line:
            processing_i94port = True
        if not processing_i94port:
            continue

        if "=" in line:
            fields = line.split("=")
            # The pattern keeps spaces, so strip the padding that surrounds
            # the quoted values after the other characters are removed.
            i94port = re.sub(unwanted_chars, '', fields[0]).strip()
            city_w_state = re.sub(unwanted_chars, '', fields[1]).strip()
            city, state = extract_city_and_state(city_w_state)
            i94port_list.append(i94port)
            city_list.append(city)
            state_list.append(state)

        # Checked after the "=" handling so that a final entry sharing its
        # line with the terminating ";" is still recorded.
        if ";" in line:
            break

    return i94port_list, city_list, state_list

In [36]:
# Open the labels file in a context manager so the handle is closed as soon
# as parsing finishes instead of staying open for the life of the kernel.
with open("../I94_SAS_Labels_Descriptions.SAS", "r") as txt_file:
    i94ports, cities, states = extract_ports_and_cities(txt_file)

In [37]:
# Assemble the port lookup table from the three index-aligned lists.
port_table_columns = ['i94 Port', 'City', 'States']
data = dict(zip(port_table_columns, [i94ports, cities, states]))

i94ports_df = pd.DataFrame(data, columns=port_table_columns)

In [38]:
# Preview the first rows of the port lookup table.
i94ports_df.head()

Unnamed: 0,i94 Port,City,States
0,ALC,ALCAN,AK
1,ANC,ANCHORAGE,AK
2,BAR,BAKER AAF BAKER ISLAND,AK
3,DAC,DALTONS CACHE,AK
4,PIZ,DEW STATION PT LAY DEW,AK


In [41]:
# (rows, columns) of the port lookup table.
i94ports_df.shape

(660, 3)

In [40]:
# Show the ports whose label carried no state part.
missing_state_mask = i94ports_df["States"].isna()
i94ports_df[missing_state_mask]

Unnamed: 0,i94 Port,City,States
28,MAP,MARIPOSA AZ,
76,WAS,WASHINGTON DC,
516,XXX,NOT REPORTEDUNKNOWN,
517,888,UNIDENTIFED AIR SEAPORT,
518,UNK,UNKNOWN POE,
...,...,...,...
655,ADU,No PORT Code (ADU),
656,AKT,No PORT Code (AKT),
657,LIT,No PORT Code (LIT),
658,A2A,No PORT Code (A2A),


In [39]:
# Save the port lookup table; index=False leaves the row index out of the CSV.
i94ports_df.to_csv("../sample_data/i94ports_to_cities.csv", index=False)

## Clean US Cities Demographics Table

In [14]:
# Read with pandas' default comma separator: the file is ";"-delimited, so
# every row lands in one packed column that the cells below split apart.
demographics_csv_path = "../sample_data/us-cities-demographics.csv"
us_cities_demog = pd.read_csv(demographics_csv_path)
us_cities_demog.head()

Unnamed: 0,City;State;Median Age;Male Population;Female Population;Total Population;Number of Veterans;Foreign-born;Average Household Size;State Code;Race;Count
0,Silver Spring;Maryland;33.8;40601;41862;82463;...
1,Quincy;Massachusetts;41.0;44129;49500;93629;41...
2,Hoover;Alabama;38.5;38040;46799;84839;4819;822...
3,Rancho Cucamonga;California;34.5;88127;87105;1...
4,Newark;New Jersey;34.6;138040;143873;281913;58...


In [15]:
# The lone column label holds every real column name, joined by ";".
packed_header = us_cities_demog.columns[0]
columns = packed_header.split(";")
columns

['City',
 'State',
 'Median Age',
 'Male Population',
 'Female Population',
 'Total Population',
 'Number of Veterans',
 'Foreign-born',
 'Average Household Size',
 'State Code',
 'Race',
 'Count']

In [16]:
def clean_cities_table(cities_df):
    """Expand a single ";"-packed column into one column per field.

    The label of the first column of ``cities_df`` is read as a ";"-joined
    list of column names, and each value in that column as the matching
    ";"-joined list of field values.

    Parameters
    ----------
    cities_df : pandas.DataFrame
        Frame whose first column holds the packed values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``cities_df`` with one column per name in the packed header
        added and the packed column dropped. Field values are left as
        strings. A row with fewer fields than the header gets missing values
        in its trailing columns; fields beyond the header are ignored.
    """
    new_cities_df = cities_df.copy()
    original_col = new_cities_df.columns[0]
    # Take the names from the frame's own header rather than from a
    # notebook-level variable, so the function works on any frame it is given.
    column_names = original_col.split(";")

    # Split each row once, then align the pieces to the header width.
    split_fields = (
        new_cities_df[original_col]
        .str.split(";", expand=True)
        .reindex(columns=range(len(column_names)))
    )
    for position, name in enumerate(column_names):
        new_cities_df[name] = split_fields[position]

    return new_cities_df.drop(columns=[original_col])

In [17]:
# Split the packed demographics column into real columns and preview the result.
new_us_cities_demog = clean_cities_table(us_cities_demog)
new_us_cities_demog.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402


In [19]:
# Save the cleaned demographics table; index=False leaves the row index out of the CSV.
new_us_cities_demog.to_csv("../sample_data/new_us_cities_demographics.csv", index=False)