In [1]:
import os

import pandas as pd

from demyst.analytics import Analytics

In [2]:
# Build the Demyst client used for validation below.
# The API key is read from the environment instead of being committed with the
# notebook; DEMYST_API_KEY is this notebook's own naming convention, so export
# it before starting the kernel (a missing variable raises KeyError here).
# NOTE(review): the key that was previously hardcoded in this cell has been
# exposed in the notebook history and should be rotated.
analytics = Analytics(key=os.environ["DEMYST_API_KEY"])

# Sub-task 1

Analyze and clean the input file using Python/Pandas in a Jupyter Notebook. 

## 1. Load CSV file into memory

In [3]:
# Read the raw address file: comma-separated fields, decoded with the
# 'unicode_escape' codec.
input_data = pd.read_csv(
    "../data/input_file.csv",
    sep=",",
    encoding="unicode_escape",
)

In [4]:
# Preview ten randomly chosen rows of the raw data
input_data.sample(n=10)

Unnamed: 0,street,state,city,postcode,safety_flag
8240,1280 Market St,TN,Chattanooga,37402-2732,False
1421,4415 Lincoln LN NW,MN,Rochester,55901,False
9991,215 2nd Street Floor 3,CA,San Francisco,94105,False
9152,12391 US Hwy 61,MO,New Madrid,63869,False
6533,2933 Augusta St,CA,San Luis Obispo,93401,False
10089,20 West 22nd St Suite 906,NY,New York,10010,False
9158,119 Wilmont Ct,NY,Hopewell Junction,12533-6372,False
5966,1125 Hillside Lane,CO,Louisville,80027-2909,False
9025,1215 Zeno Lane,PA,Allison Park,15101,False
5161,461 springfield ave,NJ,pennsauken,8053,False


## 2. Get a quick overview of the dataset 

In [5]:
# Summarise column names, non-null counts and dtypes to spot missing values.
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11229 entries, 0 to 11228
Data columns (total 5 columns):
street         11229 non-null object
state          11220 non-null object
city           11220 non-null object
postcode       11220 non-null object
safety_flag    11229 non-null bool
dtypes: bool(1), object(4)
memory usage: 362.0+ KB


## 3. Detect and deal with duplicates

In [6]:
# Collect every row that has an exact copy elsewhere in the frame;
# keep=False flags all occurrences, not only the repeats.
df_dup = input_data[input_data.duplicated(keep=False)].copy()

# Inspect the duplicated rows
df_dup

Unnamed: 0,street,state,city,postcode,safety_flag
399,5847 Morningside Ave,TX,Dallas,75206-5919,True
1906,257 15th St,NY,Brooklyn,11215-8700,False
1907,257 15th St,NY,Brooklyn,11215-8700,False
2476,2510 Cinnamon Ridge Ct,OH,Miamisburg,45342-5209,False
2477,2510 Cinnamon Ridge Ct,OH,Miamisburg,45342-5209,False
2483,13028 Chesney Dr,IN,Fishers,46037-7253,False
2633,28216 Rosewood St,MI,Inkster,48141-1753,False
2833,14836 Kentucky Street,MI,Detroit,48238,True
2904,10028 Ives Loop,FL,Hudson,34667-0013,False
3196,14836 Kentucky Street,MI,Detroit,48238,True


In [7]:
# Record the (rows, columns) shape before de-duplication for comparison.
print('Shape with duplicates: ', input_data.shape)

Shape with duplicates:  (11229, 5)


In [8]:
# Build a de-duplicated frame (first occurrence of each row is kept) without
# touching input_data, then report its shape.
df_drop_dup = input_data.drop_duplicates()
print('Shape without duplicates: ', df_drop_dup.shape)

Shape without duplicates:  (11209, 5)


> Missing values must be handled according to each column's validation rules, so we will deal with them based on the validation result below.

## 4. Validate the input data before enrichment

In [9]:
# Ask the validator which columns it recognises and which values fail.
analytics.validate(input_data)

Column,Status,Description
street,All Valid,All values in this column are good to go.
state,Some Invalid,16.8% of the values of this column failed validation. One example of an invalid value is 'NJ '. Click here for documentation for this column.
city,All Valid,All values in this column are good to go.
postcode,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.
safety_flag,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.


### According to the documentation, "state" must meet the following criterion:

> If US it must be a valid 2 character state code or state name. Empty otherwise

## 4.1 Clean "state"

In [10]:
# List the distinct raw state values to see what fails validation.
input_data["state"].unique()

array(['FL', 'IN', 'VA', 'OH', 'MA', 'MN', 'WI', 'NC', 'CO', 'CA', 'NY',
       'NH', 'CT', 'MD', 'NJ', 'KY', 'WA', 'IA', 'LA', 'GA', 'IL', 'TX',
       'AZ', 'DE', 'MI', 'RI', 'OR', 'AR', 'HI', 'PA', 'SC', 'AL', 'TN',
       'WY', 'NV', 'NE', 'MT', 'MS', 'OK', 'KS', 'MO', 'ID', 'DC', 'UT',
       'NM', 'ND', 'ME', 'VT', 'AK', 'SD', 'NJ  ', 'NY  ', nan, 'CA  ',
       'OR  ', 'GA  ', 'MI  ', 'AZ  ', 'TX  ', 'PA  '], dtype=object)

### 4.1.1 Remove NaN values

In [11]:
# Count the rows whose state is missing.
input_data["state"].isna().sum()

9

In [12]:
# Continue from the de-duplicated frame (df_drop_dup) rather than the raw
# input, so the duplicate rows removed in section 3 do not reappear in the
# cleaned output; then drop the rows that have no state.
# dropna returns a new frame, so df_drop_dup itself is left unchanged.
df_state_cleaned = df_drop_dup.dropna(subset=["state"])

In [13]:
# Preview ten randomly chosen rows after dropping missing states
df_state_cleaned.sample(n=10)

Unnamed: 0,street,state,city,postcode,safety_flag
8096,3733 Southeast 167th Avenue,OR,Southeast Portland,97236,False
2241,2027 Mondello Court,CA,Riverside,92507,False
10922,"315 Madison Avenue, 3rd Floor",NY,New York,10017,False
8294,907 Amber Ln,AR,Manila,72442-8273,False
2459,12428 W Solano Dr,AZ,Litchfield Park,85340,False
3417,21500 Park Row Drive,TX,Katy,77449,False
6727,6016 Brigids Close Drive,OH,Dublin,43017,False
2354,13375 PINE ST,MI,Taylor,48180,False
63,710 Ashton,TX,Cactus,79013,True
10090,225-16 95 Ave,NY,Floral Park,11001,True


In [14]:
# Invariant: no missing state values remain.
assert df_state_cleaned["state"].notna().all()

### 4.1.2 Clean the format

In [15]:
# Trim surrounding whitespace from the state codes (values such as 'NJ  '
# carried trailing spaces).
df_state_cleaned = df_state_cleaned.assign(
    state=df_state_cleaned["state"].str.strip()
)

# Preview ten randomly chosen rows of the result
df_state_cleaned.sample(n=10)

Unnamed: 0,street,state,city,postcode,safety_flag
6281,3036 Hunton Cottage Ln.,VA,Glen Allen,23059,False
4591,503 Moonlight Trail,TX,Stephenville,76401,False
4429,200 E Harrison Ave,IA,Fairfield,52556,False
8511,2111 E Chandler Ave,IN,Evansville,47714-2223,False
9125,1611 Grand Meadows Ct,TX,Katy,77494-2177,False
8152,7905 Fernham Ln,MD,District Heights,20747-4544,False
10691,320 PARK AVE,NJ,PLAINFIELD,7060,False
2198,601 W 88th Ter,MO,Kansas City,64114-2913,False
7200,10834 Carloway Hills Dr,FL,Wimauma,33598-6141,False
6045,3016 Southwest Chastain Avenue,OR,Gresham,97080,False


In [16]:
# Confirm that only bare state codes remain after stripping.
df_state_cleaned["state"].unique()

array(['FL', 'IN', 'VA', 'OH', 'MA', 'MN', 'WI', 'NC', 'CO', 'CA', 'NY',
       'NH', 'CT', 'MD', 'NJ', 'KY', 'WA', 'IA', 'LA', 'GA', 'IL', 'TX',
       'AZ', 'DE', 'MI', 'RI', 'OR', 'AR', 'HI', 'PA', 'SC', 'AL', 'TN',
       'WY', 'NV', 'NE', 'MT', 'MS', 'OK', 'KS', 'MO', 'ID', 'DC', 'UT',
       'NM', 'ND', 'ME', 'VT', 'AK', 'SD'], dtype=object)

### According to the documentation, "postcode" is recognized under the name "post_code", and "post_code" must meet the following criterion:

> If US 5 or 9 digit postcode, dash or no dash separating. other countries need be non empty

## 4.2 Clean postcode

In [17]:
# Start the postcode clean-up from the state-cleaned frame.
# NOTE(review): the next cell repeats this exact assignment before renaming
# the column, so this cell is redundant.
df_post_code = df_state_cleaned.copy()

### 4.2.1 Rename column "postcode"

In [18]:
# Rename "postcode" to "post_code"; rename returns a new frame, so
# df_state_cleaned keeps its original column name.
df_post_code = df_state_cleaned.rename(columns={"postcode": "post_code"})

In [19]:
# Preview ten randomly chosen rows after the rename
df_post_code.sample(n=10)

Unnamed: 0,street,state,city,post_code,safety_flag
5943,3327 Rainshore Dr,TX,Katy,77449-7020,False
2507,18 Lake St,MA,Somerset,2726,False
7549,909 Grand Ave,CA,Ojai,93023,False
4803,115 Country Club Dr,FL,Niceville,32578-2005,False
736,3100 W End Cir,TN,Nashville,37203-1413,False
3608,500 NW 11th St,TX,Andrews,79714-4707,False
3122,406 Washington Ave.,NY,Brentwood,11717,False
4015,2945 Pheasant Dr,GA,Decatur,30034,False
8657,197 Stephenson Rd,AR,Dierks,71833-8834,False
148,1007 W Deer Creek Dr,IN,Brazil,47834-7919,False


### 4.2.2 Detecting Missing Values

In [20]:
# Count the rows whose post_code is missing.
df_post_code["post_code"].isna().sum()

1

### 4.2.3 Remove Missing Values

In [21]:
# Keep only the rows that have a post_code.
# NOTE(review): sampled values such as '8053' (NJ) and '2726' (MA) are only
# four characters long - presumably ZIP codes that lost a leading zero
# upstream; confirm whether they should be zero-padded before enrichment.
df_post_code = df_post_code.loc[df_post_code["post_code"].notna()]

# Preview ten randomly chosen rows of the result
df_post_code.sample(n=10)

Unnamed: 0,street,state,city,post_code,safety_flag
2413,3 MCFARLAND CT,AL,Bridgeport,35740,True
2659,9042 Sturbridge Place,AL,Montgomery,36116,False
2181,1814 El Paso St,TX,Lewisville,75077,False
8686,1920 W 147th St,CA,Gardena,90249,False
227,1707 Towle St,NE,Falls City,68355-1952,False
7191,1200 Springdale Ln,IL,Elk Grove Village,60007-4654,False
5903,1342 Harpst St,MI,Ann Arbor,48104-6134,False
9239,7708 Mary Lou Way,CA,SACRAMENTO,95832,False
107,4343 E Princeton Ave,AZ,Gilbert,85234-7627,False
2601,570 South muriel drive,CA,Barstow,92311,False


In [22]:
# Invariant: no missing post_code values remain.
assert not df_post_code["post_code"].isna().any()

## 4.3 Add column "country" needed for enrichment 

In [23]:
# The dataset contains only US states, so the country column is filled with
# the constant 'US'. assign returns a new frame and leaves df_post_code as is.
df_w_country = df_post_code.assign(country="US")

## 4.4 Validate the cleaned data before enrichment

In [24]:
# Snapshot the prepared frame under its final name and re-run validation to
# check the effect of the cleaning steps above.
df_cleaned = df_w_country.copy()
analytics.validate(df_cleaned)

Column,Status,Description
street,All Valid,All values in this column are good to go.
state,All Valid,All values in this column are good to go.
city,All Valid,All values in this column are good to go.
post_code,All Valid,All values in this column are good to go.
safety_flag,Unrecognized Column Name,This column name is not supported. Click here for a list of all supported column names.
country,All Valid,All values in this column are good to go.


## 5. Save the cleaned data to file

In [25]:
# Write the cleaned frame to CSV; index=False leaves the pandas index out.
df_cleaned.to_csv("../data/cleaned_data.csv",index=False)

In [26]:
# Read the file back so the saved result can be compared with df_cleaned.
df_saved = pd.read_csv("../data/cleaned_data.csv")

In [27]:
# The reloaded file must have as many rows as the frame that was written.
assert len(df_saved) == len(df_cleaned)

In [28]:
# The reloaded file must have as many columns as the frame that was written.
assert len(df_saved.columns) == len(df_cleaned.columns)