# Missing Data and Merging Data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### Handling Missing Data

In [2]:
# One text column that mixes the two "missing" markers: np.nan (row 2)
# and None (row 4).
col_1_values = ['a', 'b', np.nan, 'c', None]
data = pd.DataFrame({'col_1': col_1_values})

In [3]:
# Show the frame: rows 2 and 4 hold the missing entries (np.nan and None).
data

Unnamed: 0,col_1
0,a
1,b
2,
3,c
4,


In [4]:
# isnull() returns a boolean mask; np.nan and None are both flagged as
# missing (True at rows 2 and 4).
data.isnull()

Unnamed: 0,col_1
0,False
1,False
2,True
3,False
4,True


In [5]:
# Per-column count of missing values: summing the boolean mask counts its
# True entries.
data.isnull().sum()

col_1    2
dtype: int64

In [6]:
# Drop every row that contains a missing value (rows 2 and 4 here).
data.dropna()

Unnamed: 0,col_1
0,a
1,b
3,c


##### Controlling the drop

In [61]:
# Four numeric columns with different missing-value patterns:
# row 0 is complete, row 2 is entirely missing, rows 1 and 3 are partial.
missing_patterns = {
    'a': [1, 2, np.nan, None],
    'b': [6, np.nan, None, 3.4],
    'c': [4, np.nan, np.nan, np.nan],
    'd': [3, np.nan, np.nan, 8],
}
data = pd.DataFrame(missing_patterns)

In [62]:
# Show the frame: only row 0 is complete, and row 2 is missing everywhere.
data

Unnamed: 0,a,b,c,d
0,1.0,6.0,4.0,3.0
1,2.0,,,
2,,,,
3,,3.4,,8.0


In [63]:
# Default behaviour: a row is dropped if ANY of its values is missing,
# so only the fully populated row 0 survives.
data.dropna()

Unnamed: 0,a,b,c,d
0,1.0,6.0,4.0,3.0


In [64]:
# how='all': drop a row only when EVERY value in it is missing
# (just row 2 here).
data.dropna(how = 'all')

Unnamed: 0,a,b,c,d
0,1.0,6.0,4.0,3.0
1,2.0,,,
3,,3.4,,8.0


In [65]:
# Overwrite column d in place so that it is entirely NaN; this sets up the
# column-wise dropna examples that follow.
data['d'] = np.nan

In [66]:
# Column d is now missing in every row.
data

Unnamed: 0,a,b,c,d
0,1.0,6.0,4.0,
1,2.0,,,
2,,,,
3,,3.4,,


In [67]:
# axis=1 drops columns instead of rows. Every column contains at least one
# missing value, so no columns are left (only the index remains).
data.dropna(axis = 1)

0
1
2
3


In [68]:
# Drop only the columns in which every value is missing (just d here).
data.dropna(axis = 1, how = 'all')

Unnamed: 0,a,b,c
0,1.0,6.0,4.0
1,2.0,,
2,,,
3,,3.4,


##### Using a Threshold

In [69]:
# 7x3 frame of standard-normal draws. A dedicated, seeded generator keeps the
# values -- and every output derived from them -- identical on each
# Restart & Run All.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(7, 3))

In [70]:
# .loc slices by label and includes the end label: rows 0-4 of column 0
# become NaN.
df.loc[:4, 0] = np.nan
# .iloc slices by position and excludes the end: only rows 0-1 of the third
# column (label 2) become NaN.
df.iloc[:2, 2] = np.nan

In [71]:
# Add a fourth column (label 3) that is missing only in row 0.
df[3] = [np.nan, 4, 4, 4, 4, 4, 4]

In [74]:
# Inspect the frame with its missing values in place: row 0 has a single
# non-missing value, rows 5 and 6 are complete.
df

Unnamed: 0,0,1,2,3
0,,0.61086,,
1,,1.704643,,4.0
2,,-1.713519,-0.448814,4.0
3,,-0.283682,-0.31886,4.0
4,,-0.900964,-1.913339,4.0
5,-0.339258,1.616071,0.396019,4.0
6,0.272042,0.371895,-0.297827,4.0


In [72]:
# Default dropna: rows 5 and 6 are the only ones with no missing value.
df.dropna()

Unnamed: 0,0,1,2,3
5,-0.339258,1.616071,0.396019,4.0
6,0.272042,0.371895,-0.297827,4.0


In [73]:
# thresh=2 keeps a row only if it has AT LEAST 2 non-missing values (it is
# not a count of missing values). Row 0 has just one, so it is dropped.
df.dropna(thresh = 2)

Unnamed: 0,0,1,2,3
1,,1.704643,,4.0
2,,-1.713519,-0.448814,4.0
3,,-0.283682,-0.31886,4.0
4,,-0.900964,-1.913339,4.0
5,-0.339258,1.616071,0.396019,4.0
6,0.272042,0.371895,-0.297827,4.0


In [60]:
# Replace every missing value with 1.
# NOTE(review): the saved output below shows 5.0 in columns 0 and 2 and
# different random values; it appears to come from an earlier kernel state
# (execution count 60). Re-run the notebook top to bottom to refresh it.
df.fillna(1)

Unnamed: 0,0,1,2,3
0,5.0,-0.259634,5.0,1.0
1,5.0,0.157162,5.0,4.0
2,5.0,-0.869687,-0.173139,4.0
3,5.0,-0.441937,0.084785,4.0
4,5.0,0.538065,-0.578755,4.0
5,-0.032205,0.053763,1.410522,4.0
6,-1.420466,-0.632809,-1.074498,4.0


In [21]:
# A dict maps column label -> fill value: column 0 is filled with 0.2 and
# column 1 with 4; columns not listed keep their missing values.
# NOTE(review): the saved output below has no column 3, so it predates the
# df[3] assignment above; re-run to refresh.
df.fillna({0: 0.2, 1: 4})

Unnamed: 0,0,1,2
0,0.2,-0.259634,
1,0.2,0.157162,
2,0.2,-0.869687,-0.173139
3,0.2,-0.441937,0.084785
4,0.2,0.538065,-0.578755
5,-0.032205,0.053763,1.410522
6,-1.420466,-0.632809,-1.074498


In [22]:
# The fillna calls above returned new frames; df itself still contains its
# missing values.
df

Unnamed: 0,0,1,2
0,,-0.259634,
1,,0.157162,
2,,-0.869687,-0.173139
3,,-0.441937,0.084785
4,,0.538065,-0.578755
5,-0.032205,0.053763,1.410522
6,-1.420466,-0.632809,-1.074498


In [23]:
# inplace=True modifies df itself instead of returning a filled copy, so the
# cell displays nothing. Rebinding with `df = df.fillna(5)` would leave df in
# the same state.
df.fillna(5, inplace = True)

In [24]:
# df now holds 5.0 wherever a value was missing.
df

Unnamed: 0,0,1,2
0,5.0,-0.259634,5.0
1,5.0,0.157162,5.0
2,5.0,-0.869687,-0.173139
3,5.0,-0.441937,0.084785
4,5.0,0.538065,-0.578755
5,-0.032205,0.053763,1.410522
6,-1.420466,-0.632809,-1.074498


### PROBLEMS

1. What is the average completion time for these projects?
2. What is the average completion time by borough?
3. Drop missing data from `latitude` and `longitude` columns, save data to `house_map_data`.
4. Fill in the missing values in the `project_completion_date` column using the given borough's average completion time, computed from the non-null values.

In [25]:
# Load the exercise dataset from the data.cityofnewyork.us JSON endpoint
# (requires network access; the saved run returned 1000 rows).
houses = pd.read_json('https://data.cityofnewyork.us/resource/hg8x-zxpr.json')

In [26]:
# Summarise columns, dtypes and non-null counts. In the saved run,
# project_completion_date is populated in only 431 of the 1000 rows.
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 42 columns):
project_id                       1000 non-null int64
project_name                     1000 non-null object
program_group                    1000 non-null object
project_start_date               1000 non-null object
project_completion_date          431 non-null object
building_id                      857 non-null float64
house_number                     1000 non-null object
street_name                      1000 non-null object
borough                          1000 non-null object
postcode                         850 non-null float64
bbl                              833 non-null float64
bin                              713 non-null float64
community_board                  1000 non-null object
council_district                 998 non-null float64
census_tract                     850 non-null float64
neighborhood_tabulation_area     850 non-null object
latitude                         

##### Other Important methods

- `.duplicated()`
- `.drop_duplicates()`
- `.replace()`

In [27]:
# `data` is still the frame from the "Controlling the drop" section, with
# column d set to NaN in every row.
data

Unnamed: 0,a,b,c,d
0,1.0,6.0,4.0,
1,2.0,,,
2,,,,
3,,3.4,,


In [28]:
# duplicated() marks rows that repeat an earlier row; no two rows are
# identical yet, so every entry is False.
data.duplicated()

0    False
1    False
2    False
3    False
dtype: bool

In [29]:
# Make column a constant (in place) so that rows 1 and 2, which are missing
# in every other column, become identical.
data['a'] = 4

In [30]:
# Rows 1 and 2 now look the same: a == 4 and everything else missing.
data

Unnamed: 0,a,b,c,d
0,4,6.0,4.0,
1,4,,,
2,4,,,
3,4,3.4,,


In [31]:
# Row 2 is now flagged because it repeats row 1; missing values in the same
# positions are treated as equal for this comparison.
data.duplicated()

0    False
1    False
2     True
3    False
dtype: bool

In [32]:
# Keep the first occurrence of each distinct row: row 2 (a repeat of row 1)
# is removed.
data.drop_duplicates()

Unnamed: 0,a,b,c,d
0,4,6.0,4.0,
1,4,,,
3,4,3.4,,


In [33]:
# Replace a single value wherever it occurs: 6 -> NaN. The result is a new
# frame; `data` itself still contains the 6.
data.replace(6, np.nan)

Unnamed: 0,a,b,c,d
0,4,,4.0,
1,4,,,
2,4,,,
3,4,3.4,,


In [34]:
# A dict replaces several values at once, matching by value in any column:
# 4 -> 'hi' and 6 -> 0.
data.replace({4: 'hi', 6: 0})

Unnamed: 0,a,b,c,d
0,hi,0.0,hi,
1,hi,,,
2,hi,,,
3,hi,3.4,,


### Combining data with `merge` and `concat`

In [43]:
# Left table (df1): keys repeat, and 'c' has no partner in df2.
# Right table (df2): one row per key, and 'd' has no partner in df1.
# A seeded generator keeps d1/d2 -- and the merge outputs below --
# reproducible across kernel restarts.
rng = np.random.RandomState(1)

df1 = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b', 'c'],
                   'd1': rng.randint(0, 10, 7)})

df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                   'd2': rng.randint(0, 10, 3)})

In [44]:
# Left table: several rows per key.
df1

Unnamed: 0,key,d1
0,a,0
1,a,1
2,a,2
3,b,8
4,b,0
5,b,4
6,c,5


In [45]:
# Right table: one row per key.
df2

Unnamed: 0,key,d2
0,a,2
1,b,1
2,d,7


In [46]:
# With no `on` argument, merge joins on the column the two frames share
# ('key') and keeps only keys present in both: 'c' and 'd' are dropped.
pd.merge(df1, df2)

Unnamed: 0,key,d1,d2
0,a,0,2
1,a,1,2
2,a,2,2
3,b,8,1
4,b,0,1
5,b,4,1


In [47]:
# Same join with the key column named explicitly; the result is identical.
pd.merge(df1, df2, on = 'key')

Unnamed: 0,key,d1,d2
0,a,0,2
1,a,1,2
2,a,2,2
3,b,8,1
4,b,0,1
5,b,4,1


In [48]:
# Left table whose key column is called 'lkey'. The seeded generator keeps
# d1 reproducible across kernel restarts.
rng = np.random.RandomState(2)
df3 = pd.DataFrame({'lkey': ['b', 'a', 'b', 'c', 'c', 'a', 'b'],
                   'd1': rng.randint(6, 19, 7)})

In [49]:
# Right table whose key column is called 'rkey'. The seeded generator keeps
# d2 reproducible across kernel restarts.
rng = np.random.RandomState(3)
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                   'd2': rng.randint(3, 5, 3)})

In [51]:
# The key column has a different name on each side: match df3's lkey
# against df4's rkey. Both key columns are kept in the result.
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,lkey,d1,rkey,d2
0,b,18,b,4
1,b,6,b,4
2,b,13,b,4
3,a,9,a,3
4,a,13,a,3


<img src = 'merge_a.png' />

In [52]:
# Baseline for the `how` comparison below: the default join, which gives the
# same result as how='inner'.
pd.merge(df1, df2)

Unnamed: 0,key,d1,d2
0,a,0,2
1,a,1,2
2,a,2,2
3,b,8,1
4,b,0,1
5,b,4,1


In [53]:
# Left join: keep every row of df1. Key 'c' has no match in df2, so its d2
# is missing.
pd.merge(df1, df2, how = 'left')

Unnamed: 0,key,d1,d2
0,a,0,2.0
1,a,1,2.0
2,a,2,2.0
3,b,8,1.0
4,b,0,1.0
5,b,4,1.0
6,c,5,


In [54]:
# Right join: keep every row of df2. Key 'd' has no match in df1, so its d1
# is missing.
pd.merge(df1, df2, how = 'right')

Unnamed: 0,key,d1,d2
0,a,0.0,2
1,a,1.0,2
2,a,2.0,2
3,b,8.0,1
4,b,0.0,1
5,b,4.0,1
6,d,,7


In [55]:
# Inner join, stated explicitly: only keys found in both frames ('a', 'b').
pd.merge(df1, df2, how = 'inner')

Unnamed: 0,key,d1,d2
0,a,0,2
1,a,1,2
2,a,2,2
3,b,8,1
4,b,0,1
5,b,4,1


In [56]:
# Outer join: keep keys from either frame. 'c' (df1 only) and 'd' (df2 only)
# both appear, with the other side's value missing.
pd.merge(df1, df2, how = 'outer')

Unnamed: 0,key,d1,d2
0,a,0.0,2.0
1,a,1.0,2.0
2,a,2.0,2.0
3,b,8.0,1.0
4,b,0.0,1.0
5,b,4.0,1.0
6,c,5.0,
7,d,,7.0


##### Summary

- `inner`  : Use only the key combinations observed in both tables
- `left`  : Use all key combinations found in the left table
- `right`  : Use all key combinations found in the right table
- `outer`  : Use all key combinations observed in both tables together

##### PROBLEMS

1. Read in the `species.csv` and `surveys.csv` data files located in the `data` directory.
2. Create a `DataFrame` called `v_stack` that vertically stacks the datasets.
3. Create a `DataFrame` called `h_stack` that horizontally stacks the datasets.
4. Read in the `speciesSubset.csv` file, and join this with the survey data using the `species_id` as the left and right key.
5. Repeat the join from problem 4 (the `speciesSubset.csv` data joined with the survey data on `species_id` as the left and right key), this time using a `left` join.
