In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Load the temperature readings; .squeeze() collapses the (presumably
# single-column) frame returned by read_csv into a Series.
s = pd.read_csv('../data/nyc-temps.txt').squeeze()

# Pair each reading with its hour of day: the 8-value, 3-hourly cycle is
# repeated 91 times, so it has to line up with exactly 728 readings.
df = DataFrame({
    'temp': s,
    'hour': [0,3,6,9,12,15,18,21] * 91,
})

# Blank out every reading taken at hour 3 or hour 6, so that there are
# NaN values to interpolate over.
df.loc[df['hour'].isin([3, 6]), 'temp'] = np.nan

# Beyond 1

By default, the `interpolate` method fills each `NaN` by linear interpolation — in effect, averaging the valid values just before and after the gap. However, we can change how it works by passing `method='nearest'`, which instead copies the closest valid value. Does that change our data substantially?

In [4]:
# Verdict: no, the summary statistics barely move -- perhaps because
# consecutive temperature readings are rarely far apart.
# (pandas hands method='nearest' over to SciPy, so SciPy must be installed.)

(
    df
    .interpolate(method='nearest')
    .describe()
)

Unnamed: 0,temp,hour
count,728.0,728.0
mean,-1.050824,10.5
std,5.026357,6.878589
min,-14.0,0.0
25%,-4.0,5.25
50%,0.0,10.5
75%,2.0,15.75
max,12.0,21.0


# Beyond 2

Let's assume that the equipment works fine around the clock, but that it fails to record any reading of -1 degrees or below. Are the interpolated values similar to the real (missing) values they replace? Why or why not?

In [5]:
# Start over: rebuild the frame from the untouched readings in `s`,
# discarding any NaN values introduced earlier.
df = DataFrame({
    'temp': s,
    'hour': [0,3,6,9,12,15,18,21] * 91,
})

In [7]:
# Set every reading of -1 or below to NaN, mimicking the faulty equipment
df.loc[df['temp'].le(-1), 'temp'] = np.nan

In [8]:
# Fill the gaps by interpolation ('linear' is the pandas default, spelled out here)
df = df.interpolate(method='linear')

In [9]:
# Wow: the mean is now about 2 and the median about 1 -- significantly higher.
# Not surprising, of course: all of the very low temperatures were removed,
# and linear interpolation only produces values between the readings that remain.
# NOTE(review): count is 721 rather than 728 -- presumably the first few readings
# were NaN, and the default forward fill leaves leading NaNs untouched; confirm.
df.loc[:, 'temp'].describe()

count    721.000000
mean       2.022191
std        2.345483
min        0.000000
25%        0.209524
50%        1.000000
75%        3.000000
max       12.000000
Name: temp, dtype: float64

# Beyond 3

A cheap alternative to interpolation is to replace `NaN` values with the column's mean. Do this, and then look at the new mean and median. Again, why are (or aren't) these values similar to the original ones?

In [12]:
# Start over: rebuild the frame from the untouched readings in `s`
df = DataFrame({
    'temp': s,
    'hour': [0,3,6,9,12,15,18,21] * 91,
})

# Set every reading of -1 or below to NaN, mimicking the faulty equipment
df.loc[df['temp'].le(-1), 'temp'] = np.nan

# Replace each NaN with the mean of its column's remaining values
df = df.fillna(value=df.mean())

In [13]:
# Wow, these values are even worse than the interpolated ones! The mean is now
# about 2.76, and both the median and the 75th percentile are exactly that fill
# value -- a large share of the column is the same repeated number.

# Clearly, running .interpolate is a better option than using the mean --
# in no small part because it calculates a local average from the neighboring
# readings, rather than a single global one across all of the data.

df.describe()

Unnamed: 0,temp,hour
count,728.0,728.0
mean,2.763926,10.5
std,1.935689,6.878589
min,0.0,0.0
25%,2.0,5.25
50%,2.763926,10.5
75%,2.763926,15.75
max,12.0,21.0
