In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Smoothing

## Outline
* Quantitative Distributions
    - binning and percentiles
* Categorical Distributions
    - grouping as smoothing 
    - additive smoothing
* Time Series
* Rolling Windows

## Separating signal from noise
* Information Extraction: select what's important
* *Real trends* instead of *coincidence*
<img src="imgs/signal.jpeg" width="50%" />

## Separating signal from noise

* Filter out 'random fluctuations'
* Find the 'right view' of the data
* Make trends clearer through 'simplification'

Noise to someone else may be signal to you!

## Separating signal from noise

Related to:
* Signal-processing and filters (engineering)
* Feature engineering (machine learning)
* Modeling (statistics, social sciences)
    
All of these are intertwined!

## Smoothing

Smoothing extracts trends from data by reducing the variance of nearby observations.

* Origins in engineering (audio/images)
* Good for visualization
* Careful building inferential models on smoothed data!
    - inferential models assess what is/isn't noise

## Distributions

* A **distribution** of a population describes the likelihood that a given value will occur.
* The **empirical distribution** of a dataset describes the frequency that each value was observed.
    - For a large representative sample, these are 'similar'.
* Distributions are defined for both *quantitative* and *categorical* data.

### Quantitative Distributions

* Answers the question: "where is most of the data?"
* A (theoretical) distribution is the likelihood that a given value occurs, at infinite precision.
* Samples are approximations:
    - How finely do you measure proximity?


In [None]:
# create some data: ten normal clusters (means 1, 11, ..., 91; sd 10) whose
# sizes shrink as the mean grows (1000 // mean observations each)
np.random.seed(42)  # fixed seed so Restart & Run All reproduces the same figures
v = np.concatenate([np.random.normal(x,10,1000//x) for x in range(1,100,10)])
np.random.shuffle(v)
sensor_data = pd.Series(v)

In [None]:
title = '%d instances of observed data' % sensor_data.shape[0]
sensor_data.plot(title=title);

In [None]:
sensor_data.sort_values().reset_index(drop=True).plot(title='values of observations, sorted');

## Empirical Distributions

* Empirical distributions are just histograms!
* Proximity is measured by bin width
* Samples are approximations: how to pick bin width?

In [None]:
# "small bin width": every value happens only once!
sensor_data.value_counts().plot(title='frequency of occurence of each value', figsize=(8,8));

In [None]:
# Draw the same data with progressively fewer bins: one panel per bin count,
# filling the 4x2 grid row by row.
bin_counts = [500, 200, 100, 50, 20, 10, 5, 2]
fig, axes = plt.subplots(4, 2, figsize=(12, 8))
# axes.flat walks the grid in row-major order, so no manual counter is needed
for ax, n_bins in zip(axes.flat, bin_counts):
    sensor_data.plot(kind='hist', bins=n_bins, ax=ax, title='bins=%d' % n_bins, density=True)

plt.tight_layout();

## Smoothing: binning data
* Binning quantitative data approximates a probability distribution.
* An appropriate choice of bin size reduces noise.
    - bin sizes may not be uniform
* Decreases precision: assign a value to its bin

In [None]:
# sdpd example with subject_age
stops = pd.read_csv('stops_2016.csv')

def clean_age(x):
    """Convert a raw ``subject_age`` entry to an integer age, or NaN if unusable.

    Parameters
    ----------
    x : object
        A raw cell value: a digit string, an integer, a missing value, or
        free text.

    Returns
    -------
    int or float
        The age as an ``int`` when ``x`` is written purely in decimal digits
        and is at most 100; ``np.nan`` otherwise.
    """
    if pd.isnull(x):
        return np.nan
    # str() lets already-numeric cells (e.g. the int 25) through instead of
    # failing on a missing .isdigit attribute
    text = str(x)
    # isdecimal (unlike isdigit) only accepts characters int() can parse,
    # so entries such as a superscript two become NaN instead of raising
    if text.isdecimal() and int(text) <= 100:
        return int(text)
    # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
    return np.nan

ages = stops['subject_age'].apply(clean_age)

In [None]:
# one bin per year
# spiky every five years! (spikes are noise)
# our association of age <-> stops shouldn't vary too much so quickly

# explicit integer edges 0, 1, ..., 101 give exactly one bin per year of age
# (clean_age keeps ages 0..100); bins=100 would instead derive the edges from
# the observed min/max
ages.plot(kind='hist', bins=np.arange(0, 102), density=True);

In [None]:
# 1 bin = 5 years: fixed edges 0, 5, ..., 100 (ages are cleaned to 0..100),
# independent of the observed min/max
ages.plot(kind='hist', bins=np.arange(0, 101, 5), density=True);

## Outliers and bucketing into percentiles
* Binning based on relative order limits the effect of outliers.
* Label observations by what percentile they're in:
    - reduce the number of bins
    - outliers no longer have their own bins

In [None]:
# Normal histogram; outlier ages included
all_ages = pd.to_numeric(stops.subject_age, errors='coerce')
all_ages.plot(kind='hist', bins=20);

In [None]:
# histogram with percentile bins
observed_ages = all_ages.dropna()
# edges at the 0th, 10th, ..., 100th percentiles: starting at 0 keeps the lowest
# decile, which would otherwise fall below the first edge and be dropped
bins = np.percentile(observed_ages.values, range(0, 101, 10))

plt.hist(observed_ages, bins=bins);

## Categorical distributions

* A **categorical distribution** is the likelihood of any categorical value occurring.
* The empirical distribution is the count/proportion of every category.
* For a pandas Series: `Series.value_counts(normalize=True)`

In [None]:
# the plotted column is subject_sex, so the title must say so (and be passed in)
title = 'categorical distribution of subject_sex'
(
    stops
    .subject_sex
    .value_counts(dropna=False, normalize=True)
    .to_frame()
    .plot(kind='bar', title=title)
);

## Smoothing categorical distributions
* What if there are too many categories?
* Approach #1: map categories to coarser categories (using domain knowledge)

In [None]:
title = 'categorical distribution of subject_race'
(
    stops
    .subject_race
    .value_counts(normalize=True)
    .to_frame()
    # pass the title through; it was previously defined but never shown
    .plot(kind='barh', title=title)
);

In [None]:
race_dict = {'A':'Asian',
             'B':'Black',
             'C':'Asian',
             'D':'Asian',
             'F':'Asian',
             'G':'Asian',
             'H':'Hispanic',
             'I':'American Indian',
             'J':'Asian',
             'K':'Asian',
             'L':'Asian',
             'O':'OTHER',
             'P':'Asian',
             'S':'Asian',
             'U':'Hawaiian',
             'V':'Asian',
             'W':'White',
             'Z':'Asian'
            }

In [None]:
(
    stops
    .subject_race
    # Series.map with a dict leaves codes missing from race_dict as NaN,
    # without relying on the np.NaN alias removed in NumPy 2.0
    .map(race_dict)
    .value_counts(normalize=True, dropna=False)
    .to_frame()
    .plot(kind='barh')
);

## Smoothing categorical distributions

* Problem: no ability to "coarsen" using domain knowledge.
    - Give up on "coarsening values"; replace value with likelihood
* Uncommon values are more subject to error from noise.
    - rare occurrences should be trusted less!


### Categorical Distributions with many distinct values

* Consider a "representative" sample of the English language.
* Some words occur a lot; most words occur only once.
* What happens if our sample is off by one for:
    - the word 'the'?
    - the word 'courteous'?

In [None]:
import re

# read the whole text, closing the file handle as soon as it has been read
with open('pride_and_predjudice.txt') as book:
    text = book.read()
# drop everything except letters, digits and whitespace; the pattern is a raw
# string because backslash-s in a plain literal is an invalid escape sequence
text = re.sub(r'[^0-9a-zA-Z\s\n]+', '', text)

# lower-cased tokens, keeping only words of at most 10 characters
words = pd.Series(text.split()).str.lower().loc[lambda x: x.str.len() <= 10]

In [None]:
# number of words
word_cnts = words.value_counts()
word_cnts.shape[0]

In [None]:
# And these are only 3% of the words!
words.value_counts().iloc[:200].plot(kind='bar');

In [None]:
# empirical distribution of "the"
(word_cnts.loc['the'] / len(words))

In [None]:
# empirical proportion of "the" if its count were off by one
((word_cnts.loc['the'] + 1) / len(words))

In [None]:
# empirical proportion of "courteous"
(word_cnts.loc['courteous'] / len(words))

In [None]:
# empirical proportion of "courteous" if its count were off by one
((word_cnts.loc['courteous'] + 1) / len(words))

## Additive Smoothing
* If we see a value a lot:
    - the likelihood of occurrence ~ similar to what's observed.
* If we've rarely seen a category:
    - Is it really that rare? 

Noise causes more error in the tail of the distribution!

see: [ref](https://en.wikipedia.org/wiki/Additive_smoothing)

## Additive Smoothing
* Given a dataset of observations $x$ of size $N$, with $d$ categories,
* Smooth the empirical probability a value occurs:
$$ p_i = \frac{x_i}{N} \qquad {\rm (empirical)}$$

$$ p_i = \frac{x_i + \alpha}{N + \alpha d} \qquad {\rm (smoothed)}$$

* Where $\alpha$ reflects a guess that each category has an additional count $\alpha$
* Where $1/d$ is the uniform probability, if each category is equally likely.

## Additive Smoothing
$$ p_i = \frac{x_i + \alpha}{N + \alpha d} \qquad {\rm (smoothed)}$$

* If $\alpha$ is zero, then 

$$ p_i = \frac{x_i + \alpha}{N + \alpha d} = \frac{x_i + 0}{N + 0\cdot d} = \frac{x_i}{N} \qquad {\rm (empirical\ prob)}$$

* If $\alpha \gg 0$, then

$$ p_i = \frac{x_i + \alpha}{N + \alpha d} \approx \frac{\alpha}{\alpha\cdot d} = \frac{1}{d} \qquad {\rm (uniform\ prob)}$$


## Additive Smoothing: SDPD race codes
* $N$ -- number of traffic stops
* $d$ -- distinct values of race code
* $\alpha$ -- smoothing parameter
* $x_i$ -- number of stops for a given race code

In [None]:
N = len(stops)
alpha = 100 # pseudo-count added to every category (alpha * d extra stops in total)

cnts = stops.subject_race.value_counts(dropna=False)
emp = stops.subject_race.value_counts(dropna=False, normalize=True)
# cnts keeps missing race codes as a category of their own, so d must count it
# too (nunique() skips NaN); otherwise the smoothed probabilities don't sum to 1
d = len(cnts)

In [None]:
smoothed = ((cnts + alpha)/(N + alpha * d))

In [None]:
pd.concat([emp.rename('empirical distribution'), smoothed.rename('smoothed')], axis=1)

## Additive smoothing: changing $\alpha$
* What happens to the distributions of race codes as $\alpha\to\infty$?

In [None]:
# parameter list: 0, followed by the powers of ten from 1 up to 1,000,000
alphas = np.concatenate(([0], 10 ** np.arange(7)))
alphas

In [None]:
# calculate one smoothed distribution per alpha, each Series named after its
# alpha; the comprehension keeps its loop variable local, so the notebook-level
# `alpha` and `smoothed` are not overwritten as a side effect
smooth_list = [
    ((cnts + a) / (N + a * d)).rename(a)
    for a in alphas
]

In [None]:
# plot smoothed distributions for different values of alpha
pd.concat(smooth_list, axis=1).plot(kind='bar', subplots=True, title=False, figsize=(15,15));

In [None]:
# uniform distribution by number of categories
1 / d

### Bivariate additive smoothing: incidence rate
* examine a categorical attribute $x$ using a second, boolean attribute $b$.

$$ p_i = \frac{b_i}{x_i} \qquad {\rm (empirical)}$$

$$ p_i = \frac{N\cdot(b_i/x_i) + \alpha\cdot(b/N)}{N + \alpha} \qquad {\rm (smoothed)}$$

* Additive smoothing interpolates between:
    1. incidence rate per group, and
    2. overall incidence rate.
* More covered later!

### Bivariate additive smoothing: vehicle stop search rates

* Search rates by race_code are noisy for small groups
* Additive smoothing interpolates between:
    1. search rate per group, and
    2. overall search rate.

In [None]:
def clean_arrested(s):
    """Encode a raw ``arrested`` flag as 0 (no), 1 (yes) or NaN (anything else).

    Parameters
    ----------
    s : object
        A raw cell value such as ``'Y'``/``'N'`` (either case), an
        already-encoded 0/1, or anything else.

    Returns
    -------
    int or float
        ``0`` or ``1`` for a recognised flag, ``np.nan`` otherwise.

    Values that are already 0 or 1 pass through unchanged, so applying the
    function to an already-cleaned column does not wipe it out.
    """
    if s in ('N', 'n', 0):
        return 0
    if s in ('Y', 'y', 1):
        return 1
    # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
    return np.nan
    
stops['arrested'] = stops.arrested.apply(clean_arrested)

In [None]:
# Arrest Rates additive smoothing

N = len(stops)
b = stops.arrested.sum()
alpha = 100

arrests_by_subject_race = stops.groupby('subject_race').arrested.sum()
stops_by_subject_race = stops.subject_race.value_counts()

In [None]:
b/N

In [None]:
# Blend each group's arrest rate with the overall arrest rate b/N.
# NOTE(review): the weight on the group rate is the overall N, not the group's
# own stop count, so every group is pulled toward b/N by the same factor
# alpha / (N + alpha) — confirm this is intended.
smoothed = (N * (arrests_by_subject_race / stops_by_subject_race) + alpha * (b/N))/(N + alpha)
smoothed

## Rolling Windows
* Group data in buckets/windows and compute statistics on the buckets
* windows should overlap, to account for sudden changes
* Very common to analyze sequences of events, like time-series

In [None]:
# our "sensor data" with a lot of noise
sensor_data.plot();

In [None]:
# averaging over windows of 50 observations
sensor_data.rolling(window=50).mean().plot();

### Rolling Windows in Pandas
* What just happened?
    - `rolling` method 'splits' the dataframe into overlapping windows.
    - apply the desired method to each window.

In [None]:
# Create sample data

df = pd.DataFrame( {'numbers': [1,1,1,2,2,2,3,3,3]})
df

In [None]:
# Calculate the moving sum. That is, take
# the first two values, sum them, 
# then drop the first and add the third, etc.
# can use any aggregation function

df.rolling(window = 2).sum()

In [None]:
# let's create outliers

df = pd.DataFrame( {'numbers': [1,1,1,20,2,2,3,30,3]})
df

In [None]:
# smoothing with average with different window sizes:
# one "Window = k" column per size, instead of one copy-pasted line per size

for window_size in (2, 3, 4):
    df["Window = %d" % window_size] = df["numbers"].rolling(window=window_size).mean()


df

# which window size is better?
# Is there an optimal size?

In [None]:
df.plot(figsize=(8,6));

## Time Series

* Sequential data is often noisy
* Time series data are a common example of sequential data

## Datetime objects

* A **date object** is  a set of values for the *year*, the *month*, the *day*, and a collection of functions that knows how to handle them.

* A **time object** is constructed in a similar way. 

![](./imgs/datetime_attributes.png)

## Dates and Times in Pandas

Often we have a problem with inconsistent structure of input data.

How many ways can you come up with to record today's date?

* 01-24-2019
* 24-01-2019
* 01.24.2019
* 24.01.2019
* 01/24/2019
* 24/01/2019
* Jan, 24 2019
* 24 Jan, 2019
*  ....

How about the time?



## Example: Is it raining in Seattle?

```
DATE = the date of the observation
PRCP = the amount of precipitation, in inches
TMAX = the maximum temperature for that day, in degrees Fahrenheit
TMIN = the minimum temperature for that day, in degrees Fahrenheit
RAIN = TRUE if rain was observed on that day, FALSE if it was not
```

In [None]:
weather = pd.read_csv("seattleWeather.csv")
weather.head()

In [None]:
# what type is the date column?
weather.info()

### Parsing dates to `datetime` objects

* `DATE` column contains date as a string object. 
* Option 1: use string manipulations to extract needed information by carefully slicing each string. 
    - if your data is not consistent then this approach fails.
* Option 2: convert `object` type to a special `datetime` format.  

In [None]:
weather['DATE'] = pd.to_datetime(weather['DATE'])
weather.info()


In [None]:
# Notice I did not have to specify what number is a month and what number is a day.

dates = ['2019-01-22', 'Jan 22, 2019', '01/22/2019', '2019.01.22', '2019/01/22','20190122']
# Parse each string on its own: pandas 2.x infers one format from the first
# element of a list, so parsing this mixed-format list in a single call raises
# ValueError there.
parsed_dates = pd.DatetimeIndex([pd.to_datetime(d) for d in dates])
parsed_dates


### Specifying formats for parsing dates/times
* `to_datetime` tries guessing the format
* Sometimes, formats are ambiguous:
    * **US**:  MM/DD/YEAR
    * **EUROPE**: DD/MM/YEAR
    * **JAPAN**, CHINA: YEAR/MM/DD
* use the `format` keyword and the [format reference](http://strftime.org/)


In [None]:
# do not expect it to always work

dates = ['09/01/2019']   # what date is it?
pd.to_datetime(dates)

In [None]:
dates = ['09/01/2019']   # what date is it?
pd.to_datetime(dates, format='%d/%m/%Y')

### Datetime objects and the `dt` namespace
* many built-in methods to use
* see a full list of methods [here](https://pandas.pydata.org/pandas-docs/stable/api.html#datetimelike-properties )

In [None]:
months = weather["DATE"].dt.month
months.head()

In [None]:
months.plot(kind='hist', bins=12);

In [None]:
# We can use time stamps to compare dates/times

time_stamp = pd.to_datetime('10/10/1980')
weather.loc[weather['DATE'] <= time_stamp].head()


In [None]:
# Math operations are also possible on dates:
# What day had the most amount of rain?

weather.loc[weather['PRCP'] == weather['PRCP'].max()]


## Time Deltas

* Differences between `datetimes` are called `timedelta` objects. 

In [None]:
# we can subtract dates as well, creating a timedelta object. 

weather.DATE.max() - weather.DATE.min()


### Weather Data Exploration
* Daily precipitation is noisy!
* How can we spot trends in this data?

In [None]:
# Let's plot the weather data

# NOTE(review): these two magics change the matplotlib backend for the rest of
# the notebook, overriding the `%matplotlib inline` set in the first cell —
# confirm the intended backend is available where this notebook is run.
%matplotlib nbagg
%matplotlib

weather.plot(y='PRCP', x="DATE");


In [None]:
# look at last two years

cut_off = pd.to_datetime('1/1/2016')

weather_recent = (
    weather
    .loc[weather['DATE'] >= cut_off]
    .set_index('DATE')
)

weather_recent.head(5)

In [None]:
weather_recent.plot(y='PRCP');


In [None]:
# let's use rolling window approach to smooth the data

# average only the numeric/boolean columns: a column read as object (text)
# cannot be averaged, and pandas 2.x raises on it instead of skipping it
smoothed = (
    weather_recent
    .select_dtypes(include=['number', 'bool'])
    .rolling(window='30D')
    .mean()
)
smoothed.head(5)

In [None]:
smoothed.plot(y='PRCP');

In [None]:
# try different smoothing windows
# 
windows =  ['1D', '7D', '30D', '60D', '90D', '180D']
smoothed_list = [weather_recent.rolling(window=win).PRCP.mean().rename(win) for win in windows]

pd.concat(smoothed_list, axis=1).plot(subplots=True);

In [None]:
weather.set_index('DATE').PRCP.plot()

In [None]:
# How is precipitation changing year-after-year?

weather.set_index('DATE').rolling(window='30D').PRCP.mean().plot()

In [None]:
# 1-year rolling windows take away the seasonality: good years and bad years
weather.set_index('DATE').rolling(window='365D').PRCP.mean().loc['1950':].plot();

In [None]:
# can we see longer droughts?
weather.set_index('DATE').rolling(window='3650D').PRCP.mean().loc['1955':].plot();

In [None]:
# what does it look like taking 10 year increments?
weather.set_index('DATE').loc['1950':'2020'].groupby(pd.Grouper(freq='3650D')).PRCP.mean().plot();

In [None]:
# difference between rolling windows and aggregation
#
# aggregation
weather.set_index('DATE').loc['1950':'2020'].groupby(pd.Grouper(freq='365D')).PRCP.mean().plot();

In [None]:
# rolling window
#
weather.set_index('DATE').rolling(window='365D').PRCP.mean().loc['1950':].plot();