In [None]:
# %pylab pulls the numpy and matplotlib.pyplot names (rand, normal, cov, plot,
# xcorr, ...) into the notebook namespace; every later cell uses those bare names.
%pylab inline
rcParams['figure.figsize'] = (10, 4) #wide graphs by default
from __future__ import print_function
from __future__ import division
from IPython.display import display, HTML, Audio

# Variance and correlation

![http://imgs.xkcd.com/comics/random_number.png](http://imgs.xkcd.com/comics/random_number.png)

![http://imgs.xkcd.com/comics/sports.png](http://imgs.xkcd.com/comics/sports.png)

![http://imgs.xkcd.com/comics/cell_phones.png](http://imgs.xkcd.com/comics/cell_phones.png)

[http://xkcd.com/882/](http://xkcd.com/882/)

## Variance and covariance

$$\operatorname{Var}(X) =\sigma^2 =\int (x-\mu)^2  f(x)  dx =\int x^2  f(x)  dx - \mu^2$$

$$\operatorname{Var}(X) = \sum_{i=1}^n p_i\cdot(x_i - \mu)^2 = \sum_{i=1}^n (p_i\cdot x_i^2) - \mu^2$$

$p_i$ is the probability of $x_i$

http://en.wikipedia.org/wiki/Variance

For equally likely values:

$$\operatorname{Var}(X) = \frac{1}{n} \sum_{i=1}^n (x_i - \mu)^2$$


...or the sum of the squares of the differences from the mean ($\mu$) all divided by the number of elements ($n$).

(Also see [Mean squared error](https://en.wikipedia.org/wiki/Mean_squared_error) and [Least squares](https://en.wikipedia.org/wiki/Least_squares))

<!-- $p_i$ is $f(x)$ so $x$ turns into $i$ -->

#### Variance and Standard Deviation 

In [None]:
# 10,000 zero-mean Gaussian draws with standard deviation 2;
# the displayed pair is (standard deviation, variance) of the sample.
sample = normal(0.0, 2.0, 10000)
(sample.std(), sample.var())

In [None]:
# Empirical check that variance = scale**2: for each standard deviation s the
# sample variance of 10,000 normal draws should come out close to s * s.
# A plain loop is used because print() is called only for its side effect,
# so there is no throwaway list of None values to suppress.
for s in range(1, 10):
    print(s, normal(scale=s, size=10000).var())

In [None]:
# The squared standard deviation (2nd value) should match the variance (3rd value)
# up to floating-point rounding.
sample.std(), sample.std()**2, sample.var()

#### Covariance

> a measure of the joint variability of two random variables; it shows whether two sets of samples behave similarly

In [None]:
# Two independent uniform [0, 1) samples of 100 values each (a is drawn first),
# followed by their 2x2 covariance matrix.
a, b = rand(100), rand(100)
cov(a, b)

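You get the 2×2 covariance matrix `[[cov(a,a), cov(a,b)], [cov(b,a), cov(b,b)]]`: the diagonal holds the variances of `a` and `b`, and the off-diagonal entries hold their covariance.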

In [None]:
# Variance of a computed by hand: mean of the squared deviations (divides by n).
# cov() divides by n - 1 by default, so cov(a, b)[0, 0] is slightly larger than this.
((a - a.mean())**2).mean()

In [None]:
# Covariance of a and b computed by hand: mean of the products of the deviations
# (divides by n). Compare with the off-diagonal entry cov(a, b)[0, 1], which
# divides by n - 1 by default.
((a - a.mean()) * (b - b.mean())).mean()

In [None]:
# Same experiment with the values scaled to [0, 100): covariance grows with the
# square of the scale.
a = rand(100) * 100
b = rand(100) * 100
cov(a, b)
# the thing to look for is whether var(x) (entry [0, 0]) is similar to
# cov(x, y)[0, 1]; a and b are drawn independently here, so the off-diagonal
# entry should be much smaller than the diagonal one

In [None]:
# Same again with 100,000 samples per array instead of 100.
a = rand(100000) * 100
b = rand(100000) * 100
cov(a, b)

In [None]:
# The diagonal of the covariance matrix holds the variances:
# var(a) ==> cov(a, b)[0, 0]
# var(b) ==> cov(a, b)[1, 1]
# (var() divides by n, cov() by n - 1, so the values agree only approximately)
var(a), var(b)

In [None]:
# Variance, the matching diagonal entry of the covariance matrix, and their
# absolute difference. The difference is not zero because var() divides by n
# while cov() divides by n - 1 by default.
var(a), cov(a, b)[0,0], abs(var(a) - cov(a, b)[0,0])

#### Comparing noisy signals

In [None]:
# Two independent noise arrays plus one shared component d:
# a sine completing 7 cycles over the 100 samples.
a = rand(100)
b = rand(100)
d = sin(linspace(0, 2 * pi * 7, 100))
plot(a + d)
plot(b + d)
pass

In [None]:
cov(a + d, b + d) # the d in both should lead to some correlation (relationship)
# top row is getting similar: the off-diagonal entry [0, 1] approaches the variance [0, 0]

In [None]:

# top row is getting very similar
# d is scaled by 100 here, so the shared sine dominates the unit-range noise.
plot(a + d * 100)
plot(b + d * 100)
pass

In [None]:
# Covariance matrix of the two signals once the shared sine is scaled by 100.
cov(a + d * 100, b + d * 100)

#### [Pearson r](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html) (related to the covariance)
> a measure of the linear correlation between two variables X and Y. It has a value between +1 and −1, where 1 is total positive linear correlation, 0 is no linear correlation, and −1 is total negative linear correlation

In [None]:
from scipy.stats import pearsonr
# pearsonr returns the correlation coefficient r together with a p-value.
pearsonr(a + d, b + d)

In [None]:
# Fresh noise arrays; this time the shared component is a third noise array c
# added to both signals with weight 2.
a = rand(100)
b = rand(100)
c = rand(100)
cov(a + (2 * c), b + (2 * c)) # the c in both should lead to some correlation (relationship)

In [None]:
# Pearson r for the same pair of signals as the covariance matrix above.
pearsonr(a + (2 * c), b + (2 * c))

In [None]:
# First signal shifted up by 2; second signal sign-flipped and doubled.
plot(a + d + 2)
plot(-2 * (b + d))
pass

In [None]:
cov(a + d + 2, -2 * (b + d))
# not so correlated?
# NOTE(review): covariance depends on the scale of its inputs (the second one is
# multiplied by -2), so compare with the Pearson r in the next cell.

In [None]:
# Pearson r of the shifted signal against the flipped, doubled one.
pearsonr(a + d + 2, -2 * (b + d))

In [None]:
# Shared sine scaled by 3; the second signal is negated.
plot(a + (d * 3))
plot(-(b + (d * 3)))
pass

In [None]:
cov(a + (d * 3), -(b + (d * 3)))
# negatively correlated: the off-diagonal entries come out negative

In [None]:
# Pearson r for the negated pair.
pearsonr(a + (d * 3), -(b + (d * 3)))

- - - - - - -
**Sample ordering and time do not matter for anything above, but now we're moving into time-domain analysis.**


## Correlation and autocorrelation

$$(f \star g)(\tau)\ \stackrel{\mathrm{def}}{=} \int_{-\infty}^{\infty} f^*(t)\ g(t+\tau)\,dt$$

$$ (f \star g)(n) = \frac{1}{N} \sum_{i=1}^N f(i) \cdot g(i + n)$$


### [cross correlation](https://en.wikipedia.org/wiki/Cross-correlation)

> In signal processing, cross-correlation is a measure of similarity of two series as a function of the displacement of one relative to the other. This is also known as a sliding dot product or sliding inner-product. It is commonly used for searching a long signal for a shorter, known feature.

Say you have a recording of a trumpet player and you have a clip of a punchy trumpet note. You might use cross-correlation between the recording and the clip to get back a signal whose amplitude is high where the punchy notes might be and low elsewhere.


### [autocorrelation](https://en.wikipedia.org/wiki/Autocorrelation)

> In an autocorrelation, which is the cross-correlation of a signal with itself, there will always be a peak at a lag of zero, and its size will be the signal energy.

This will show where a signal is self-similar.

![](https://upload.wikimedia.org/wikipedia/commons/2/21/Comparison_convolution_correlation.svg)

In [None]:
# Two sines with 100 cycles over 44100 samples; sig2 starts 0.3 rad ahead of sig1.
sig1 = sin(linspace(0, 100 * 2 * pi, 44100))
sig2 = sin(linspace(0.3, 0.3 + (100 * 2 * pi), 44100))
plot(sig1, label='sig1')
plot(sig2, label='sig2')
legend()
# Show only the first 1000 samples so individual cycles are visible.
xlim((0, 1000))
#xlim((210, 230))
#grid()
# these two signals are offset by how much? describe this in terms of "lag"

In [None]:
# Index of the largest value of the full cross-correlation. This is an array
# index, not a lag: in 'full' mode zero lag sits at index len(sig2) - 1.
argmax(correlate(sig1, sig2, mode='full'))

In [None]:
# Cross-correlation limited to lags -500..500, drawn as a line (usevlines=False).
# Note: this rebinds the names c and b that held random arrays earlier.
lags, c, line, b = xcorr(sig1, sig2, maxlags=500, usevlines=False)
xlabel('lag')
ylabel('correlation')
grid()

In [None]:
# Same cross-correlation restricted to lags -50..50, drawn in xkcd sketch style.
# lags and c are rebound and used by the following cells.
with xkcd():
    lags, c, line, b = xcorr(sig1, sig2, maxlags=50, usevlines=False)
    xlabel('lag')
    ylabel('correlation')

In [None]:
# Correlation values against their lag values.
plot(lags,c)

In [None]:
# xcorr returns 2 * maxlags + 1 values, so 101 is expected after the maxlags=50 call.
len(c)

In [None]:
# Plotted without lags, so the x axis is the array index rather than the lag.
plot(c) # c is for (c)orrelation
grid()
pass

In [None]:
# This is a position inside c, not yet a lag value.
argmax(c) # find the index of the peak

In [None]:
# Translate the peak position into a lag in samples via the lags array.
cc_index = lags[argmax(c)] # what's the value of lag at that peak?
print(cc_index, "samples lag")

In [None]:
# Correlation curve with a red vertical marker and label at the peak lag.
plot(lags, c)
vlines(cc_index, 0.5, 1.09, color='r', lw=2)
text(cc_index + 1, 1.05, 'CC peak', color='r')
grid()

In [None]:
# Sample rate and signal frequency give the period length in samples.
sr, f = 44100, 100.0
Ps = sr / f  # samples per period of the signal
Ps

In [None]:
# Peak lag in samples, as found above.
cc_index

In [None]:
# Peak lag expressed as a fraction of one period.
cc_index / Ps

In [None]:
# Fraction of a period times 2*pi gives the phase in radians; sig2 was built
# with a 0.3 rad offset.
2 * pi * cc_index / Ps # we get back our initial phase difference

In [None]:
# Phase corresponding to the next integer lag; presumably shown to bracket the
# true phase, since the lag can only be measured in whole samples.
2 * pi * (cc_index + 1) / Ps

Later, we'll use cross-correlation (CC) to determine pitch and periodicity.

Two different noise samples produce low values of cross-correlation:

In [None]:
# Two independent uniform noise signals centred on zero, values in [-0.5, 0.5),
# one second long at 44100 samples.
noise1 = rand(44100) - 0.5
noise2 = rand(44100) - 0.5

In [None]:
# Cross-correlation of the two unrelated noise signals for lags -500..500.
lags, c, line, b = xcorr(noise1, noise2, maxlags=500, usevlines=False)
xlabel('lag')
ylabel('correlation')
grid()

And no clear patterns in the CC function:

In [None]:
# Replace the uniform noise with two independent standard-normal noise signals.
noise1 = normal(size=44100)
noise2 = normal(size=44100)

In [None]:
# Same cross-correlation for the Gaussian noise; the trailing ';' hides the cell output.
lags, c, line, b = xcorr(noise1, noise2, maxlags=500, usevlines=False);
grid()

Cross-correlation can find harder-to-see relationships:

In [None]:
# Uniform noise rescaled from [0, 1) to [-1, 1).
noiseA = (rand(44100) * 2) - 1
plot(noiseA)
pass

In [None]:
# Sine with 200 cycles over 44100 samples at amplitude 0.3, well below the noise range.
signalA = sin(linspace(0, 2 * pi * 200, 44100)) * 0.3
plot(noiseA + signalA) # noise, but there's a signal in there! can you see it?
pass

In [None]:
# Second noisy signal: independent noise plus the same sine, started 0.7 rad later in phase.
noiseB = (rand(44100) * 2) - 1
signalB = sin(linspace(0.7, 0.7 + (2 * pi * 200), 44100)) * 0.3
plot(noiseB + signalB)
pass

In [None]:
# Zoom on the first 1000 samples of the second noisy signal.
plot(noiseB + signalB)
xlim(0, 1000) # yeah, if you know to look closely

In [None]:
# Cross-correlate the two noisy signals for lags -500..500 (vertical-line style,
# since usevlines is left at its default).
lags, c, line, b = xcorr(noiseA + signalA, noiseB + signalB, maxlags=500)
grid()
pass
# here signalA and signalB have the same frequency, but different phases
# the CC is still small, but there's a clearly visible pattern

In [None]:
# With maxlags=500, index 500 is lag 0, so c[500:600] covers lags 0..99.
# The result is the peak position relative to the start of that slice.
argmax(c[500:600])

In [None]:
# Add the slice offset back so that m indexes the full c / lags arrays.
m = argmax(c[500:600]) + 500

In [None]:
# Lag (in samples) of the first positive-lag correlation peak.
lags[m]

In [None]:
# Same period computation for the 200-cycle signal.
sr, f = 44100, 200.0
Ps = sr / f  # samples per period
Ps

In [None]:
# Peak lag as a fraction of one period.
lags[m] / Ps

In [None]:
# Convert to radians; signalB was built with a 0.7 rad offset.
(lags[m] / Ps) * 2 * pi

The result is approximate: the lag can only be measured in whole samples, and the noise shifts the position of the peak.

# Autocorrelation

In [None]:
# Autocorrelation of the first noisy signal for lags -500..500; lags and c are
# reused by the next cell.
lags, c, line, b = acorr(noiseA + signalA, maxlags=500)
#xlim((-10, 60))
pass

Note: the plot above is drawn as a stem plot (one vertical line per lag), because `acorr` uses `usevlines=True` by default.

In [None]:
# Plot the positive-lag half of the autocorrelation against the actual lag values.
# c[501:] holds lags 1..500 (index 500 is lag 0), so plotting it against its own
# array index would shift the curve one sample to the left of the marker below.
plot(lags[501:], c[501:])
# 220.5 samples = 44100 / 200: one period of the 200-cycle sine hidden in the noise.
vlines(220.5, -0.15, 0.15)
pass

### Find Inter-aural time delay (or stereo relationships) using running cross-correlation

In [None]:
from scipy.io import wavfile
from IPython.display import Audio

In [None]:
# wavfile.read returns the sample rate and the sample data as an array.
sampleRate, signal = wavfile.read('media/cars.wav')

In [None]:
# Array shape; later cells index signal[:, 0] and signal[:, 1], i.e. they expect
# (samples, channels) with at least two channels.
shape(signal)

In [None]:
# Plays every second value of the flattened array.
# NOTE(review): that is the first channel only if the file is 2-channel — confirm.
Audio(data=signal.flat[0::2], rate=sampleRate)

In [None]:
# Sample data type as stored in the file.
signal.dtype

In [None]:
# Same shape check as above (duplicate cell).
shape(signal)

In [None]:
# Python type of the loaded sample data.
type(signal)

In [None]:
# Largest lag to keep, in samples: 1 ms worth of samples, rounded up.
cc_lags = int(ceil(0.001 * sampleRate)) # 1 ms max lag
print(cc_lags)

In [None]:
# Duration in seconds of a 1024-sample window at 44100 Hz.
# NOTE(review): assumes the file's sample rate is 44100 — confirm against sampleRate.
1024/44100

In [None]:

win_size = 1024  # analysis window length in samples
hop = 256        # step between consecutive windows in samples

# Lag values matching the rows kept from each window's correlation.
lags = arange(-cc_lags, cc_lags + 1)

# Running cross-correlation between channel 0 and channel 1, one entry per window.
rccf = []
# The "+ 1" keeps the last full window when the length lines up exactly with the hop.
for start in arange(0, signal.shape[0] - win_size + 1, hop):
    win = signal[start:start + win_size].astype(float)
    left, right = win[:, 0], win[:, 1]
    c = correlate(left, right, mode='full')
    # Normalize by the geometric mean of the two channel energies. If either
    # channel is all zeros in this window, c is already all zeros, so skip the
    # division instead of producing NaN from 0/0.
    energy = sqrt(dot(left, left) * dot(right, right))
    if energy > 0:
        c /= energy
    # In 'full' mode zero lag is at index win_size - 1; keep cc_lags on each side.
    c = c[win_size - 1 - cc_lags:win_size + cc_lags]
    rccf.append(c)

In [None]:
# After the transpose each column is one analysis window and each row one lag
# (row r corresponds to lag r - cc_lags).
imshow(array(rccf).T, aspect='auto')
pass

In [None]:
# Approximate number of analysis windows: signal length divided by the hop size.
# Uses hop rather than repeating its value, so the two cannot drift apart.
shape(signal)[0] / hop

In [None]:
# Plotting a 2-D array draws one line per column.
plot(signal)
pass

#### CC for tempo and rhythm estimation:

In [None]:
# Load the second recording; this rebinds sampleRate and signal.
sampleRate, signal = wavfile.read('media/superstition.wav')
# Only the last expression of a cell is displayed, so both shapes are shown together:
# the full array and the every-second-value view used for playback below.
shape(signal), shape(signal.flat[0::2])

In [None]:
# Plays every second value of the flattened array.
# NOTE(review): that is the first channel only if the file is 2-channel — confirm.
Audio(data = signal.flat[0::2], rate = sampleRate)

In [None]:
# Waveform of the same every-second-value view that was played above.
plot(signal.flat[0::2]) # what are we looking at here?
pass

In [None]:
# Shape of the loaded array.
signal.shape

In [None]:
# Autocorrelation of the first 150000 samples of column 0, for lags -60000..60000.
lags, c, line, b = acorr(signal[0:150000,0].astype(double), maxlags=60000)
pass

In [None]:
# Same autocorrelation on a later excerpt (samples 90000..299999 of column 0).
lags, c, line, b = acorr(signal[90000:300000,0].astype(double), maxlags=60000)
pass

What question is the plot above answering?

In [None]:
# Tempo estimate in beats per minute.
# c[:12000] covers the most negative lags (-60000..-48001 with maxlags=60000);
# the lag of the strongest peak there is negated to get a positive lag in samples
# and divided by the file's sample rate to get seconds.
# NOTE(review): the factor 2 presumably means that peak spans two beats — confirm.
2 * 60 / ((-lags[argmax(c[:12000])]) / sampleRate)

In [None]:
# Same autocorrelation again, with the x axis limited to (roughly) the positive lags.
lags, c, line, b = acorr(signal[90000:300000,0].astype(double), maxlags=60000)
xlim((-10, 60000))
pass

## Two-dimensional cross-correlation

We can find a given pattern in a larger image...

<!--
![sidereusnuncius_crawford_dbyn2008.jpg](media/sidereusnuncius_crawford_dbyn2008.jpg)

-->
![](media/galileo.png)

In [None]:
# Load the page image into i and display it; the cell output is the array shape.
i = imread('media/galileo.png')
imshow(i)
shape(i)

In [None]:
# Same shape, read from the attribute.
i.shape

In [None]:
# Element type of the loaded image array.
i.dtype

In [None]:
# Collapse to one channel: drop the last channel (presumably alpha) and average
# the remaining ones, assuming three are left.
# Not safe to re-run: i is overwritten with a 2-D array, so a second run fails
# on the 3-D indexing.
i = sum(i[:,:,:-1], axis=2)/3.0
imshow(i)
colorbar()
shape(i)

In [None]:
# Same single-channel image drawn with a gray colormap.
imshow(i, cmap=cm.gray)
colorbar()
pass

In [None]:
# Binarize: values above 0.9 become 0 and everything else becomes 1.
# Not safe to re-run: i is overwritten, and a second run on the 0/1 image
# inverts it.
i = where(i > 0.9, 0, 1)
imshow(i, cmap='gray')
colorbar()

In [None]:
# Load the small template image to search for.
o = imread('media/o.png')
imshow(o)
colorbar()
pass

In [None]:
# Template array shape.
shape(o)

In [None]:
# Template element type.
o.dtype

In [None]:
# Preview: sum over the last axis divided by 4 (o itself is not modified here).
o.astype(float).sum(axis=-1) / 4

In [None]:
# Average over the last axis, then binarize with the same 0.9 threshold used for i.
# NOTE(review): this averages all 4 channels, whereas i was averaged over 3 with
# its last channel dropped, so the same gray level maps to different values in
# the two images — confirm this is intended.
o = o.astype(float).sum(axis=-1) / 4
o = where(o > 0.9, 0, 1)
imshow(o, cmap=cm.gray, interpolation='nearest')
shape(o)

In [None]:
from scipy.signal import correlate2d
# 2-D cross-correlation of the binarized page with the binarized template.
# correlate2d defaults to mode='full', so cc is larger than i.
cc = correlate2d(i, o)
imshow(cc)
colorbar()
gcf().set_figheight(8)

In [None]:
# Same correlation map with a gray colormap.
imshow(cc, cmap=cm.gray)
colorbar()
gcf().set_figheight(8)

In [None]:
# Mask of positions whose correlation exceeds 200.
# NOTE(review): the threshold looks hand-picked for these images.
imshow(where(cc > 200, 1, 0), interpolation='nearest', cmap=cm.gray)
colorbar()
gcf().set_figheight(8)

In [None]:
# Stricter mask: only positions whose correlation exceeds 300.
imshow(where(cc > 300, 1, 0), interpolation='nearest', cmap=cm.gray)
colorbar()
gcf().set_figheight(8)

In [None]:
# Import from the public scipy.ndimage namespace; the scipy.ndimage.filters
# submodule path is deprecated.
from scipy.ndimage import maximum_filter

# Replace each value by the maximum over a 10x10 neighbourhood.
imshow(maximum_filter(cc, (10,10)))
gcf().set_figheight(8)

In [None]:
# Left: correlation map after a 50x50 maximum filter. Right: the binarized page.
subplot(121)
imshow(maximum_filter(cc, (50,50)))
subplot(122)
imshow(i, cmap=cm.gray)
gcf().set_figheight(8)

In [None]:
# Keep the 50x50 maximum-filtered map for the peak search below.
mf = maximum_filter(cc, (50,50))

In [None]:
# Index of the maximum in the flattened array.
argmax(mf)

In [None]:
# Convert the flat index into a (row, column) position in mf.
unravel_index(argmax(mf), mf.shape)

In [None]:
# Flat index of the maximum of the unfiltered correlation map.
argmax(cc)

In [None]:
# (row, column) of the strongest match in cc. cc comes from a 'full'-mode
# correlation, so these are coordinates in cc, not directly in the page image i.
unravel_index(argmax(cc), cc.shape)

By: Andrés Cabrera mantaraya36@gmail.com
For MAT course MAT 201A at UCSB

Adapted by Karl Yerkes

This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/

![http://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png](http://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png)