In [13]:
import numpy as np
import pandas as pd
from io import StringIO 

## Problem 1

Given an array of 1s and 2s, count how many times the value switches from 1 to 2, i.e. how many positions hold a 1 that is immediately followed by a 2.

In [1]:
# Seed NumPy's global random state so the draw below is reproducible.
np.random.seed(444)

# 100,000 draws from the two values 1 and 2.
x = np.random.choice(a=[1, 2], size=100_000)

For reference, here is the for-loop version:

In [2]:
def count_transitions(x) -> int:
    """Count the adjacent pairs in ``x`` where a 1 is followed by a 2.

    Pure-Python reference implementation: every element is paired with its
    successor and the pairs equal to (1, 2) are tallied one at a time.

    Args:
        x: A sliceable sequence (e.g. a list or 1-D NumPy array).

    Returns:
        The number of 1 -> 2 switches; 0 when ``x`` has fewer than two items.
    """
    # x[:-1] holds each element, x[1:] the element that follows it.
    neighbours = zip(x[:-1], x[1:])
    return sum(1 for before, after in neighbours if after == 2 and before == 1)

count_transitions(x)

24984

How would you make this vectorized and do it in one line? Profile it also to prove that it is faster.

#### ANSWER:

In [3]:
# x[:-1] < x[1:] is True exactly where a 1 is followed by a 2 (the array only
# holds 1s and 2s). count_nonzero tallies the True entries inside NumPy; the
# builtin sum() gives the same count but walks the array one element at a
# time in Python, which gives up most of the benefit of vectorizing.
np.count_nonzero(x[:-1] < x[1:])

24984

Profiling:

In [4]:
%timeit count_transitions(x)

41.5 ms ± 4.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%timeit sum(x[:-1] < x[1:])

18.9 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


The `np.diff` version, which converts the differences to a Python list before counting, falls in between these:

In [6]:
%timeit list(np.diff(x)).count(1)

23.4 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Problem 2
Given panel data, how can you do a vectorized demeaning by group `i`?

In [11]:
panel_data = '''
i,t,value
0,0,4.688025813099681
0,1,5.52269259395655
0,2,3.794489256250384
0,3,3.9649616347982652
0,4,2.4118897054484862
0,5,3.4117301110880547
0,6,5.471047024539948
0,7,2.2243578376592072
0,8,3.0142946545236295
0,9,3.6286363500138106
1,0,20.763821602823516
1,1,21.364728204694217
1,2,16.473117837949424
1,3,22.906416928450746
1,4,20.685359348048078
1,5,23.98534980700863
1,6,17.449606215978182
1,7,24.30998889198093
1,8,19.684197761131074
1,9,19.468066794961956
2,0,13.345220275775793
2,1,13.759424454205883
2,2,15.433405160603295
2,3,7.599836415708792
2,4,12.815756066971403
2,5,9.567004610686734
2,6,13.707836922291087
2,7,5.037831324914107
2,8,9.862822201697297
2,9,17.52951651777798
3,0,2.8745063600488643
3,1,17.500660845021965
3,2,21.60320009500734
3,3,3.615382578465688
3,4,12.067565035781877
3,5,12.34780837405084
3,6,6.555567279947617
3,7,7.524996593472945
3,8,-2.8197131509063347
3,9,-12.42907679575168
'''

In [14]:
# Wrap the inline CSV text in a file-like buffer and parse it into a DataFrame.
csv_buffer = StringIO(panel_data)
sample = pd.read_csv(csv_buffer)

sample

Unnamed: 0,i,t,value
0,0,0,4.688026
1,0,1,5.522693
2,0,2,3.794489
3,0,3,3.964962
4,0,4,2.41189
5,0,5,3.41173
6,0,6,5.471047
7,0,7,2.224358
8,0,8,3.014295
9,0,9,3.628636


__ANSWER__:

In [15]:
# Reshape long -> wide: index by (i, t), then move the group level "i" into
# the columns so each group becomes its own column and rows are indexed by t.
sample_pivot = (
    sample
    .set_index(["i", "t"])
    .unstack(level="i")
)

sample_pivot

Unnamed: 0_level_0,value,value,value,value
i,0,1,2,3
t,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4.688026,20.763822,13.34522,2.874506
1,5.522693,21.364728,13.759424,17.500661
2,3.794489,16.473118,15.433405,21.6032
3,3.964962,22.906417,7.599836,3.615383
4,2.41189,20.685359,12.815756,12.067565
5,3.41173,23.98535,9.567005,12.347808
6,5.471047,17.449606,13.707837,6.555567
7,2.224358,24.309989,5.037831,7.524997
8,3.014295,19.684198,9.862822,-2.819713
9,3.628636,19.468067,17.529517,-12.429077


In [16]:
# Column means (one per group i) are broadcast across the rows and subtracted.
sample_pivot - sample_pivot.mean(axis=0)

Unnamed: 0_level_0,value,value,value,value
i,0,1,2,3
t,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.874813,0.054756,1.479355,-4.009583
1,1.70948,0.655663,1.893559,10.616571
2,-0.018723,-4.235948,3.56754,14.71911
3,0.151749,2.197352,-4.266029,-3.268707
4,-1.401323,-0.023706,0.949891,5.183475
5,-0.401482,3.276284,-2.298861,5.463719
6,1.657835,-3.259459,1.841972,-0.328522
7,-1.588855,3.600924,-6.828034,0.640907
8,-0.798918,-1.024868,-2.003043,-9.703803
9,-0.184576,-1.240999,5.663651,-19.313167


In [17]:
# Demean in wide form, then fold the group columns back into rows to return
# to one row per (t, i) pair.
(
    (sample_pivot - sample_pivot.mean(axis=0))
    .stack()
    .reset_index()
)

Unnamed: 0,t,i,value
0,0,0,0.874813
1,0,1,0.054756
2,0,2,1.479355
3,0,3,-4.009583
4,1,0,1.70948
5,1,1,0.655663
6,1,2,1.893559
7,1,3,10.616571
8,2,0,-0.018723
9,2,1,-4.235948


Alternative way:

In [18]:
# One row per group i holding the mean of 'value' for that group.
avgs = (
    sample
    .groupby("i")['value']
    .mean()
    .reset_index()
)

In [19]:
# Attach each row's group mean as 'value_mean' (the right-hand 'value' column
# receives the "_mean" suffix). Merging from the three base columns only keeps
# this cell re-runnable: otherwise a 'value_mean' column left over from an
# earlier run would collide with the suffixed column produced here.
sample = sample[["i", "t", "value"]].merge(avgs,on='i',suffixes=["","_mean"])

In [20]:
# Demeaned value = observation minus the mean of its own group.
sample["value_demeaned"] = sample["value"] - sample["value_mean"]

In [21]:
sample

Unnamed: 0,i,t,value,value_mean,value_demeaned
0,0,0,4.688026,3.813212,0.874813
1,0,1,5.522693,3.813212,1.70948
2,0,2,3.794489,3.813212,-0.018723
3,0,3,3.964962,3.813212,0.151749
4,0,4,2.41189,3.813212,-1.401323
5,0,5,3.41173,3.813212,-0.401482
6,0,6,5.471047,3.813212,1.657835
7,0,7,2.224358,3.813212,-1.588855
8,0,8,3.014295,3.813212,-0.798918
9,0,9,3.628636,3.813212,-0.184576


## Problem 3

To get the mean and variance, the whole file never needs to be read into memory at once. Instead, just calculate the sums for each chunk and combine them at the end.

Mean:

In [23]:
# Stream the file in fixed-size chunks so it never has to fit in memory.
# The chunk reader can only be iterated once, so it is created in the same
# cell that consumes it; re-running the cell then starts again from the top
# of the file instead of looping over an exhausted reader.
chonker = pd.read_csv("sample.csv",chunksize=100)

L = []       # per-chunk column sums
n_rows = 0   # rows seen so far; replaces the hard-coded 1000000
for i in chonker:
    L.append(i.sum())
    n_rows += len(i)

# Add up the per-chunk sums column by column and divide by the number of rows
# actually read.
avg = pd.concat(L,axis=1).T.sum()/n_rows

In [26]:
avg

0     4.001959
1    20.005804
dtype: float64

Right answer!

Variance (reported below as its square root, the standard deviation):

In [27]:
# Second pass over the file: accumulate squared deviations from the column
# means in `avg`, again one chunk at a time.
chonker = pd.read_csv("sample.csv",chunksize=100)

L = []       # per-chunk sums of squared deviations
n_rows = 0   # rows seen in this pass; replaces the hard-coded 1000000
for i in chonker:
    L.append(((i-avg)**2).sum())
    n_rows += len(i)

# Sum of squared deviations divided by the row count is the variance (with
# divisor n); the square root taken here makes the displayed value the
# standard deviation.
(pd.concat(L,axis=1).T.sum()/n_rows)**.5

0    1.000024
1    3.497127
dtype: float64

Right answer again