In [1]:
%matplotlib inline

import time
import gc

import numpy as np
import pandas as pd

# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches


import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper")

from mpl_toolkits.basemap import Basemap

import mpld3

class d3(object):
    """with statement for d3 in only one plot"""

    def __enter__(self):
        mpld3.enable_notebook()

    def __exit__(self ,type, value, traceback):
        mpld3.disable_notebook()

        
class Swap(object):

    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs

    def __enter__(self):
        pass

    def __exit__(self, type, value, traceback):
        pass
        
        
class SwapPalette(Swap):
 
# Actually not required. you can just do this:
# with sns.color_palette("PuBuGn_d"):
#     sinplot()  
    
    def __enter__(self):
        self.orig = sns.color_palette()
        sns.set_palette(*self.args, **self.kwargs)
        
    def __exit__(self, type, value, traceback):
        sns.set_palette(self.orig)

        
class SwapStyle(Swap):
    
    def __enter__(self):
        self.orig = sns.axes_style()
        sns.set_style(*self.args, **self.kwargs)
        
    def __exit__(self, type, value, traceback):
        sns.set_style(self.orig)

        
class SwapContext(Swap):
    
    def __enter__(self):
        self.orig = sns.plotting_context()
        sns.set_context(*self.args, **self.kwargs)
        
    def __exit__(self, type, value, traceback):
        sns.set_style(self.orig)

        
class Timer(object):
    
    def __init__(self, verbose=True):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        if self.verbose:
            print('elapsed time: {} secs'.format(self.secs))


In [2]:
import os
from collections import defaultdict
from math import sin, cos, sqrt, atan2, pi

# read in meta data to get sex of each individual
def get_meta_data():

    meta_data_file_name = os.path.join('data', 'nature18964-s2-fixed-genders.csv')
    meta_data = dict()
    with open(meta_data_file_name, 'r') as f:
        keys = f.readline().split(';')
        for l in f:
            d = dict(zip(keys, l.split(';')))
            if d['Embargo level (X=Fully Public, Y=Signed Letter)'] == 'X':
                meta_data[d['Sample ID (SGDP)']] = d

    # dicts of samples from each population and region
    populations = defaultdict(list)
    regions = defaultdict(list)
    for key in meta_data:
        pop = meta_data[key]['Population ID']
        populations[pop].append(key)

        reg = meta_data[key]['Region']
        regions[reg].append(key)

    return meta_data, populations, regions

simons_meta_data, simons_populations, simons_regions = get_meta_data()

simons_pop_locations = dict()
for pop in simons_populations:
    for indiv in simons_populations[pop]:
        try:
            lat = float(simons_meta_data[indiv]['Latitude'].replace(',', '.'))
            long = float(simons_meta_data[indiv]['Longitude'].replace(',', '.'))
        except ValueError:
            print("problem with", pop, simons_meta_data[indiv]['Latitude'], simons_meta_data[indiv]['Longitude'])
            continue
        if pop not in simons_pop_locations:
            simons_pop_locations[pop] = []
        simons_pop_locations[pop].append((lat, long))

def center_geo(latitudes, longitudes):

    lat_list, lon_list = list(latitudes), list(longitudes)
    
    x_list = [cos(lat * pi/180 ) * cos(lon * pi/180 ) for lat, lon in zip(lat_list, lon_list)]
    y_list = [cos(lat * pi/180 ) * sin(lon * pi/180 ) for lat, lon in zip(lat_list, lon_list)]
    z_list = [sin(lat * pi/180 ) for lat in lat_list]
    
    x = sum(x_list)/len(x_list)
    y = sum(y_list)/len(y_list)
    z = sum(z_list)/len(z_list)

    center_long = atan2(y, x)  * 180 / pi
    hyp = sqrt(x * x + y * y)
    center_lat = atan2(z, hyp)  * 180 / pi
    
    return center_lat, center_long
    
              
for pop in simons_pop_locations:    
    simons_pop_locations[pop] = center_geo(*zip(*simons_pop_locations[pop]))
              
              
# populations sorted by main region:
regions = ['Africa', 'WestEurasia', 'CentralAsiaSiberia', 'SouthAsia', 'Oceania', 'EastAsia', 'America']
sorted_simons_populations = list()
for region in regions:
    region_pops = list(set(x.split('-')[0][2:] for x in simons_regions[region]))
    sorted_simons_populations.extend(region_pops)
print(sorted_simons_populations)




problem with Crete ? ?
problem with Crete ? ?
problem with Australian .. ..
problem with Norwegian .. ..
problem with Balochi .. ..
problem with Bergamo .. ..
['Mozabite', 'Esan', 'Masai', 'Mbuti', 'Mende', 'Biaka', 'Mandenka', 'Dinka', 'Yoruba', 'Luhya', 'Khomani_San', 'BantuTswana', 'BantuKenya', 'BantuHerero', 'Ju_hoan_North', 'Saharawi', 'Somali', 'Luo', 'Gambian', 'Saami', 'Norwegian', 'Hungarian', 'Finnish', 'Lezgin', 'Crete', 'Armenian', 'Bergamo', 'Icelandic', 'North_Ossetian', 'Tuscan', 'Iraqi_Jew', 'Iranian', 'Jordanian', 'Tajik', 'Spanish', 'Abkhasian', 'Adygei', 'Czech', 'Georgian', 'Bulgarian', 'Basque', 'French', 'Yemenite_Jew', 'Druze', 'Russian', 'Samaritan', 'Sardinian', 'Greek', 'Polish', 'Turkish', 'Estonian', 'Albanian', 'BedouinB', 'Palestinian', 'English', 'Orcadian', 'Chechen', 'Ulchi', 'Kyrgyz', 'Tubalar', 'Eskimo_Chaplin', 'Eskimo_Sireniki', 'Altaian', 'Tlingit', 'Mansi', 'Mongola', 'Even', 'Yakut', 'Eskimo_Naukan', 'Chukchi', 'Itelman', 'Aleut', 'Kalash', 'Kho

In [3]:
def flatten_column_multi_index(df):
    df.columns = ['_'.join(col).strip() for col in df.columns.values]


In [4]:
colnames = ['chr', 'pos', 'end', 'popu', 'ind1', 'ind2', 'sex2', 'sex2', 'diffs', 'total']
mask = lambda x: np.where(x.total > 30000)
df = (pd.read_table('data/simons_pi_data_x.tsv', names=colnames)
      .assign(pi = lambda x: x.diffs / x.total)
      .assign(species = 'H')
      .loc[mask]
     )[['chr', 'pos', 'species', 'popu', 'pi']]
df = (df
      .groupby(['chr', 'pos', 'species', 'popu'])
      .aggregate([np.mean])
     )
flatten_column_multi_index(df)




simons_pi_data = (df
                  .reset_index()
                  .assign(is_low = lambda x: x.pi_mean < np.mean(x.pi_mean) * 0.05)#0.0002)
                 )
simons_pi_data['pos'] /= 100000

simons_pi_data['popu'] = [x.split('_')[1] for x in simons_pi_data['popu']]

simons_pi_data.head()

Unnamed: 0,chr,pos,species,popu,pi_mean,is_low
0,X,1.0,H,Australian,0.004253,False
1,X,1.0,H,Crete,0.006075,False
2,X,1.0,H,Abkhasian,0.003978,False
3,X,1.0,H,Adygei,0.00477,False
4,X,1.0,H,Aleut,0.003337,False


In [2]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

from math import log
from functools import wraps
from operator import itemgetter
from bisect import bisect
import sys
import pandas as pd
from collections import OrderedDict, defaultdict


def even_windows(df, nrobs):

    intervals = sorted((x.start, x.end) for x in full_df[['start', 'end']].itertuples())
#    intervals = list(df[['start', 'end']].itertuples())
    bins = list()

    queue = list()
    total = 0
    i = 0 # interval index
    pos = 0 # sequence index
    prev_bin_end = 0

    intervals_end = intervals[-1][1]
    while pos < intervals_end:

        # get any new intervals
        while i < len(intervals) and pos == intervals[i][0]:
            assert intervals[i][0] == int(intervals[i][0]), 'only ints please'
            queue.insert(bisect(queue, intervals[i][1]), intervals[i][1]) # put the end in a sorted queue
            i += 1

        # remove intervals no longer overlapping:
        while queue and queue[0] <= pos:
            queue.pop(0)

        # update running total
        total += len(queue)

        if total >= nrobs:
            binsize = pos + 1 - prev_bin_end
            bins.append(binsize)
            prev_bin_end = pos + 1
            total = 0

        pos += 1

    binsize = pos - prev_bin_end
    bins.append(binsize)

    return bins


class Bin(object):
    def __init__(self, binsize=None, logbase=1, bins=None):
        self.bin_size = binsize
        self.log_base = logbase
        self.bin_list = bins
        if self.bin_list is not None:
            assert logbase == 1 and not binsize, "Don't use bins with binsize or logbase"
            self.bin_list = bins[:]
            self.bin_size = self.bin_list.pop(0)
        
    def __iter__(self):
        self.bin_start = 0
        self.exhausted = False
        return self
    
    def next(self):
        next_bin = self.bin_start, self.bin_size
        if self.bin_list is not None:
            self.bin_start += self.bin_size
            if self.bin_list:
                self.bin_size = self.bin_list.pop(0)
            else:
                self.bin_size = float('inf')
        elif self.log_base == 1 or self.bin_start == 0:
            self.bin_start += self.bin_size

        else:
            prev_bin_size = self.bin_size
            self.bin_size = self.log_base**(log(self.bin_size, self.log_base)+1)
            self.bin_start += prev_bin_size
        return next_bin

    __next__ = next # Python 3.X compatibility


def window(size=None, logbase=1, fixed=None, even=None):
    def window_decorator(func):
        @wraps(func)
        def func_wrapper(full_df):

            assert not(fixed and even), "only fixed or even bins - not both"
            if even is None:
                get_bin = iter(Bin(binsize=size, logbase=logbase, bins=fixed))
            else:
                get_bin = iter(Bin(binsize=size, logbase=logbase, bins=even_windows(full_df, even)))    
            
            bin_start, bin_size = get_bin.next()
            
            buf = list()
            list_of_stat_results = list()

            def process(buf):
                df = pd.DataFrame(buf)
                df.loc[df.start < bin_start, 'start'] = bin_start
                df.loc[df.end > bin_start + bin_size, 'end'] = bin_start + bin_size
                list_of_stat_results.append(([bin_start, bin_start + bin_size], func(df)))

            for row_sr in full_df.itertuples():

                while row_sr.start >= bin_start + bin_size:
                    if buf:
                        process(buf)
                    bin_start, bin_size = get_bin.next()
                    buf = [x for x in buf if x.end > bin_start]

                buf.append(row_sr)

            # empty buffer
            while buf:
                process(buf)
                bin_start, bin_size = get_bin.next()
                buf = [x for x in buf if x.end > bin_start]

            # format output
            def concat_dicts(l):
                d = dict()
                pairs = [b for a in zip(*[x.items() for x in l]) for b in a]
                for k, v in pairs:
                    d.setdefault(k, []).append(v)
                return d                

            coordinates, stats = zip(*list_of_stat_results)
            if type(stats[0]) is dict:
                d = OrderedDict(zip(('start', 'end'), zip(*coordinates)))
                d.update(concat_dicts(stats))
                return pd.DataFrame(d)#.drop([0], axis=1)

            else:
                return pd.DataFrame([x + [y] for x, y in list_of_stat_results],
                                    columns=['start', 'end', func.__name__])
        
        return func_wrapper
    return window_decorator


def store_groupby_apply(store_file_name, col_names, fun, df_name='df', group_keys=True):

    if type(col_names) is str:
        col_names = [col_names]    
        
    with pd.get_store(store_file_name) as store:
        groups = store.select(df_name, columns=col_names).drop_duplicates()
        df_list = []
        for tup in groups.itertuples():
            mask = ["{}={}".format(col, getattr(tup, col)) for col in col_names]
            grp_df = store.select(df_name, where = mask)
            stats_df = fun(grp_df)            
            if group_keys:
                stats_df = (stats_df
                            .assign(**dict((col, getattr(tup, col)) for col in col_names))
                            .set_index(col_names)
                            )
            df_list.append(stats_df)

    return pd.concat(df_list)


if __name__ == "__main__":

    full_df = pd.DataFrame({'chrom': ['chr1']+['chr2']*10, 
                        'start': list(range(11)), 
                        'end': list(map(sum, zip(range(11), [5, 1]*5+[20]))),
                        'value': 'AAA',
                       'foo': 7, 'bar': 9, 'baz' : 4})
    print(full_df)
    
    # call this function windows of size 5
    @window(size=5)
    def count1(df):
        return len(df.index)

    print(full_df.groupby(['chrom', 'bar']).apply(count1))
    #print(full_df.groupby(['chrom', 'bar']).apply(count1).reset_index())

    print(full_df.groupby('chrom').apply(count1))#.reset_index())

    # call this function on windows beginning at size 2 increasing by log 2
    @window(size=2, logbase=2)
    def count2(df):
        return len(df.index)

    print(full_df.groupby('chrom').apply(count2))#.reset_index(drop=True))

    # call this function on windows with ~10 observations in each
    @window(even=10)
    def count3(df):
        return {'count': len(df.index), 'sum': sum(df.end-df.start)}

    print(full_df.groupby('chrom').apply(count3))#.reset_index(drop=True))

    # call this function on windows with ~10 observations in each
    @window(even=10)
    def stats_fun(df):
        sr = df[['foo','bar']].sum()
        return sr.to_dict()

    df = full_df.groupby(['chrom', 'baz']).apply(stats_fun)
    print(df)
    print(df.index)

    # write the data frame to a hdf5 store
    def write_df_store(df, store_file_name, df_name='df', table=True, append=False):
        with pd.get_store(store_file_name) as store:
            store.append(df_name, df, data_columns=list(df.columns.values), table=table, append=append)

    write_df_store(full_df, 'groupby.h5')

    # perform the same groupby and apply operation as on the data frame
    df = store_groupby_apply('groupby.h5', ['chrom', 'baz'], stats_fun)
    print(df)
    print(df.index)
    #print(df.reset_index())

    
    # next question: What happens when we group by the index (can we do that?) - and if so: should 
    # we then still add the "groupby" columns to the resulting dataframes?
    

    bar  baz chrom  end  foo  start value
0     9    4  chr1    5    7      0   AAA
1     9    4  chr2    2    7      1   AAA
2     9    4  chr2    7    7      2   AAA
3     9    4  chr2    4    7      3   AAA
4     9    4  chr2    9    7      4   AAA
5     9    4  chr2    6    7      5   AAA
6     9    4  chr2   11    7      6   AAA
7     9    4  chr2    8    7      7   AAA
8     9    4  chr2   13    7      8   AAA
9     9    4  chr2   10    7      9   AAA
10    9    4  chr2   30    7     10   AAA
             start  end  count1
chrom bar                      
chr1  9   0      0    5       1
chr2  9   0      0    5       4
          1      5   10       7
          2     10   15       3
          3     15   20       1
          4     20   25       1
          5     25   30       1
         start  end  count1
chrom                      
chr1  0      0    5       1
chr2  0      0    5       4
      1      5   10       7
      2     10   15       3
      3     15   20       1
      4     

In [11]:
# subset of pops in this data set, in sorted order:
pops_in_df = set(simons_pi_data_win.reset_index()['popu'])
pop_sorting = [x for x in sorted_simons_populations if x in pops_in_df]
print(len(pop_sorting), len(sorted_simons_populations))


106 130


"['Mozabite', 'Esan', 'Masai', 'Mbuti', 'Mende', 'Biaka', 'Mandenka', 'Dinka', 'Yoruba', 'Luhya', 'BantuTswana', 'BantuKenya', 'BantuHerero', 'Saharawi', 'Luo', 'Gambian', 'Saami', 'Hungarian', 'Finnish', 'Lezgin', 'Crete', 'Armenian', 'Bergamo', 'Icelandic', 'Tuscan', 'Iranian', 'Jordanian', 'Tajik', 'Spanish', 'Abkhasian', 'Adygei', 'Georgian', 'Bulgarian', 'Basque', 'French', 'Druze', 'Russian', 'Sardinian', 'Greek', 'Turkish', 'Estonian', 'BedouinB', 'Palestinian', 'English', 'Orcadian', 'Ulchi', 'Kyrgyz', 'Tubalar', 'Tlingit', 'Mansi', 'Mongola', 'Even', 'Yakut', 'Aleut', 'Kalash', 'Mala', 'Pathan', 'Sindhi', 'Makrani', 'Balochi', 'Brahmin', 'Brahui', 'Burusho', 'Kapu', 'Bengali', 'Yadava', 'Madiga', 'Irula', 'Hazara', 'Kusunda', 'Relli', 'Punjabi', 'Australian', 'Igorot', 'Dusun', 'Papuan', 'Bougainville', 'Ami', 'Kinh', 'Oroqen', 'Dai', 'Naxi', 'Korean', 'Yi', 'Tu', 'Han', 'Japanese', 'Uygur', 'She', 'Thai', 'Miao', 'Lahu', 'Cambodian', 'Xibo', 'Tujia', 'Hezhen', 'Burmese', 'May

In [36]:
def horizon_plot(df, key, width, cut=None, start='start', chrom='chrom', pop='pop', pop_sorting=None):
    """
    Horizon bar plot made allowing multiple chromosomes and multiple samples.
    """
    
    from math import isclose, floor, log10
    
    def horizon(row, i, cut):
        """
        Compute the values for the three 
        positive and negative intervals.
        """
        val = getattr(row, i)

        if val < 0:
            for i in range(4):
                yield 0

        val = abs(val)
        for i in range(3):
            yield min(cut, val)
            val = max(0, val-cut)
        yield int(not isclose(val, 0, abs_tol=1e-8)) * cut

        if val >= 0:
            for i in range(4):
                yield 0

    def chrom_sort(item):
        """
        Sorts in a meaningful way for chromosomes.
        """
        if item.startswith('chr'):
            item = item[3:]
        if item.isdigit():
            return int(item)
        else:
            return item

    def round_to_1_signif(x):
        """
        Rounds to first significant digit.
        """
        return round(x, -int(floor(log10(abs(x)))))

    class SwapStyle(object):
        def __init__(self, *args):
            self.style = args
        def __enter__(self):
            self.orig = sns.axes_style()
            sns.set_style(*self.style)
        def __exit__(self ,type, value, traceback):
            sns.set_style(self.orig)
        
    # set cut if not set
    if cut is None:
        cut = max(max(df[key]), max(-df[key])) / 3

    # make the data frame to plot
    row_iter = df.itertuples()
    col_iterators = zip(*(horizon(row, key, cut) for row in row_iter))
    col_names = ('yp1', 'yp2', 'yp3', 'yp4', 
                 'yn1', 'yn2', 'yn3', 'yn4')
    df2 = (df.copy(deep=False)
           .assign(**dict(zip(col_names, col_iterators)))
          )

    # chromosome names
    chrom_names = list(df.groupby(chrom).groups.keys())
    sorted_chrom_names = sorted(chrom_names, key=chrom_sort)
    
    if pop_sorting is None:
        pop_sorting = sorted(set(df.reset_index()[pop]))
    
    # number of populations
    nr_pop = len(df.groupby(pop).groups)
    
    # sizes of chromosomes
    chrom_sizes = list()
    for chrom_name in sorted_chrom_names:
        chrom_subset = df.loc[df.chrom == chrom_name]
        est_chrom_len = np.max(chrom_subset.start) + width
        chrom_sizes.append(est_chrom_len)
        
    # relative width of each plot facet 
    # (using lengths of chromosomes)
    facet_widths_ratios = chrom_sizes# * nr_pop

    # make the plot
    with SwapStyle('ticks'):

        # make the facet grid
        g = sns.FacetGrid(df2, 
                          col=chrom, 
                          row=pop,
                          # sharex=False,
                          sharex=True,
                          # margin_titles=True,
                          size=0.5, 
                          aspect=50,
                          col_order=sorted_chrom_names,
                          row_order=pop_sorting,                      
                          gridspec_kws={'hspace':0.0, 
                                        "width_ratios": facet_widths_ratios}
                         )

        # plot colors
        colours = sns.color_palette("Blues", 3) + ['black'] + \
                  sns.color_palette("Reds", 3) + ['grey']

        # first y tick
        ytic1 = round_to_1_signif(cut / 3)

        for col_name, colour in zip(col_names, colours):
            plt.setp(g.fig.texts, text="") # hack to make y facet labels align...
            # map barplots to each facet
            g.map(plt.bar, 
                  start, 
                  col_name, 
                  edgecolor = "none", 
                  width=width, 
                  color=colour)
            # no tick labels on x
            g.set(xticklabels=[])
            #g.set_titles('{col_name}', '{row_name}')
    
        g.set_ylabels('')

        def add_pop_labels(pop_label, **kwargs):
            p = pop_label.reset_index(drop=True)[0]
            plt.annotate(p, xy=(1.005 , 0.5), xycoords='axes fraction', ha='left', size=8)

        g.map(add_pop_labels, pop)

        def add_chrom_labels(chrom_label, **kwargs):
            p = chrom_label.reset_index(drop=True)[0]
            plt.annotate(p, xy=(0.5 , 1.005), xycoords='axes fraction', ha='center', size=8)

        g.map(add_chrom_labels, chrom)

        for arr in g.axes:
            for ax, max_val in zip(arr, facet_widths_ratios):
                ax.set_xlim(0, max_val+1)
                ax.set_ylim(0, cut)
                ax.set(xlabel='', ylabel='')
                ax.set(xticks=np.arange(0, max_val, round_to_1_signif(max_val) / 10))
                ax.set(yticks=[ytic1, ytic1*2, ytic1*3])
                g.set_titles('', '')
              
        # remove top and right frame
        sns.despine()

        #plt.tight_layout()

        plt.subplots_adjust(right=0.95)
        
        return g.fig


n = 100
df = pd.DataFrame({'chrom': ['chr11']*2*n + ['chr2']*2*n,
                   'pop': ['EUR']*1*n + ['AFR']*1*n + ['EUR']*1*n + ['AFR']*1*n, 
                   'start': list(range(1*n)) * 4, 
                   'pi': list(np.sin(np.linspace(-np.pi, np.pi, 1*n))) * 4})

print(df.head())

with Timer() as t:
    with SwapContext("notebook", font_scale=0.5):
        fig = horizon_plot(df, 'pi', width=1, pop='pop')

        # save to file
        plt.savefig('tmp.pdf')
#         fig.clf() # clean up memory
        plt.close(fig)  # close to allow garbage collection, also suppresses inline plot
#         gc.collect()

   chrom            pi  pop  start
0  chr11 -1.224647e-16  EUR      0
1  chr11 -6.342392e-02  EUR      1
2  chr11 -1.265925e-01  EUR      2
3  chr11 -1.892512e-01  EUR      3
4  chr11 -2.511480e-01  EUR      4
elapsed time: 8.443495988845825 secs


In [3]:
df = simons_pi_data.assign(chrom = simons_pi_data.chr,
                           start = simons_pi_data.pos*100000, 
                           end = simons_pi_data.pos*100000+100000)

win_size = 1000000

@window(size=win_size)
def pi_mean(df):
    return np.mean(df.pi_mean)

with Timer() as t:
    simons_pi_data_win = df.groupby(['chrom', 'popu']).apply(pi_mean)

with Timer() as t:
    with SwapContext("notebook", font_scale=0.5):
        fig = horizon_plot(simons_pi_data_win.reset_index(), 
                           'pi_mean', 
                           width=win_size, 
                           pop='popu',
                           pop_sorting = pop_sorting,
                           cut=0.0005) # width should be end-start
        # save to file
        plt.savefig('tmp.pdf')
#         fig.clf() # clean up memory
        plt.close(fig)  # close to allow garbage collection, also suppresses inline plot
#         gc.collect()
    

NameError: name 'simons_pi_data' is not defined

In [None]:
df = simons_pi_data.assign(chrom = simons_pi_data.chr,
                           start = simons_pi_data.pos*100000, 
                           end = simons_pi_data.pos*100000+100000)

win_size = 100000

@window(size=win_size)
def pi_mean(df):
    return np.mean(df.pi_mean)

with Timer() as t:
    simons_pi_data_win = df.groupby(['chrom', 'popu']).apply(pi_mean)

with Timer() as t:
    with SwapContext("notebook", font_scale=0.5):
        fig = horizon_plot(simons_pi_data_win.reset_index(), 
                           'pi_mean', 
                           width=win_size, 
                           pop='popu',
                           pop_sorting = pop_sorting,
                           cut=0.0005) # width should be end-start
        # save to file
        plt.savefig('tmp.pdf')
        fig.clf() # clean up memory
        plt.close(fig)  # close to allow garbage collection, also suppresses inline plot
        gc.collect()