In [None]:
# Root directory of the Ag1000G release tree used throughout this notebook.
ag1k_dir = '/kwiat/vector/ag1000g/release'

# Put the agam-report-base sources at the front of the import path so that
# the `ag1k` package can be found, then pull in the phase1_ar3 module.
# NOTE(review): `sys` (and `os`, `functools`, `pd`, `allel` used further down)
# are not imported in the cells visible here — confirm an earlier cell does so.
sys.path.insert(0, '../../../selection_paper/agam-report-base/src/python')
from ag1k import phase1_ar3

In [None]:
# Initialise the phase1_ar3 module with the 'phase1.AR3' directory that
# lives under ag1k_dir.
phase1_ar3.init(os.path.join(ag1k_dir, 'phase1.AR3'))

In [None]:
# Chromosome arms to analyse (immutable tuple, iterated in this order below).
chromosomes = ("2R", "2L", "3R", "3L", "X")

In [None]:
@functools.lru_cache()
def calculate_summary_stats(chrom, pop, window_size=100000):
    """Compute windowed diversity statistics for one population on one chromosome.

    Windows are produced by ``allel.equally_accessible_windows`` from the
    ``is_accessible`` mask of ``chrom``; Watterson's theta, nucleotide
    diversity and Tajima's D are then evaluated over those windows using the
    allele counts of the samples whose ``population`` equals ``pop``.

    Parameters
    ----------
    chrom : str
        Key used to look up ``phase1_ar3.accessibility`` and
        ``phase1_ar3.callset_pass``.
    pop : str
        Value matched against the ``population`` column of
        ``phase1_ar3.df_samples``.
    window_size : int, optional
        Size passed to ``allel.equally_accessible_windows``.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns ``start``, ``stop``, ``diversity``,
        ``tajimaD``, ``theta`` and ``midpoint`` (mean of start and stop).

    Notes
    -----
    The function is wrapped in ``functools.lru_cache``, so repeated calls with
    the same arguments return the *same* DataFrame object; copy it before
    modifying it.
    """
    # Accessibility mask and the windows derived from it.
    is_accessible = phase1_ar3.accessibility[chrom]["is_accessible"][:]
    windows = allel.equally_accessible_windows(is_accessible, window_size)

    # Variant positions and allele counts restricted to the requested population.
    callset = phase1_ar3.callset_pass[chrom]
    positions = allel.SortedIndex(callset["variants/POS"][:])
    sample_idx = phase1_ar3.df_samples.query("population == @pop").index
    genotypes = allel.GenotypeChunkedArray(
        callset["calldata/genotype"]).take(sample_idx, axis=1)
    allele_counts = genotypes.count_alleles()

    # Only the per-window statistic is kept from each call; the echoed
    # windows / base counts / variant counts are discarded.
    theta, _, _, _ = allel.stats.windowed_watterson_theta(
        positions, allele_counts, windows=windows, is_accessible=is_accessible)
    pi, _, _, _ = allel.stats.windowed_diversity(
        positions, allele_counts, windows=windows, is_accessible=is_accessible)
    tajD, _, _ = allel.stats.windowed_tajima_d(
        positions, allele_counts, windows=windows)

    columns = {
        "start": windows[:, 0],
        "stop": windows[:, 1],
        "diversity": pi,
        "tajimaD": tajD,
        "theta": theta,
    }
    summary = pd.DataFrame.from_dict(columns)
    summary["midpoint"] = windows.mean(axis=1)

    return summary

In [None]:
# Summary statistics for population "BFS", keyed by chromosome arm.
stats = dict(
    (chrom_name,
     calculate_summary_stats(chrom=chrom_name, pop="BFS", window_size=100000))
    for chrom_name in chromosomes
)

In [None]:
### Use a .gff3 file to annotate the windows computed above

In [None]:
# Parse the GFF3 file whose path is given by phase1_ar3.geneset_agamp42_fn
# into a scikit-allel FeatureTable, keeping only the "ID" attribute.
gff3 = allel.FeatureTable.from_gff3(phase1_ar3.geneset_agamp42_fn, attributes=["ID"])

In [None]:
# Show the loaded feature table as this cell's output.
gff3

In [None]:
annotated_data = {}

# Annotate every window with the IDs of all genes it overlaps.
for chrom in chromosomes:

    # Work on a copy so the frames held in `stats` are left untouched.
    d = stats[chrom].copy()

    # Pull this chromosome's features out of the GFF3 table and keep the
    # gene records only.
    features = pd.DataFrame(gff3.query("seqid == '{0}'".format(chrom)).values)
    genes = features.query("type == 'gene'")
    gene_start = genes["start"].values
    gene_end = genes["end"].values
    gene_id = genes["ID"].values

    # A gene overlaps a window when it starts at or before the window's stop
    # and ends at or after the window's start (both ends inclusive).
    #
    # Looking up only the two window endpoints in an IntervalIndex would
    # report just the genes that straddle a window boundary, miss every gene
    # lying wholly inside the window, and raise KeyError on recent pandas
    # when an endpoint falls outside every feature.
    #
    # pd.unique drops repeated IDs while keeping first-seen order; windows
    # with no overlapping gene get an empty string.
    d["gene"] = [
        ", ".join(pd.unique(gene_id[(gene_start <= stop) & (gene_end >= start)]))
        for start, stop in zip(d["start"], d["stop"])
    ]

    annotated_data[chrom] = d