# Extracting lxmx data for analysis

You will need the following packages installed to run the notebook (reading the `.xls` file with pandas additionally requires `xlrd`).

In [1]:
import os
import numpy as np
import pandas as pd
import pickle #for saving and loading the final dictionary

In [2]:
# Folder containing the input spreadsheet.
DATA_DIR = 'data'
# Folder where the pickled result is written.
OUTPUT_DIR = 'outputs'
# Name of the spreadsheet inside DATA_DIR.
FILENAME = 'lxmx_data.xls'
# Label of the sheet column that holds the species names (the sheet is read
# with header=None, so columns are labelled 0, 1, 2, ...).
SPECIES_COL = 4

In [3]:
# Read the raw sheet without treating any row as a header or any column as an
# index, so rows and columns are both labelled by integer position.
LXMX = pd.read_excel(os.path.join(DATA_DIR, FILENAME), index_col=None, header=None)

## Find the species
* store their names
* store the row-slice indices of each species in the DataFrame

In [4]:
def find_species(df, species_col=None):
    """Locate every species block in the raw sheet.

    A row is a species header when its ``species_col`` cell is non-empty,
    except for a non-empty cell directly below a header (that row is skipped
    rather than treated as a new species).

    Parameters
    ----------
    df : pandas.DataFrame
        The raw sheet, one row per spreadsheet row.
    species_col : column label, optional
        Column holding the species names. Defaults to the notebook-level
        ``SPECIES_COL`` constant.

    Returns
    -------
    dict
        ``{species name: (start, end)}`` where ``start`` is the positional row
        of the header and ``end`` is the row of the next header (``len(df)``
        for the last species), i.e. a half-open row range. If a name occurs
        more than once, the last occurrence wins.
    """
    if species_col is None:
        species_col = SPECIES_COL

    # Positional row numbers of all non-empty cells in the species column.
    filled_rows = np.flatnonzero(df[species_col].notna().to_numpy())

    header_rows = []
    last_header = None
    for row in filled_rows:
        # Skip only the row immediately below the most recent header.
        if last_header is not None and row - 1 == last_header:
            continue
        header_rows.append(int(row))
        last_header = row

    # The row numbers are positions, so look the names up positionally.
    species_names = list(df[species_col].iloc[header_rows])

    # Each block ends where the next one starts; the last ends with the sheet.
    bounds = header_rows + [len(df)]
    return dict(zip(species_names, zip(bounds[:-1], bounds[1:])))

In [5]:
# Map each species name to its (start, end) row range in LXMX.
SPECIES_ix = find_species(LXMX)

## Auxiliary classes

#### Data
The Data object stores the matrices, the life history variables, and the author/collector of the data entry as mentioned in the xls.

In [6]:
class Data:
    """One data entry (a block of consecutive sheet rows) of a species.

    Attributes
    ----------
    matrix : pandas.DataFrame
        Sheet columns 0-3 of every row except the last, renamed to
        ``age``, ``lx``, ``mx`` and ``s``.
    vars : pandas.DataFrame
        Sheet columns 5-6 of the first two rows (the life history
        variables), renamed to ``'0'`` and ``'1'``.
    author : object
        Value of sheet column 7 in the last row (author/collector).
    """

    # Column names given to the life-table matrix.
    MATRIX_COLUMNS = ['age', 'lx', 'mx', 's']

    def __init__(self, df):
        """Split the rows of one entry into matrix, vars and author.

        ``df`` is not modified; a re-indexed copy is used internally.
        """
        n_rows = len(df)
        # Work on a 0-based copy so the label slices below line up with row
        # positions and the caller's frame keeps its own index.
        frame = df.reset_index(drop=True)

        # .loc slices are inclusive: rows 0..n_rows-2, columns 0..3.
        # Copy before renaming so the columns of `frame` are untouched.
        self.matrix = frame.loc[0:n_rows - 2, 0:3].copy()
        self.matrix.columns = list(self.MATRIX_COLUMNS)

        # The last row of the entry carries the author in column 7.
        self.author = frame.iloc[n_rows - 1:n_rows][7].item()

        self.vars = frame.loc[0:1, 5:6].copy()
        self.vars.columns = ['0', '1']

    def __repr__(self):
        # __repr__ must return a str; the author cell may be NaN (a float).
        return str(self.author)

#### Species
Species object contains the raw DataFrame slice of all of the species data entries, as well as the name and notes (if present).

In [7]:
class Species:
    """All data entries recorded for one species.

    Attributes
    ----------
    name : str
        Species name (the key used in ``lookup``).
    df : pandas.DataFrame
        The species' rows cut out of the full sheet, re-indexed from 0.
    notes : object or None
        Value of column 4 in the row right after the species header row, or
        ``None`` when that cell is empty or the row does not exist.
    data : list
        One ``Data`` object per data entry found in ``df``.
    """

    # Column holding the notes (same column as the species name).
    NOTES_COL = 4
    # Column whose non-empty cells mark the data entries.
    ENTRY_COL = 6

    def __init__(self, name, df, lookup):
        """Cut the rows of ``name`` out of ``df`` and parse its entries.

        ``lookup`` maps species names to ``(start, end)`` row ranges in ``df``.
        """
        start = lookup[name][0]
        end = lookup[name][1]

        self.name = name
        # reset_index returns a new frame, so the parent sheet is not touched.
        self.df = df.iloc[start:end].reset_index(drop=True)

        # The notes sit one row below the header; that row is missing when
        # the header is the very last row of the sheet.
        notes_cell = df.iloc[start + 1:start + 2][self.NOTES_COL]
        notes = notes_cell.item() if len(notes_cell) else None
        self.notes = None if pd.isna(notes) else notes

        Species.clean_data(self, end)

    def __repr__(self):
        return '{}: {} data entries, notes: {}'.format(self.name, len(self.data), self.notes)

    @staticmethod
    def clean_data(s, end):
        """Split ``s.df`` into entries and store them as ``s.data``.

        Every second non-empty cell of column 6 is taken as the first row of
        an entry (presumably because each entry fills that column in two
        rows - verify against the sheet). ``end`` is the exclusive upper
        bound of the last entry; a value past the end of ``s.df`` is clipped
        by the slice, which is what happens when the constructor passes the
        species' end row in the full sheet.
        """
        marked = np.flatnonzero(s.df[Species.ENTRY_COL].notna().to_numpy())
        starts = list(marked[::2])
        bounds = starts + [end]
        s.data = [
            Data(s.df.iloc[lo:hi])
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]
        
        
        

In [8]:
# Build one Species object per name found in the sheet, keyed by that name.
SPECIES = dict()
for s in SPECIES_ix.keys():
    SPECIES.update({s: Species(name=s, df=LXMX, lookup=SPECIES_ix)})

In [46]:
# Make sure the output folder exists; open() does not create it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE: pickle stores instances by reference to their class. A
# "PicklingError: ... it's not the same object as __main__.Species" typically
# means the class cell was re-run after SPECIES was built, so the stored
# objects belong to the old class. Restart the kernel and run all cells in
# order before executing this one.
with open(os.path.join(OUTPUT_DIR, 'species.pkl'), 'wb') as f:
    pickle.dump(SPECIES, f)

PicklingError: Can't pickle <class '__main__.Species'>: it's not the same object as __main__.Species

## Working with the Species objects

You can list the keys with the following code (the slice at the end only limits the output):

In [32]:
# Show only the first ten species names to keep the output short.
list(SPECIES.keys())[0:10]

['Spermophilus columbianus',
 'Spermophilus lateralis',
 'Theropithecus gelada',
 'Yellow-bellied marmot',
 'Spermophilus armatus',
 'Tursiops truncatus',
 'Papio cynocephalus',
 'Helogale parvula',
 'Callorhinus ursinus',
 'Phacochoerus aethiopicus']

and access a Species object by its name:

In [38]:
# Displayed through Species.__repr__: name, number of data entries, notes.
SPECIES['Papio cynocephalus']

Papio cynocephalus: 1 data entries, notes: CMR

In [40]:
# .data is the list of Data objects; each one is displayed as its author.
SPECIES['Godley (stationary)'].data

[Caughley 1970]

In [41]:
# .matrix is the entry's table with the columns age, lx, mx and s.
SPECIES['Godley (stationary)'].data[0].matrix

Unnamed: 0,age,lx,mx,s
0,0.0,1.0,0.0,0.467
1,1.0,0.467,0.005,0.976
2,2.0,0.456,0.135,0.932
3,3.0,0.425,0.44,0.871
4,4.0,0.37,0.42,0.841
5,5.0,0.311,0.465,0.791
6,6.0,0.246,0.425,0.764
7,7.0,0.188,0.46,0.8
8,8.0,0.139,0.486,0.719
9,9.0,0.1,0.5,0.7
