# BEAST Analysis Notebook

---

# 0. SETUP

In [1]:
import os
import pandas as pd
import seaborn as sns

In [2]:
# ------------------------------------------
BRANCH_LIST = {
    "0.PRE": ["0.PRE1", "0.PRE2"], 
    "0.ANT4" : ["0.ANT4"], 
    "0.PE": ["0.PE2", "0.PE4m", "0.PE4m", "0.PE4t", "0.PE4a", "0.PE5", "0.PE7", "0.PE8", "0.PE10"],   
    "0.ANT": ["0.ANT1", "0.ANT2","0.ANT3","0.ANT5"],     
    "1.PRE" : ["1.PRE0","1.PRE1", "1.PRE2", "1.PRE3"],    
    "1.ANT": ["1.ANT1"], 
    "1.IN": ["1.IN1","1.IN2","1.IN3"],   
    "1.ORI" : ["1.ORI1", "1.ORI2", "1.ORI3"],
    "2.ANT": ["2.ANT1","2.ANT2","2.ANT3" ], 
    "2.MED": ["2.MED0", "2.MED1","2.MED2","2.MED3" ], 
    "3.ANT": ["3.ANT1", "3.ANT2" ],     
    "4.ANT": ["4.ANT1" ],           
}

NUM_STATES = 10

---

# 1. IMPORT

## Import Log Files

In [3]:
log_dir = "/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/beast/all/chromosome/clade"

# Construct a dictionary to hold the dataframes
log_dict = {branch:{"strict": {}, "relaxed": {}} for branch in BRANCH_LIST}

for branch in BRANCH_LIST:
    print(branch)
    for filename in os.listdir(log_dir):
        filepath = os.path.join(log_dir, filename)
        if branch in filename:
            # Find Strict Clock log
            if "SC" in filename:
                strict_log = filepath
            # Find relaxed clock log
            elif "UCLN" in filename:
                relaxed_log = filepath
                
    # Add log files to dict
    log_dict[branch]["strict"]["logfile"] = strict_log
    log_dict[branch]["relaxed"]["logfile"] = relaxed_log

0.PRE
0.ANT4
0.PE
0.ANT
1.PRE
1.ANT
1.IN
1.ORI
2.ANT
2.MED
3.ANT
4.ANT


## Initialize the Parameter data frame

## Parse Log Files to DataFrames

In [10]:
for branch in BRANCH_LIST:
    for clock in log_dict[branch]:
        print(branch, clock)
        log_filename = log_dict[branch][clock]["logfile"]
        with open(log_filename) as infile:
            read_file = infile.read().strip().split("\n")
            
            # Detect how many lines are headers with "#"
            for i in range (0, len(read_file)):
                # These are the column names
                if not read_file[i].startswith("#"):
                    col_names = read_file[i].split("\t")
                    # Add branch and clock to col_names
                    col_names = ["clade","clock"] + col_names
                    break
                    
            # Create dataframe from column names
            param_df = pd.DataFrame({col:[] for col in col_names})
                    
            # Remove the header lines
            read_file = read_file[i+1:]
            
            # Sample the desired number of states, from the end!
            sample_file = read_file[len(read_file)-NUM_STATES:]

            # Parse values into dict
            for line in sample_file:
                split_line = line.split("\t")
                line_dict = {col:float(val) for col,val in zip(col_names, split_line)}
                # add the clade and col values
                param_dict = {"clade": branch, "clock" : clock} + line_dict
                print(param_dict)
        break
    break

0.PRE strict


TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

In [None]:
df = log_dict["0.PRE"]["strict"]["df"]

sns.kdeplot(x=df["meanRate"])