In [None]:
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
import csv
from datetime import datetime
import numpy as np

In [None]:
# --- get df we wrote to plot multiple curves ---
# Load the aggregate table from disk and preview its first 20 rows as the
# cell output.
data = pd.read_csv(
    filepath_or_buffer='/media/hill/DATA-LINUX/abm-data/host-density-olderruns/new-model/aggregate-runs/adddf'
)
data.head(n=20)

In [None]:
# Draw one line per 'habitat_suitability' group ('mean' against
# 'host_density'), all on the same axes, then block until the window closes.
fig, ax = plt.subplots()
for suitability, curve_df in data.groupby('habitat_suitability'):
    curve_df.plot(x='host_density', y='mean', ax=ax, label=suitability)
plt.legend(title="Habitat Suitability")
plt.show(block=True)

In [None]:
# ------------------ process Repast file sinks ------------------
# step 1
# Build a dictionary keyed by run number: { run#: (result[4], result[8]) }.
# NOTE(review): result[4] is taken to be host_density (per the original
# comment); result[8] is presumably habitat suitability - confirm against the
# parameter file layout.

# paramfile = "/media/hill/DATA-LINUX/abm-data/host-density/testparams1hab7"
paramfile = "/media/hill/DATA-LINUX/abm-data/host-density-olderruns/new-model/aggregate-runs/params_05habitat"
paramd = {}
with open(paramfile, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        # A blank (e.g. trailing) line has no run number; int('') would raise.
        if not line.strip():
            continue

        # Tabs and commas are both treated as field separators.
        result = line.replace("\t", ",").replace('\n', '').split(',')
        run = int(result[0])
        values = (float(result[4]), float(result[8]))

        # The stored value is an immutable tuple, so it cannot be appended to.
        # A repeated run number is accepted only when it carries the same
        # values; a conflicting repeat is reported explicitly.
        if run in paramd and paramd[run] != values:
            raise ValueError(
                "{}: line {} redefines run {} as {} (was {})".format(
                    paramfile, line_number, run, values, paramd[run]))
        paramd[run] = values

for k, v in paramd.items():
    print(k, v)


In [None]:
# NEW CODE - dont build so many data structures
# step 2: read the raw csv straight into a DataFrame

# Alternate (test) input:
# csv_file = "/media/hill/DATA-LINUX/abm-data/host-density/testdf3-badlines"
csv_file = "/media/hill/DATA-LINUX/abm-data/host-density-olderruns/new-model/aggregate-runs/density-new.2020.Jul.11_hab05"
colnames = ['run', 'tick', 'lifestate', 'total_ixodes']

# Time the read; it was measured at ~15-30s.
before = datetime.now()
# The first 8 rows are skipped, and malformed rows are dropped rather than
# raising. (An earlier variant also passed explicit dtypes for each column.)
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines` - revisit when upgrading pandas.
df = pd.read_csv(
    csv_file,
    names=colnames,
    skiprows=8,
    error_bad_lines=False,
)
after = datetime.now()
print("Run time: ", after - before)
df.head(10)

In [None]:
# step 2.b filter df
# (An earlier experiment timed dropping the 'lifestate' and 'tick' columns
# here; it is no longer run.)

print("Filtering database...")
before = datetime.now()
print("Max before filtering...", df['tick'].max())
# Vectorised boolean mask: only rows with tick below 451 are kept, which
# removes the rows that came from bad lines.
df = df.loc[df['tick'].lt(451)]
after = datetime.now()
print("Filtering runtime: ", after - before)
print("Max after filtering...", df['tick'].max())

df.head()


In [None]:
# 9/15 removing outliers. Test code to find out how many outliers we have:
# list every run whose largest tick is below 450.
#
# The per-run maximum is computed in one grouped pass. No loop variable named
# `data` is used, so the aggregate table loaded at the top of the notebook is
# not overwritten, and only the outlier runs are printed (no blank lines).
last_tick_per_run = df.groupby('run')['tick'].max()
short_runs = last_tick_per_run[last_tick_per_run < 450]
for run, days in short_runs.items():
    print((run, days))
print("Runs ending before tick 450:", len(short_runs), "of", len(last_tick_per_run))


In [None]:
# Dropping these columns does not seem to improve performance much; re-test on larger files.
# df.drop(['tick', 'lifestate'], axis=1, inplace=True)
# df.head(10)

In [None]:
print("Groupby and agg...")
before = datetime.now()
# Per run: the number of distinct 'total_ixodes' values and the largest tick.
# The max tick is kept so outlier runs can be tested for / filtered later.
# (pd.NamedAgg would be the alternative spelling of this aggregation.)
n_ticks_df = (
    df.groupby(['run'], as_index=False)
      .agg({'total_ixodes': 'nunique', 'tick': 'max'})
)
after = datetime.now()
print("Groupby runtime: ", after - before)
# takes ~14s
n_ticks_df.head()

In [None]:
# Filter outliers - working
# Only runs whose largest tick is above 90 are kept.
n_ticks_df = n_ticks_df.loc[n_ticks_df['tick'].gt(90)]

In [None]:
# step 3
# Attach the host density of each run to its row in n_ticks_df.
#
# paramd maps run# -> (result[4], result[8]); float() on that tuple raises
# TypeError, so the first element is used.
# NOTE(review): element 0 is assumed to be host_density - confirm against the
# parameter file. FIXME? check theres no off by 1
host_density_by_run = {run: float(values[0]) for run, values in paramd.items()}
# One vectorised lookup instead of a .loc assignment per run. Runs missing
# from paramd get NaN. .assign returns a new frame, so the filtered slice
# held in n_ticks_df is never mutated in place.
n_ticks_df = n_ticks_df.assign(
    host_density=n_ticks_df['run'].map(host_density_by_run)
)
n_ticks_df.head()


In [None]:
# Step 4 make final df
# Mean and standard deviation of 'total_ixodes' for every host_density value.
# A list (not a set literal) is passed to .agg so the output columns always
# come out in the order ['mean', 'std']; a set has no defined order.
final_df = n_ticks_df.groupby('host_density')['total_ixodes'].agg(['mean', 'std'])
final_df.head()


In [None]:
# Step 5 - write to csv
# mode='w' overwrites the target file. Idea: a 'first file' flag could switch
# between write and append mode.
final_df.to_csv(
    '/media/hill/DATA-LINUX/abm-data/host-density-olderruns/new-model/aggregate-runs/dfagg',
    mode='w',
)

In [None]:
# example of how to plot: mean of total_ixodes per host density, with error
# bars of two standard deviations.
# marker='o' is passed to errorbar itself; a bare plt.plot(marker='o') has no
# data and draws nothing, so the markers never showed up.
plt.errorbar(final_df.index, final_df['mean'], yerr=2*final_df['std'], marker='o')
plt.xlabel("Host Density")
plt.ylabel("Total Ixodes")
plt.show()

In [None]:
# # OLD step 4 make the final df
# n_ticks = n_ticks.groupby('host_density', )
# n_ticks = n_ticks.agg({'total_ixodes': 'mean'})
# n_ticks.head()

In [None]:
# mean = n_ticks.groupby('host_density')['total_ixodes'].mean()
# std = n_ticks.groupby('host_density')['total_ixodes'].std()
# mean

In [None]:
# How to get rid of the bad lines ------------
# Timed boolean-mask filter: rows with tick below 451 are copied into new_df;
# the max tick is printed before and after for comparison.
print(df['tick'].max())
before = datetime.now()
new_df = df.loc[df['tick'].lt(451)]
after = datetime.now()
print("Run time: ", after - before)
print(new_df['tick'].max())
print(new_df.head())

In [None]:
# Note, as of yet there is no notable performance benefit from dropping these
# The frame is rebound instead of mutated with inplace=True, and columns that
# are already gone are ignored, so this cell can be re-run without a KeyError.
df = df.drop(columns=['tick', 'lifestate'], errors='ignore')
df.head()

In [None]:
# -- OLD way
# Map every run number to its count of distinct 'name' values
# { run#: cumulative_ixode }
colnames = ['run', 'tick', 'lifestate', 'name']
csv_file = "/media/hill/DATA-LINUX/abm-data/host-density/testdf1hab"
# The file has no header row; malformed rows are dropped.
df = pd.read_csv(csv_file, header=None, names=colnames, error_bad_lines=False)
# groupby yields (run number, sub-frame) pairs.
ixode_count_dict = {
    run_id: len(run_df['name'].unique())
    for run_id, run_df in df.groupby('run')
}


In [None]:
# Turn the per-run counts from ixode_count_dict into a two-column frame and
# add a 'host_density' column initialised to 0.
df_final = pd.DataFrame(
    ixode_count_dict.items(),
    columns=['run', 'total_ixodes'],
).assign(host_density=0)
df_final.dtypes

In [None]:
# Add the host density value from the param dictionary to the associated run.
#
# paramd maps run# -> (result[4], result[8]); only the first element is
# written to the column (assigning the whole tuple to a cell is not valid).
# NOTE(review): element 0 is assumed to be host_density - confirm.
host_density_by_run = {run: float(values[0]) for run, values in paramd.items()}
# Single vectorised lookup; the former inner `range(len(paramd))` loop only
# repeated the same assignment. Runs absent from paramd keep their current
# host_density value.
df_final['host_density'] = (
    df_final['run'].map(host_density_by_run).fillna(df_final['host_density'])
)

df_final.dtypes

In [None]:
# Now we have a df with | 'run' | 'total_ixode' | 'host_density' |
# Build { host_density: agg_ixodes }, where agg_ixodes is the mean of
# 'total_ixodes' within each host_density group.
# groupby yields (key, sub-frame) tuples, unpacked here directly.
agg_ixodes_dict = {
    host_density: group_df['total_ixodes'].mean()
    for host_density, group_df in df_final.groupby('host_density')
}

# in test file - should get 1880.75 for 0.4, 7898.5 for 0.7, and 24917 for 1.0

In [None]:
# Convert agg_ixodes_dict into a plottable two-column frame.
df_agg_final = pd.DataFrame(
    data=agg_ixodes_dict.items(),
    columns=['host_density', 'agg_ixodes'],
)
df_agg_final.head()

In [None]:
# testing plotting visuals
# Select the Qt5Agg backend, then plot agg_ixodes against host_density and
# block until the window is closed.
matplotlib.use('Qt5Agg')
plt.plot(df_agg_final.host_density, df_agg_final.agg_ixodes)
plt.show(block=True)

In [None]:
# Build a text summary for every row of the parameter file, keyed by the
# row's first field, and print the resulting dictionary.
# NOTE(review): csv.reader splits on commas only, whereas the run -> density
# parsing of the same file also treats tabs as separators - confirm that
# fields 1-3 really are lifestage / starting ixodes / habitat suitability.
with open(paramfile, 'r') as file:
    reader = csv.reader(file)
    param_str = {}
    for rows in reader:
        param_str[rows[0]] = "Lifestage: {}\nStarting Ixodes: {}\nHabitat Suitability: {}".format(
            rows[1], rows[2], rows[3])
    print(param_str)

In [None]:
# Labelled version of the aggregate plot, with a text box that states the
# parameters of the run set (hard-coded text).
fig, ax = plt.subplots()
ax.set(
    title="Test Agg Host Density",
    xlabel="Host Density",
    ylabel="Total Ixodes",
)
ax.plot(df_agg_final['host_density'], df_agg_final['agg_ixodes'])
# Styling of the box drawn behind the figure text.
props = {'facecolor': 'wheat', 'alpha': 0.5}
plt.figtext(0.5, 0.5, 'Lifestage: adult\nStarting Ixodes: 10\nHabitat Suitability: 0.05', bbox=props)
plt.show(block=True)

In [None]:
# Overlay test: plot a copy of the aggregate frame, then draw the original
# frame onto the axes returned by the first plot call.
df2 = df_agg_final.copy()
shared_axes = df2.plot()
df_agg_final.plot(ax=shared_axes)
plt.show(block=True)

In [None]:
# -- OLD way
# NOTE(review): this cell repeats an earlier "-- OLD way" cell of this
# notebook verbatim; consider keeping only one copy.
# Map every run number to its count of distinct 'name' values
# { run#: cumulative_ixode }
colnames = ['run', 'tick', 'lifestate', 'name']
csv_file = "/media/hill/DATA-LINUX/abm-data/host-density/testdf1hab"
# The file has no header row; malformed rows are dropped.
df = pd.read_csv(csv_file, header=None, names=colnames, error_bad_lines=False)
# groupby yields (run number, sub-frame) pairs.
ixode_count_dict = {
    run_id: len(run_df['name'].unique())
    for run_id, run_df in df.groupby('run')
}


In [None]:
# Turn the per-run counts from ixode_count_dict into a two-column frame and
# add a 'host_density' column initialised to 0.
df_final = pd.DataFrame(
    ixode_count_dict.items(),
    columns=['run', 'total_ixodes'],
).assign(host_density=0)
df_final.dtypes

In [None]:
# Add the host density value from the param dictionary to the associated run.
#
# paramd maps run# -> (result[4], result[8]); only the first element is
# written to the column (assigning the whole tuple to a cell is not valid).
# NOTE(review): element 0 is assumed to be host_density - confirm.
host_density_by_run = {run: float(values[0]) for run, values in paramd.items()}
# Single vectorised lookup; the former inner `range(len(paramd))` loop only
# repeated the same assignment. Runs absent from paramd keep their current
# host_density value.
df_final['host_density'] = (
    df_final['run'].map(host_density_by_run).fillna(df_final['host_density'])
)

df_final.dtypes

In [None]:
# Now we have a df with | 'run' | 'total_ixode' | 'host_density' |
# Build { host_density: agg_ixodes }, where agg_ixodes is the mean of
# 'total_ixodes' within each host_density group.
# groupby yields (key, sub-frame) tuples, unpacked here directly.
agg_ixodes_dict = {
    host_density: group_df['total_ixodes'].mean()
    for host_density, group_df in df_final.groupby('host_density')
}

# in test file - should get 1880.75 for 0.4, 7898.5 for 0.7, and 24917 for 1.0


In [None]:
# Convert agg_ixodes_dict into a plottable two-column frame.
df_agg_final = pd.DataFrame(
    data=agg_ixodes_dict.items(),
    columns=['host_density', 'agg_ixodes'],
)
df_agg_final.head()

In [None]:
# Select the Qt5Agg backend, then plot agg_ixodes against host_density and
# block until the window is closed.
matplotlib.use('Qt5Agg')
plt.plot(df_agg_final.host_density, df_agg_final.agg_ixodes)
plt.show(block=True)
