In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import urllib
import os

In [None]:
# Load each raw export into a dict keyed by a short alias.
# The files carry a .csv extension but are parsed as tab-separated.
filenames = {'obs': 'observations.csv', 'loc': 'locations.csv', 'nme': 'name_classifications.csv'}
data = {
    alias: pd.read_csv(os.path.join('..', 'data', csv_name), sep='\t')
    for alias, csv_name in filenames.items()
}

In [None]:
# Tidy the observations table: keep only the needed columns, discard rows with
# any missing value, keep one row per observation id, then rename id -> obs_id.
data['obs'] = (
    data['obs'][['id', 'name_id', 'when', 'location_id']]
    .dropna(ignore_index=True)
    .drop_duplicates(subset=['id'], ignore_index=True)
    .rename(columns={'id': 'obs_id'})
)
data['obs'].head(2)

In [None]:
# Tidy the locations table: keep id and name, discard rows with a missing
# value, keep one row per location id, then rename id -> location_id.
data['loc'] = (
    data['loc'][['id', 'name']]
    .dropna(ignore_index=True)
    .drop_duplicates(subset=['id'], ignore_index=True)
    .rename(columns={'id': 'location_id'})
)
data['loc'].head(2)

In [None]:
# Add is_sc column to locations dataframe.
# A location is flagged when its (lower-cased) name mentions "santa cruz" AND
# its last two comma-separated parts are "california" and "usa".
data['loc']['name'] = data['loc']['name'].str.lower()
data['loc']['split_name'] = data['loc']['name'].str.replace(' ', '').str.split(',')
mentions_sc = data['loc']['name'].str.contains('santa cruz', case=False, na=False)
# .str.get() returns NaN (instead of raising IndexError) when a name has fewer
# than two comma-separated parts, e.g. a bare "santa cruz"; .eq() then turns
# that NaN into False, so such rows are simply not flagged.
ends_in_usa = data['loc']['split_name'].str.get(-1).eq('usa')
in_california = data['loc']['split_name'].str.get(-2).eq('california')
data['loc']['is_sc'] = mentions_sc & in_california & ends_in_usa
data['loc'].head(2)

In [None]:
# Tidy the name-classification table: keep the taxonomy columns, discard rows
# with any missing rank, and keep one row per name_id.
data['nme'] = (
    data['nme'][['name_id', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family']]
    .dropna(ignore_index=True)
    .drop_duplicates(subset=['name_id'], ignore_index=True)
)
data['nme'].head(2)

In [None]:
# Hand-written taxonomy corrections. Order matters: each step reads the column
# written by the step before it (order -> class -> phylum -> kingdom).
name_df = data['nme']
name_df['class'] = name_df['class'].mask(name_df['order'].eq('Physarales'), 'Myxogastria')
name_df['phylum'] = name_df['phylum'].mask(name_df['class'].eq('Myxogastria'), 'Mycetozoa')
name_df['kingdom'] = name_df['kingdom'].mask(name_df['phylum'].eq('Mycetozoa'), 'Amoebozoa')

# Fold Myxomycota into Mycetozoa (technically Myxomycota is not a phylum).
# NOTE(review): this runs after the kingdom correction above, so rows renamed
# here keep their original kingdom - confirm that is intended.
name_df['phylum'] = name_df['phylum'].mask(name_df['phylum'].eq('Myxomycota'), 'Mycetozoa')

# Collapse every phylum outside the main three into "Other"
main_phylums = ['Mycetozoa', 'Basidiomycota', 'Ascomycota']
name_df['phylum'] = name_df['phylum'].where(name_df['phylum'].isin(main_phylums), 'Other')

data['nme'].head(2)

In [None]:
# Keep just the location_id column for rows flagged as Santa Cruz
is_santa_cruz = data['loc']['is_sc'] == True
sc_loc_df = data['loc'].loc[is_santa_cruz, ['location_id']]
sc_loc_df.head(2)

In [None]:
# Attach observations to each Santa Cruz location, then discard locations that
# ended up with no observation (obs_id missing after the left join).
all_df = (
    sc_loc_df
    .merge(data['obs'], how='left', on='location_id')
    .dropna(subset=['obs_id'], ignore_index=True)
)
print(all_df.shape)
all_df.head(2)

In [None]:
# Attach taxonomy to each observation, then discard observations whose name_id
# has no complete classification (any rank missing after the left join).
taxonomy_ranks = ['domain', 'kingdom', 'phylum', 'class', 'order', 'family']
all_df = (
    all_df
    .merge(data['nme'], how='left', on='name_id')
    .dropna(subset=taxonomy_ranks, ignore_index=True)
)
print(all_df.shape)
all_df.head(2)

In [None]:
# Add year and month columns and keep only rows where year >= 1994.
# Both columns end up as strings ('1994', '01', ...) so they can be joined
# against the string-typed calendar grid built later.
all_df['when'] = pd.to_datetime(all_df['when'])
all_df['year'] = all_df['when'].dt.year
# .copy() detaches the filtered rows from the original frame, so the column
# assignments below do not raise SettingWithCopyWarning.
all_df = all_df.query('year >= 1994').copy()
all_df['year'] = all_df['year'].astype('str')
all_df['month'] = all_df['when'].dt.month.astype('str').str.zfill(2)
all_df.head(2)

In [None]:
# Build the calendar grid: one row per (year, month) from 1994-01 to 2023-12,
# both as zero-padded strings, plus an x position that grows by x_mult per month.
x_mult = 1
month_grid = [(str(year), f'{month:02d}') for year in range(1994, 2024) for month in range(1, 13)]
dates_df = pd.DataFrame(month_grid, columns=['year', 'month']).sort_values(by=['year', 'month'], ascending=True)
dates_df['x'] = [position * x_mult for position in range(1, len(dates_df) + 1)]
dates_df.head()

In [None]:
# Distinct (phylum, order) pairs present in the observations
po_df = all_df.loc[:, ['phylum', 'order']].drop_duplicates().reset_index(drop=True)
po_df.head(2)

In [None]:
# Create dataframe of all date/phylum/order combinations to consider.
# how='cross' builds the Cartesian product directly, so no dummy join key has
# to be added to (and left behind on) po_df and dates_df.
combo_df = po_df.merge(dates_df, how='cross')
combo_df.head(2)

In [None]:
# Number of observations per (phylum, order, year, month), stored as total_obs
count_df = (
    all_df
    .groupby(['phylum', 'order', 'year', 'month'], as_index=False)
    .agg(total_obs=('obs_id', 'count'))
)
count_df.head()

In [None]:
# Spot-check the 2023 counts. 'year' holds strings at this point, so it must be
# compared to the string "2023"; comparing to the integer 2023 matches nothing.
count_df.query('year == "2023"')

In [None]:
# Left-join the full what/when grid with the actual counts; combinations that
# had no observations get total_obs = 0.
merge_keys = ['phylum', 'year', 'month', 'order']
count_df = combo_df.merge(count_df, how='left', on=merge_keys)
count_df = count_df.assign(total_obs=count_df['total_obs'].fillna(0))
count_df.head()

In [None]:
# year and month are no longer needed now that each row carries its x position
count_df = count_df.drop(columns=['year', 'month'])
count_df.head(2)

In [None]:
# Preview the main count dataframe
count_df.head(5)

In [None]:
# Store total_obs as int and order rows by phylum, order, then x
count_df = (
    count_df
    .astype({'total_obs': int})
    .sort_values(by=['phylum', 'order', 'x'], ascending=True, ignore_index=True)
)
count_df.head()

In [None]:
# Give every row a y coordinate (a constant 0 for now)
count_df = count_df.assign(y=0)
count_df.head(2)

In [None]:
# Replace total_obs with the bubble size s = total_obs * s_scale;
# a size of 0 becomes NaN.
s_scale = 20
bubble_size = (count_df['total_obs'] * s_scale).replace(0, np.nan)
count_df = count_df.drop(columns=['total_obs']).assign(s=bubble_size)
count_df.head()

In [None]:
# Keep only rows that would actually draw a bubble (s is not NaN)
final_df = count_df.dropna(subset=['s'], ignore_index=True)
final_df.head(2)

In [None]:
# How many distinct orders remain
final_df['order'].nunique(dropna=False)

In [None]:
# Write the plot-ready table next to the notebook, without the index column
final_df.to_csv(path_or_buf='final_df.csv', index=False)

## Scatter

In [None]:
# NOTE: this whole cell is commented out; nothing below runs.
# It draws one bubble plot per order and saves each as an SVG under figs/.
# final_df.sort_values(by=['phylum', 'order'], ascending=True, inplace=True)
# for order in final_df.order.unique():
#     print(order)
#     sub_df = final_df.query(f'order == "{order}"')
#     # Set the dimensions of the plot area in pixels
#     dpi = 100
#     fig, ax = plt.subplots(figsize=(826/dpi, 1101/dpi), dpi=dpi)
    
#     # Scatter plot with bubble sizes
#     ax.scatter(sub_df['x'], sub_df['y'], s=sub_df['s'], alpha=0.5)
    
#     # Draw a red '|' marker at the start of the x range (x = 1)
#     x_position = 1
#     y_position = 0
#     ax.scatter(x_position, y_position, marker='|', color='red', s=100)
    
#     # Draw a red '|' marker at the end of the x range
#     # (x = 360: 30 years x 12 months with x_mult = 1)
#     x_position = 360
#     y_position = 0
#     ax.scatter(x_position, y_position, marker='|', color='red', s=100)
    
    
#     # Save the figure as an SVG file
#     plt.savefig(os.path.join('figs', f'{order.lower()}.svg'), format='svg', dpi=dpi, bbox_inches='tight')