In [None]:
import hubmapbags
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import seaborn as sns
import numpy as np

In [None]:
# API token passed to hubmapbags. It is left empty here so that no credential is
# committed with the notebook; fill it in locally before running.
token = ''

# The original cell had this call fused onto the `token = ''` line, which is a
# SyntaxError; it must be a statement of its own.
# NOTE(review): presumably clears hubmapbags' leftover/temporary state - confirm
# against the hubmapbags documentation.
hubmapbags.utilities.clean()

# NOTE(review): presumably writes today's report under `daily-report/`, which the
# next cell reads - confirm against the hubmapbags documentation.
hubmapbags.reports.daily(token=token)

In [None]:
# Load today's daily report (a tab-separated file named after the current date)
# into the DataFrame `df` used by the rest of the notebook.
report_output_directory = "daily-report"

# exist_ok=True replaces the separate exists() check: it is a no-op when the
# directory is already there and avoids the check-then-create race.
Path(report_output_directory).mkdir(exist_ok=True)

# `now` is reused further down as the title of the data-type histogram.
now = datetime.now()
# strftime already returns a str, so no extra str() wrapper is needed.
report_output_filename = f'{report_output_directory}/{now.strftime("%Y%m%d")}.tsv'
print(report_output_filename)

df = pd.read_csv(report_output_filename, sep="\t")

In [None]:
# Column labels available in the daily report.
df.columns

# Plots by Group Name

In [None]:
# Distinct group names present in the report.
pd.unique(df['group_name'])

In [None]:
# Count the non-null `data_type` entries for every (group_name, status) pair
# and print the result as a markdown table.
group = (
    df.groupby(by=['group_name', 'status'])[['data_type']]
    .count()
)
print(group.to_markdown())

In [None]:
# Hand the daily-report DataFrame to hubmapbags' `by_group` plotting helper.
# NOTE(review): presumably draws one summary per `group_name` - confirm in the
# hubmapbags documentation.
hubmapbags.plots.by_group(df)

# Plots by Data Type

In [None]:
# Distinct data types present in the report.
pd.unique(df['data_type'])

In [None]:
# Non-null counts of every other column for each (data_type, status) pair.
data_type = (
    df.groupby(by=['data_type', 'status'])
    .count()
)
data_type

In [None]:
# Distinct dataset statuses present in the report.
pd.unique(df['status'])

In [None]:
# Hand the daily-report DataFrame to hubmapbags' `by_data_type` plotting helper.
# NOTE(review): presumably draws one summary per `data_type` - confirm in the
# hubmapbags documentation.
hubmapbags.plots.by_data_type(df)

In [None]:
# Stacked histogram of the primary datasets: one bar per data type, split by status.
#
# These rcParams are global and stay in effect for every later figure in the
# notebook. displot sizes its own figure from `height`/`aspect`, so only the
# dpi setting influences this particular plot.
plt.rcParams['figure.figsize'] = [50.0, 50.0]
plt.rcParams['figure.dpi'] = 500

# Evaluate the "Primary" filter once instead of rebuilding the mask for every use.
primary = df[df['dataset_type'] == 'Primary']

g = sns.displot(primary, height=10, x="data_type", hue="status", multiple='stack', aspect=2)

# Only restyle the ticks the plot already has (one per data type). The original
# passed one tick location and label per *row*, stacking many duplicate ticks
# on top of each other.
plt.xticks(rotation=45, fontsize=10, ha='right')

g.set(xlabel='Data Type',
      ylabel='Count',
      title=now.strftime("%Y%m%d"))

# Run the layout pass after labels and title exist so they are accounted for.
plt.tight_layout()

sns.move_legend(g, "center right", ncol=1, title='Dataset status', frameon=False)

plt.show()

In [None]:
# Independent working copy of the report for the publication-date analysis.
df_shormin = df.copy(deep=True)

In [None]:
# Peek at the publication timestamps.
df_shormin.loc[:, 'published_datetime']

In [None]:
# Keep only the rows that have a publication timestamp.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning on a slice.
df_shormin = df_shormin[df_shormin['published_datetime'].notna()].copy()

In [None]:
# Peek at the publication timestamps.
df_shormin.loc[:, 'published_datetime']

In [None]:
# Parse the publication timestamps of the working frame itself. The original
# parsed `df['published_datetime']` and relied on index alignment to map the
# values back onto `df_shormin`.
published_at = pd.to_datetime(df_shormin['published_datetime'])

# assign() returns a new frame, so this cell never writes into a slice of `df`
# and can be re-run safely.
df_shormin = df_shormin.assign(
    published_datetime=published_at,
    # Calendar components used by the per-year / per-month / per-day plots.
    year=published_at.dt.year,
    month=published_at.dt.month,
    day=published_at.dt.day,
)

In [None]:
# Display the working frame to inspect its current columns.
df_shormin

In [None]:
# Years as strings so the bar chart treats them as discrete categories.
df_shormin['year_str'] = df_shormin['year'].astype(str)

# Rows per year, in chronological (label) order.
year_counts = df_shormin['year_str'].value_counts().sort_index()

# One inferno colour per bar, spread evenly over the colormap.
colors = plt.cm.inferno(np.linspace(0, 1, len(year_counts)))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(year_counts.index, year_counts.values, color=colors)
plt.xticks(rotation=45)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.set_title('Number of Data Published Per Year')
plt.show()

In [None]:
# Bar chart of the number of published rows per calendar month (all years pooled).

# Replace the `month` column with the month name of each timestamp.
df_shormin['month'] = df_shormin['published_datetime'].dt.month_name()

# Ordered categorical dtype so months sort January..December instead of
# alphabetically. `pd.CategoricalDtype` is the same class as
# `pandas.api.types.CategoricalDtype`, so no extra mid-notebook import is needed.
months_in_order = pd.CategoricalDtype(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    ordered=True,
)
df_shormin['month'] = df_shormin['month'].astype(months_in_order)

# Rows per month, in calendar order.
month_counts = df_shormin['month'].value_counts().sort_index()

# One inferno colour per bar, spread evenly over the colormap.
colors = plt.cm.inferno(np.linspace(0, 1, len(month_counts)))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(month_counts.index, month_counts.values, color=colors)
plt.xticks(rotation=45)
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.set_title('Number of Data Published Per Month')
plt.show()

In [None]:

# Replace `day` with the calendar date (time of day dropped) of each timestamp.
df_shormin['day'] = df_shormin['published_datetime'].dt.date

# Rows per date, in chronological order.
day_counts = df_shormin['day'].value_counts().sort_index()

# Blue connecting line with a red marker on every date that has data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(day_counts.index, day_counts.values, color='b')
ax.scatter(day_counts.index, day_counts.values, color='r')
plt.xticks(rotation=45)
ax.set_xlabel('Day')
ax.set_ylabel('Count')
ax.set_title('Number of Data Published Per Day')
plt.show()


In [None]:
# plot status per day count

In [None]:
# Independent working copy of the report for the status-per-day analysis.
df_shormin_status = df.copy(deep=True)

In [None]:

# Parse the publication timestamps once, then store both the parsed column and
# the day-of-month derived from it.
published_ts = pd.to_datetime(df_shormin_status['published_datetime'])
df_shormin_status['published_datetime'] = published_ts
df_shormin_status['day'] = published_ts.dt.day

In [None]:
# Peek at the status column.
df_shormin_status.loc[:, 'status']

In [None]:
# Non-null counts of every other column for each status value.
status = df_shormin_status.groupby(by='status').count()

In [None]:
# Display the current value of `status`.
status

In [None]:
# Distinct status values in the working frame.
pd.unique(df_shormin_status['status'])

In [None]:
# Number of rows per status for each publication day, one line per status.

# Truncate every timestamp to midnight so rows published on the same calendar
# day fall into the same group. The original stored the full timestamp in
# `day`, which grouped by exact publication time rather than by day.
df_shormin_status['day'] = pd.to_datetime(df_shormin_status['published_datetime']).dt.normalize()

# Rows = day, columns = status, values = number of rows. Combinations that never
# occur are NaN, which leaves a gap in the corresponding line.
status_counts = df_shormin_status.groupby(['day', 'status']).size().unstack()

fig, ax = plt.subplots(figsize=(12, 8))
# The loop variable is deliberately not called `status`, which would overwrite
# the `status` DataFrame built in an earlier cell.
for status_name in status_counts.columns:
    # A single artist (line + markers) per status keeps the legend to one entry
    # each; the original's scatter + plot pair listed every status twice.
    ax.plot(status_counts.index, status_counts[status_name], marker='o', label=status_name)

ax.set_xlabel('Day')
ax.set_ylabel('Count')
ax.set_title('Status Counts Per Day')
ax.legend()
plt.show()