In [None]:
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# Gather every cleaned per-fire tweet CSV; sort the paths so that the row
# order of all downstream tables is the same on every run.
fire_csv_list = glob.glob('../../tweets/megafires/clean_csvs/*.csv')
fire_csv_list.sort()

In [None]:
# One row per fire CSV. The file name is split at its first '-': the text
# before it is stored as `year` (kept as a string), and the remainder of the
# extension-less name is stored as `name`.
base_df = pd.DataFrame(
    {
        'origin_csv': [os.path.basename(path) for path in fire_csv_list],
        'name': [
            os.path.basename(os.path.splitext(path)[0]).partition('-')[2]
            for path in fire_csv_list
        ],
        'year': [
            os.path.basename(path).partition('-')[0]
            for path in fire_csv_list
        ],
    },
    index=range(len(fire_csv_list)),
)

# Helper: bar plot of a chosen column, one bar per fire, sorted from largest to smallest

In [None]:
def basic_plot(df, target_col):
    """Show a bar chart of ``target_col`` with one bar per row, tallest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'name'`` column (used as the x-axis labels) and the
        column named by ``target_col``.
    target_col : str
        Column that supplies the bar heights; also used as the y-axis label.

    Returns
    -------
    None
        The figure is rendered immediately with ``plt.show()``.
    """
    plt.figure(figsize=(60, 20), facecolor='white')
    plt.tick_params(axis='both', which='major')

    # Rank the rows so the largest value becomes the left-most bar.
    ranked = df.sort_values(by=target_col, ascending=False)
    bar_labels, bar_heights = ranked['name'], ranked[target_col]
    plt.bar(bar_labels, bar_heights)

    # Large fonts to match the very wide (60 x 20) canvas.
    plt.xticks(rotation=90, fontsize=45)
    plt.yticks(fontsize=40)
    plt.ylabel(target_col, fontsize=60)

    plt.show()

# Volume Stats

In [None]:
import datetime

In [None]:
def read_df_timestamps(csv_path):
    """Load a tweet CSV and keep only the rows with a parseable timestamp.

    The ``Timestamp`` column is parsed with the exact format
    ``'%Y-%m-%d %H:%M:%S UTC'``. Rows that do not match it (missing values,
    values without the trailing ``UTC``, or otherwise malformed text) are
    dropped instead of raising.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing a ``Timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the valid rows, with ``Timestamp`` converted
        to timezone-naive datetimes. The original row labels are preserved
        (the index is not reset).
    """
    raw = pd.read_csv(csv_path)

    # errors='coerce' turns anything that does not match the format into NaT,
    # which covers both the missing-"UTC" case and malformed dates, and works
    # even when the column was not read as strings (e.g. an all-empty column).
    parsed = pd.to_datetime(
        raw['Timestamp'], format='%Y-%m-%d %H:%M:%S UTC', errors='coerce'
    )
    is_valid = parsed.notna()

    # Copy before assigning so we never write into a slice of `raw`
    # (avoids pandas' SettingWithCopyWarning).
    valid_rows = raw.loc[is_valid].copy()
    valid_rows['Timestamp'] = parsed.loc[is_valid]
    return valid_rows

def read_plot_days(csv_path):
    """Bar-plot the number of tweets per calendar month for one CSV.

    The file is loaded with ``read_df_timestamps`` and the timestamps are
    counted per (year, month) pair. Note that, despite the function name,
    the grouping is monthly, not daily.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file passed through to ``read_df_timestamps``.
    """
    stamps = read_df_timestamps(csv_path)['Timestamp']
    plt.figure(figsize=(20, 10), facecolor='white')
    # One bar per (year, month) that has at least one tweet.
    monthly_counts = stamps.groupby([stamps.dt.year, stamps.dt.month]).count()
    monthly_counts.plot(kind="bar")
    
def volume_stats(df):
    """Summarise tweet volume over time for one fire.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``Timestamp`` column.

    Returns
    -------
    dict
        ``first_tweet`` / ``last_tweet``: earliest and latest timestamp.
        ``vol_length_days``: whole days between those two.
        ``vol_max_hour``: largest tweet count in any single clock hour.
        ``vol_max_day`` / ``vol_avg_daily`` / ``vol_min_daily``: max, mean and
        min of the per-day counts. Only days (and hours) that contain at
        least one tweet form a group, so empty days are not counted as zero.
        ``vol_total``: number of rows.
    """
    stamps = df['Timestamp']
    earliest, latest = stamps.min(), stamps.max()

    # Group keys: calendar day, and calendar day plus hour of day.
    calendar_day = [stamps.dt.year, stamps.dt.month, stamps.dt.day]
    per_day = stamps.groupby(calendar_day).count()
    per_hour = stamps.groupby(calendar_day + [stamps.dt.hour]).count()

    return {
        'first_tweet': earliest,
        'last_tweet': latest,
        'vol_length_days': (latest - earliest).days,
        'vol_max_hour': per_hour.max(),
        'vol_max_day': per_day.max(),
        'vol_avg_daily': per_day.mean(),
        'vol_min_daily': per_day.min(),
        'vol_total': len(stamps),
    }

In [None]:

# Compute the volume statistics for every fire CSV and collect them as one
# record per file; `origin_csv` is the key used to join onto `base_df`.
out_dict_list = []
for fcsv in fire_csv_list:
    df = read_df_timestamps(fcsv)
    out_dict = volume_stats(df)
    out_dict.update(origin_csv=os.path.basename(fcsv))
    out_dict_list += [out_dict]
# A list of dict records goes straight into the DataFrame constructor.
vol_df = pd.DataFrame(out_dict_list)

In [None]:
# Attach the volume statistics to the per-fire file info (inner join on file name).
atts_df = pd.merge(base_df, vol_df, on='origin_csv')

In [None]:
# Busiest single hour of tweeting, compared across fires.
basic_plot(df=atts_df, target_col='vol_max_hour')

# Tweet Deltas and Entropy

In [None]:
import scipy.stats
import numpy as np

In [None]:
def tweet_entropy(deltas, bin_width=120):
    """Shannon entropy of the distribution of inter-tweet gaps.

    The gaps are histogrammed into consecutive bins of ``bin_width`` starting
    at 0, and the entropy (natural log) of the bin counts is returned.

    Parameters
    ----------
    deltas : array-like of float
        Gaps between consecutive tweets, in the same unit as ``bin_width``.
        Negative values fall outside every bin and are ignored.
    bin_width : float, optional
        Width of each histogram bin. Defaults to 120, the previously
        hard-coded value.

    Returns
    -------
    float
        Entropy of the binned gaps, or NaN when ``deltas`` is empty.
    """
    deltas = np.asarray(deltas, dtype=float)
    if deltas.size == 0:
        # No gaps means no distribution to measure.
        return np.nan

    # Always build at least one bin: when every gap is 0 the old
    # np.arange(0, max + width, width) produced a single edge and no bins.
    n_bins = max(int(np.ceil(np.max(deltas) / bin_width)), 1)
    bin_edges = np.arange(n_bins + 1) * bin_width

    counts, _ = np.histogram(deltas, bins=bin_edges)
    return scipy.stats.entropy(counts)

def delta_stats(df):
    """Statistics of the time gaps between consecutive tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``Timestamp`` column. Rows may be in any
        order; timestamps are sorted before the gaps are computed.

    Returns
    -------
    dict
        ``delta_entropy`` (from ``tweet_entropy``) plus the mean, max, min
        and standard deviation of the gaps in seconds (``delta_mean_sec``,
        ``delta_max_sec``, ``delta_min_sec``, ``delta_std_sec``). Every value
        is NaN when there are fewer than two tweets, since no gap exists.
    """
    # Sort first: gaps are only meaningful between chronologically adjacent
    # tweets, and unsorted (e.g. newest-first) input would yield negative gaps.
    tweet_times = np.sort(df['Timestamp'].values)
    # Times between tweets, as float seconds
    tweet_deltas_s = np.diff(tweet_times) / np.timedelta64(1, 's')

    if tweet_deltas_s.size == 0:
        # np.max / np.min raise on an empty array; report "undefined" instead.
        return {
            'delta_entropy': np.nan,
            'delta_mean_sec': np.nan,
            'delta_max_sec': np.nan,
            'delta_min_sec': np.nan,
            'delta_std_sec': np.nan,
        }

    return {
        'delta_entropy': tweet_entropy(tweet_deltas_s),
        'delta_mean_sec': np.mean(tweet_deltas_s),
        'delta_max_sec': np.max(tweet_deltas_s),
        'delta_min_sec': np.min(tweet_deltas_s),
        'delta_std_sec': np.std(tweet_deltas_s),
    }

In [None]:
# Compute the inter-tweet gap statistics for every fire CSV and collect them
# as one record per file; `origin_csv` is the key used to join onto `atts_df`.
out_dict_list = []
for fcsv in fire_csv_list:
    df = read_df_timestamps(fcsv)
    out_dict = delta_stats(df)
    out_dict.update(origin_csv=os.path.basename(fcsv))
    out_dict_list += [out_dict]
# A list of dict records goes straight into the DataFrame constructor.
delta_df = pd.DataFrame(out_dict_list)


In [None]:
# Rebuild the combined attribute table from its three sources instead of
# merging into the existing `atts_df`. Merging in place makes this cell
# non-idempotent: running it a second time would merge the delta columns
# again and produce suffixed duplicates (`delta_entropy_x`/`delta_entropy_y`),
# which breaks the `delta_entropy` plot.
atts_df = (
    base_df
    .merge(vol_df, on='origin_csv')
    .merge(delta_df, on='origin_csv')
)

In [None]:
# Display the column labels of the merged table (file info + volume stats + delta stats).
atts_df.columns

In [None]:
# Entropy of the inter-tweet gap distribution, compared across fires.
basic_plot(df=atts_df, target_col='delta_entropy')