# Spatial tweet exploration

In [None]:
import geopandas as gpd
import glob
import pandas as pd
import os
from ast import literal_eval
from shapely.geometry import Point, MultiPoint, Polygon
import matplotlib.pyplot as plt
import numpy as np
import pyproj
import hdmedians as hd
import datetime
import ipywidgets as widgets
import pandas as pd

In [None]:
# Gather the cleaned per-fire CSVs in sorted order, then drop the last entry
# (Taylor Creek is ignored for now).
fire_csv_list = sorted(glob.glob('../../tweets/megafires/clean_csvs/*.csv'))
fire_csv_list = fire_csv_list[:-1]

In [None]:
# One row per fire CSV. File names look like '<year>-<fire-name>.csv': the part
# before the first '-' is kept as the year, the rest (minus extension) as the name.
base_df = pd.DataFrame(None, index=range(len(fire_csv_list)), columns=[])
base_df['origin_csv'] = list(map(os.path.basename, fire_csv_list))
base_df['name'] = [
    os.path.splitext(csv_name)[0].partition('-')[2]
    for csv_name in base_df['origin_csv']
]
base_df['year'] = [csv_name.partition('-')[0] for csv_name in base_df['origin_csv']]

# Read and Plot

In [None]:
# Background layer drawn underneath the tweet points in the maps below; loaded from
# the 'naturalearth_lowres' dataset that ships with GeoPandas.
# NOTE(review): `gpd.datasets` appears to have been removed in GeoPandas 1.0 — confirm
# the installed version still provides it.
basemap = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

def find_coord_order(df):
    """Normalise the 'Coordinates' column so the value below -70 comes first.

    Each entry of ``df['Coordinates']`` is a two-element sequence whose order
    is not consistent between files. Whichever position has a minimum below
    -70 is moved to (or kept in) the first slot.
    NOTE(review): presumably that value is the longitude of a North American
    location, so the result is [lon, lat] — confirm against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a non-empty 'Coordinates' column of two-element sequences.
        The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Coordinates' stored as two-element lists.

    Raises
    ------
    ValueError
        If neither position has a value below -70, so the order cannot be
        determined.
    """
    pairs = list(df['Coordinates'])
    first = np.array([pair[0] for pair in pairs])
    second = np.array([pair[1] for pair in pairs])

    if min(first) < -70:
        df['Coordinates'] = [[pair[0], pair[1]] for pair in pairs]
    elif min(second) < -70:
        df['Coordinates'] = [[pair[1], pair[0]] for pair in pairs]
    else:
        # The original built this exception without raising it, so ambiguous
        # input passed through unnoticed.
        raise ValueError('Coordinates not obviously lat/lon or lon/lat')

    return df
        
def gpd_read_fire_csv(csv_path):
    """Read one fire CSV and return its geotagged tweets as a GeoDataFrame.

    Rows whose 'Coordinates' value is missing or the string '[]' are dropped.
    The remaining coordinate strings are parsed with ``literal_eval``,
    normalised by ``find_coord_order`` and turned into point geometries.
    Rows whose 'Timestamp' does not end in "UTC" are dropped, the rest are
    parsed to datetimes and the result is sorted by time, oldest first.

    Parameters
    ----------
    csv_path : str
        Path to a CSV with at least 'Coordinates' and 'Timestamp' columns.

    Returns
    -------
    geopandas.GeoDataFrame or pandas.DataFrame
        The geotagged tweets, or an empty plain DataFrame when no row has
        coordinates. Callers check ``shape[0]`` to tell the two apart.
    """
    # Important: Coordinates are not always in same order. Need to figure this out
    # NOTE(review): the {'init': ...} form is reported as deprecated by recent
    # pyproj releases; kept so older GeoPandas versions keep working.
    wgs84_crs = {'init' :'epsg:4326'}
    df = pd.read_csv(csv_path)
    # Treat missing coordinates like an explicit empty list, then drop both.
    df.loc[df['Coordinates'].apply(str) == 'nan', 'Coordinates'] = '[]'
    # Copy so the assignments below act on an independent frame, not a slice.
    df = df.loc[df['Coordinates'] != '[]'].copy()
    if df.shape[0] == 0:
        return df

    # Some rows separate the two numbers with whitespace only; insert the
    # comma literal_eval needs. regex=True is required because pandas >= 2.0
    # treats the pattern as a literal string by default.
    no_comma = ~df['Coordinates'].str.contains(',')
    df.loc[no_comma, 'Coordinates'] = df.loc[no_comma, 'Coordinates'].str.replace(
        r'([0-9])\s+([-0-9])', r'\1, \2', regex=True)

    df['Coordinates'] = df['Coordinates'].apply(literal_eval)
    df = find_coord_order(df)
    df['geometry'] = df.apply(lambda z: Point(z.Coordinates[0], z.Coordinates[1]), axis=1)
    geo_df = gpd.GeoDataFrame(df, crs=wgs84_crs)

    # Time stamps: keep only the rows that carry the expected "UTC" suffix.
    valid_ts = geo_df.Timestamp.str[-3:] == "UTC"
    geo_df = geo_df.loc[valid_ts].copy()
    geo_df['Timestamp'] = pd.to_datetime(geo_df.Timestamp, format='%Y-%m-%d %H:%M:%S UTC')
    geo_df.sort_values(by='Timestamp', ascending=True, inplace=True)
    return geo_df

def plot_fire_tweets(csv_path):
    """Draw the geotagged tweets of one fire CSV as red dots on the basemap.

    Prints 'No spatial tweets' instead when the file has no usable rows.
    """
    tweets = gpd_read_fire_csv(csv_path)
    if tweets.shape[0] == 0:
        print('No spatial tweets')
        return

    map_ax = basemap.plot(figsize=(30,15))
    tweets.plot(ax=map_ax, marker='o', color='red', markersize=30)
    plt.show()

In [None]:
# Map every fire in turn; the file name is printed first so each figure can be
# matched to its source CSV.
for fcsv in fire_csv_list:
    print(os.path.basename(fcsv))
    plot_fire_tweets(fcsv)

# Explore center options
It's important to come up with a metric for the event's center and to capture its spatial dispersion, so let's see what we can do.

In [None]:
def get_init_center(df, min_tweets=10):
    """Estimate an event's starting center from its first days of tweets.

    Calendar dates are taken in the order they first appear in ``df`` and
    accumulated until the tweets on those dates number at least
    ``min_tweets``. With fewer than ``min_tweets`` rows in total, all rows
    are used. The center is ``hd.geomedian`` of the selected x/y coordinates.

    Parameters
    ----------
    df : GeoDataFrame
        Tweets with a datetime 'Timestamp' column and point geometries.
    min_tweets : int
        Minimum number of tweets the initial subset should contain.

    Returns
    -------
    numpy.ndarray
        The geometric median; callers in this notebook index it as [x, y].
    """
    tweet_dates = df.Timestamp.dt.date
    distinct_dates = tweet_dates.unique()

    if df.shape[0] < min_tweets:
        early_df = df
    else:
        # Grow the set of leading dates one day at a time until enough
        # tweets are covered.
        chosen_dates = [distinct_dates[0]]
        next_pos = 1
        while tweet_dates.isin(chosen_dates).sum() < min_tweets:
            chosen_dates.append(distinct_dates[next_pos])
            next_pos += 1
        early_df = df.loc[tweet_dates.isin(chosen_dates)]

    xy = np.array([early_df.geometry.x, early_df.geometry.y])
    return np.array(hd.geomedian(xy))

def plot_center(df, center_df):
    """Plot tweets (small red dots) and a center (large green dot) on the basemap."""
    map_ax = basemap.plot(figsize=(30,15))
    layers = ((df, 'red', 30), (center_df, 'green', 100))
    for layer, colour, size in layers:
        layer.plot(ax=map_ax, marker='o', color=colour, markersize=size)
    plt.show()
    
def plot_init_center(csv_path, min_tweets=10):
    """Map one fire's tweets together with the center of its earliest tweets.

    The center comes from ``get_init_center`` with the given ``min_tweets``.
    Prints 'No spatial tweets' and returns when the file has no usable rows.
    """
    tweets = gpd_read_fire_csv(csv_path)
    if tweets.shape[0] == 0:
        print('No spatial tweets')
        return

    init_xy = get_init_center(tweets, min_tweets=min_tweets)
    marker_frame = pd.DataFrame({'center':[Point(init_xy[0], init_xy[1])]})
    marker_gdf = gpd.GeoDataFrame(marker_frame, geometry='center')
    plot_center(tweets, marker_gdf)

def plot_final_center(csv_path):
    """Map one fire's tweets together with the geometric median of all of them.

    Prints 'No spatial tweets' and returns when the file has no usable rows.
    """
    tweets = gpd_read_fire_csv(csv_path)
    if tweets.shape[0] == 0:
        print('No spatial tweets')
        return

    all_xy = np.array([tweets.geometry.x, tweets.geometry.y])
    final_xy = np.array(hd.geomedian(all_xy))
    marker_frame = pd.DataFrame({'center':[Point(final_xy[0], final_xy[1])]})
    marker_gdf = gpd.GeoDataFrame(marker_frame, geometry='center')
    plot_center(tweets, marker_gdf)

In [None]:
# Plot the initial and the final center for the second-to-last CSV in
# fire_csv_list (the McMurray fire, per the original note).
plot_init_center(fire_csv_list[-2])
plot_final_center(fire_csv_list[-2])

In [None]:
# Load the 2012 Waldo Canyon fire tweets into the notebook-level `geo_df` and
# show how many rows/columns were read.
geo_df = gpd_read_fire_csv( '../../tweets/megafires/clean_csvs/2012-waldo-canyon-fire-co.csv')
geo_df.shape

In [None]:
# Reproject coords based on event center
def reproject_aeqd(geo_df, center):
    """Reproject ``geo_df`` to an azimuthal equidistant CRS centered on ``center``.

    Parameters
    ----------
    geo_df : GeoDataFrame
        Frame to reproject (anything exposing ``to_crs``).
    center : sequence
        Projection origin; element 0 is used as 'lon_0', element 1 as 'lat_0'.

    Returns
    -------
    The result of ``geo_df.to_crs`` for the constructed CRS.
    """
    center_lon, center_lat = center[0], center[1]
    aeqd_crs = dict(
        proj='aeqd',
        ellps='WGS84',
        datum='WGS84',
        lat_0=center_lat,
        lon_0=center_lon,
    )
    return geo_df.to_crs(aeqd_crs)

def calc_date_distances(df):
    """Summarise, per calendar day, how far tweets lie from the initial center.

    The frame is reprojected to an azimuthal equidistant CRS centered on
    ``get_init_center(df)``, so the distance to the projected origin (0, 0)
    is the distance from that center.

    Parameters
    ----------
    df : GeoDataFrame
        Tweets with a datetime 'Timestamp' column and point geometries.

    Returns
    -------
    pandas.DataFrame
        Indexed by date. Holds the per-day 0.85 quantile of the grouped
        columns plus 'mean_dist_until_now', which despite its name is the
        0.8 quantile of 'distance_from_center' over all tweets up to and
        including that day.
    """
    center = get_init_center(df)
    projected = reproject_aeqd(df, center)
    # A later cell redefines reproject_aeqd to return (frame, crs); accept both
    # shapes so this function keeps working after that cell has run.
    if isinstance(projected, tuple):
        projected = projected[0]

    projected['distance_from_center'] = projected.geometry.distance(Point([0,0]))
    dist_df = projected.loc[:, ['distance_from_center', 'Timestamp']]

    # Convert timestamps to dates once instead of once per day.
    tweet_dates = dist_df.Timestamp.dt.date
    date_grps = dist_df.groupby(tweet_dates).quantile(.85)

    # Iterate the (sorted) group index so each cumulative value lands on its own
    # row even when the input rows are not in chronological order.
    distances = dist_df['distance_from_center']
    date_grps['mean_dist_until_now'] = [
        distances.loc[tweet_dates <= day].quantile(.8) for day in date_grps.index
    ]
    return date_grps

def plot_mean_dist(df):
    """Line-plot 'mean_dist_until_now' for at most the first 30 days of an event."""
    daily = calc_date_distances(df)
    n_days = min(30, daily.shape[0])
    day_values = daily.index.values[:n_days]
    dist_values = daily.mean_dist_until_now.values[:n_days]

    plt.figure()
    plt.plot(day_values, dist_values)
    plt.xticks(rotation=90)
    plt.show()
    

In [None]:
# Distance-from-center curve for the fire currently loaded in `geo_df`.
plot_mean_dist(geo_df)

# Visualize spread of event

In [None]:
# Reload the 2012 Waldo Canyon fire tweets into `geo_df` for the interactive map
# below and show the frame's dimensions.
geo_df = gpd_read_fire_csv( '../../tweets/megafires/clean_csvs/2012-waldo-canyon-fire-co.csv')
geo_df.shape

In [None]:
%matplotlib notebook
import ipywidgets as widgets

def date_widget(df):
    """Build a day-by-day selection slider covering the date span of ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame with a datetime 'Timestamp' column.

    Returns
    -------
    ipywidgets.SelectionSlider
        One option per day from the earliest to the latest tweet date; each
        option is labelled like ' 26 Jun 2012 ' and carries the day's
        timestamp as its value.
    """
    tweet_days = df.Timestamp.dt.date
    day_range = pd.date_range(tweet_days.min(), tweet_days.max(), freq='D')

    labelled_days = []
    for day in day_range:
        labelled_days.append((day.strftime(' %d %b %Y '), day))

    return widgets.SelectionSlider(
        options=labelled_days,
        description='Dates',
        orientation='horizontal',
        layout={'width': '600px'},
    )


def update_map(df, date, dots):
    """Redraw the point layer with every tweet posted on or before ``date``.

    Parameters
    ----------
    df : GeoDataFrame
        Tweets with a datetime 'Timestamp' column and point geometries.
    date : timestamp
        Cut-off day; tweets whose timestamp floored to the day is later than
        this are hidden.
    dots : matplotlib line artist
        The artist whose x/y data is replaced.
    """
    # Build the mask from the frame that was passed in; the original read the
    # notebook-level `geo_df`, which only worked while both were the same object.
    visible = df.loc[df.Timestamp.dt.floor('D') <= date]
    dots.set_xdata(visible.geometry.x)
    dots.set_ydata(visible.geometry.y)
    plt.show()
    return


# Draw the basemap once, then overlay all tweets as faint red dots; `dots` is the
# artist that update_map rewrites when the slider moves.
fig, ax = plt.subplots(figsize=(10,5))
basemap.plot(ax=ax)
sel_slider = date_widget(geo_df)
dots, = ax.plot(geo_df.geometry.x, geo_df.geometry.y, 'ro',markersize=3,alpha=0.1)

# Only `date` is driven by a widget; the frame and the artist are passed through
# unchanged via widgets.fixed. The trailing ';' suppresses the cell's output value.
widgets.interact(
    update_map,
    date=sel_slider,
    df=widgets.fixed(geo_df),
    dots=widgets.fixed(dots)
);

# Distance-from-center curve for the same fire, shown below the map.
plot_mean_dist(geo_df)

# Derive Metrics

In [None]:
def reproject_aeqd(geo_df, center):
    """Reproject ``geo_df`` to an azimuthal equidistant CRS and also return that CRS.

    This definition replaces the earlier ``reproject_aeqd`` in the notebook
    namespace; unlike that one it returns a 2-tuple.

    Parameters
    ----------
    geo_df : GeoDataFrame
        Frame to reproject (anything exposing ``to_crs``).
    center : sequence
        Projection origin; element 0 is used as 'lon_0', element 1 as 'lat_0'.

    Returns
    -------
    tuple
        ``(reprojected_frame, crs_dict)``.
    """
    center_lon, center_lat = center[0], center[1]
    aeqd_crs = dict(
        proj='aeqd',
        ellps='WGS84',
        datum='WGS84',
        lat_0=center_lat,
        lon_0=center_lon,
    )
    reprojected = geo_df.to_crs(aeqd_crs)
    return reprojected, aeqd_crs

def calc_spatial_metrics(geo_df, n_center_tweets=50):
    """Compute summary statistics of how an event's tweets spread in space.

    The mean and median coordinates are taken in the input CRS. The frame is
    then reprojected to an azimuthal equidistant CRS centered on the median
    position of the ``n_center_tweets`` earliest tweets, and the convex-hull
    area and distance statistics are measured from that projected origin.
    As a side effect the reprojected tweets are plotted over the basemap.

    Relies on the ``reproject_aeqd`` variant that returns ``(frame, crs)``.

    Parameters
    ----------
    geo_df : GeoDataFrame
        Tweets with a 'Timestamp' column and point geometries.
    n_center_tweets : int, optional
        Number of earliest tweets used to locate the projection center
        (default 50, the value previously hard-coded).

    Returns
    -------
    dict
        Keys: 'volume_spatially_enabled', 'spatial_mean', 'spatial_median',
        'spatial_convex_area', 'spatial_min_dist', 'spatial_mean_dist',
        'spatial_90p_dist', 'spatial_max_dist', 'spatial_median_dist',
        'spatial_std_dist'.
        NOTE(review): areas and distances are in the projected CRS's units,
        presumably metres — confirm.
    """
    out_dict = {}
    out_dict['volume_spatially_enabled'] = geo_df.shape[0]

    # Distribution around center
    xs, ys = geo_df.geometry.x, geo_df.geometry.y
    out_dict['spatial_mean'] = [xs.mean(), ys.mean()]
    out_dict['spatial_median'] = [xs.median(), ys.median()]

    # Sort once and reuse the earliest tweets for both coordinates of the center.
    earliest = geo_df.sort_values(by='Timestamp')[0:n_center_tweets].geometry
    geo_df, aeqd = reproject_aeqd(geo_df, [earliest.x.median(), earliest.y.median()])

    base = basemap.to_crs(aeqd).plot(figsize=[30,10])
    geo_df.geometry.plot(ax=base, marker='o', color='red', markersize=30)

    multipoint_geo = MultiPoint(geo_df.geometry.values)
    out_dict['spatial_convex_area'] = multipoint_geo.convex_hull.area

    # After reprojection the chosen center sits at the origin.
    distances = geo_df.geometry.distance(Point([0,0]))
    out_dict['spatial_min_dist'] = distances.min()
    out_dict['spatial_mean_dist'] = distances.mean()
    out_dict['spatial_90p_dist'] = distances.quantile(.9)
    out_dict['spatial_max_dist'] = distances.max()
    out_dict['spatial_median_dist'] = distances.median()
    out_dict['spatial_std_dist'] = distances.std()

    # Irregularity, entropy, etc.

    return out_dict
    
    


In [None]:
# Load the second-to-last fire in fire_csv_list into `geo_df`.
geo_df = gpd_read_fire_csv(fire_csv_list[-2])

In [None]:
# Median x and y of the 50 earliest tweets — the same center that
# calc_spatial_metrics passes to reproject_aeqd.
geo_df.sort_values(by='Timestamp')[0:50].geometry.x.median(), geo_df.sort_values(by='Timestamp')[0:50].geometry.y.median()

In [None]:
# Preview the tweets and basemap in an azimuthal equidistant projection.
# NOTE(review): the original cell left 'lat_0' without a value, which is a syntax
# error. 0 is a placeholder matching 'lon_0' — replace both with the intended center.
aeqd = {'proj':'aeqd', 'ellps':'WGS84', 'datum':'WGS84', 'lat_0':0, 'lon_0':0}
base = basemap.to_crs(aeqd).plot(figsize=[30,10])
geo_df.to_crs(aeqd).geometry.plot(ax=base,marker='o', color='red', markersize=30)

In [None]:
# NOTE(review): the original cell held only an unfinished attribute access
# (`geo_df.`), which does not parse. Showing the first rows is a stand-in —
# replace it with whatever inspection was intended.
geo_df.head()

# Movie Plot!