# Exploratory Data Visualization
**A first look at the data you have, and testing hunches**

NOTE: the goal is to show the benefit of fast and easy crossfiltering of a large(ish) dataset

## Overview and Requirements
Super short version of the intro notebook, and a restatement of the requirements

## Imports


In [None]:
import cuxfilter
import cudf
import cugraph
from pyproj import Proj, Transformer
from pathlib import Path

# Directory and file name of the input CSV; they are joined as
# DATA_DIR / FILENAME when the data is loaded.
DATA_DIR = Path("../data")
FILENAME = Path("data.csv")

## Load Cleaned Data / Check

In [None]:
# Read the CSV at DATA_DIR / FILENAME into a cudf DataFrame.
data = cudf.read_csv(DATA_DIR / FILENAME)

## Intro to cuxfilter

Cuxfilter is part of the NVIDIA RAPIDS suite of open source software libraries and APIs, which gives you the ability to execute end-to-end data science and analytics pipelines entirely on GPUs.

It acts as a connector library, which provides the connections between different visualization libraries and a GPU dataframe without much hassle. This also allows the user to use charts from different libraries in a single dashboard, while also providing the interaction.

## Charts 

### [API Documentation](https://docs.rapids.ai/api/cuxfilter/stable/charts/charts.html)


#### [Example Dashboard 1](https://github.com/rapidsai/cuxfilter#example-1)

<div>
<img src="https://raw.githubusercontent.com/rapidsai/cuxfilter/branch-0.16/docs/_images/demo.gif" width="800" height="500" style="margin-left:25%"/>
</div>

#### [Example Dashboard 2](https://github.com/rapidsai/cuxfilter#example-2)

<div>
<img src="https://raw.githubusercontent.com/rapidsai/cuxfilter/branch-0.16/docs/_images/demo2.gif" width="800" height="500" style="margin-left:25%"/>
</div>

## Data Preprocessing

#### Extract Edges

In [None]:
# Build a graph with one edge per (start station, end station) pair found in the trips.
G = cugraph.Graph()
G.from_cudf_edgelist(
    data,
    source='from_station_id',
    destination='to_station_id',
)

# Edge list of that graph; the graph dashboards further down take it as their edge table.
edges = G.edges()

In [None]:
# Preview the first rows of the extracted edge list.
edges.head()

#### Transform the coordinates to overlay it on top of a map in cuxfilter

In [None]:
def transform_coords(df, x='x', y='y'):
    """Project two coordinate columns from EPSG:4326 to EPSG:3857.

    Parameters
    ----------
    df : DataFrame whose columns expose ``to_array()`` (a cudf DataFrame in
        this notebook).
    x, y : str
        Names of the input columns. Their values are handed to the pyproj
        transformer in this order (first ``x``, then ``y``).

    Returns
    -------
    The same ``df`` object that was passed in. It is modified in place:
    columns ``'x'`` and ``'y'`` are created (or overwritten) with the
    projected values.
    """
    # NOTE(review): newer cudf releases may no longer provide Series.to_array();
    # verify against the installed version.
    projector = Transformer.from_crs('epsg:4326', 'epsg:3857')
    first_axis = df[x].to_array()
    second_axis = df[y].to_array()
    projected_x, projected_y = projector.transform(first_axis, second_axis)
    df['x'] = projected_x
    df['y'] = projected_y
    return df

# transform_coords returns the frame it was given, so `trips` and `data` refer to the
# same DataFrame from here on. Note the argument order: latitude is passed as x and
# longitude as y.
trips = transform_coords(data, x='latitude_start', y='longitude_start')

#### Extract day_type from the dataset (weekend / weekday)

In [None]:
# Start with every trip flagged 0 ...
trips['day_type'] = 0
# ... then flag the rows with day > 4 (day 5 or 6, the weekend) as 1.
weekend_rows = trips.query('day>4').index
trips.loc[weekend_rows, 'day_type'] = 1

#### Get global week number in the dataset

In [None]:
# First year present in the data (2014 for this dataset).
year0 = int(trips.year.min())

# trips.groupby('year').week.max().to_pandas().to_dict() is
# {2014: 52, 2015: 53, 2016: 53, 2017: 52}
#
# Counting a flat 52 weeks per elapsed year falls one week short for every earlier
# year that had 53 weeks. 2015 and 2016 both have 53, so rows in 2016 need +1 and
# rows in 2017 need +2. Adding only +1 for 2017 would give week 1 of 2017 the same
# global number as week 53 of 2016.
YEARS_WITH_53_WEEKS = (2015, 2016)

extra_weeks = 0
for long_year in YEARS_WITH_53_WEEKS:
    # contributes 1 for rows in any year after `long_year`, else 0
    extra_weeks = extra_weeks + (data.year > long_year).astype('int')

# Global (dataset-wide) week number.
trips['all_time_week'] = data.week + 52*(data.year - year0) + extra_weeks

In [None]:
# create a weekday string map
# Label maps handed to the charts/widgets so they show names instead of numeric codes.

# day number -> weekday name
# NOTE(review): this map starts the week on Sunday (0), while the weekend flag elsewhere
# in this notebook treats day > 4 as the weekend -- confirm which convention the `day`
# column actually follows.
days_of_week_map = dict(enumerate([
    'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
]))

# month number (1-12) -> abbreviated month name
month_map = dict(enumerate(
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
    start=1,
))

# day_type code -> label; the empty-string key is presumably the "no filter" entry
day_type_map = {0: 'weekday', 1: 'weekend', '': 'all'}

In [None]:
# Narrow `trips` to the columns of interest. Each column is listed exactly once
# (a repeated name would request a duplicate column). Indexing with a list
# rebinds `trips` to a new frame; `data` keeps all of its columns.
selected_columns = [
    'year', 'month', 'week', 'day', 'hour', 'gender',
    'from_station_name', 'from_station_id', 'to_station_id',
    'x', 'y', 'to_station_name', 'all_time_week', 'day_type',
]
trips = trips[selected_columns]
trips.head()

## cuxfilter example 1

In [None]:
# Wrap the full `data` frame (not the narrowed `trips`) in a cuxfilter DataFrame.
cux_df = cuxfilter.DataFrame.from_dataframe(data)

In [None]:
# Three bar charts plus two multi-select widgets for the first dashboard.
hour_bar = cuxfilter.charts.bar('hour', title='trips per hour')
month_bar = cuxfilter.charts.bar('month', x_label_map=month_map)
day_bar = cuxfilter.charts.bar('day', x_label_map=days_of_week_map)
year_select = cuxfilter.charts.multi_select('year')
day_type_select = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)

charts = [hour_bar, month_bar, day_bar, year_select, day_type_select]

d = cux_df.dashboard(
    charts,
    layout=cuxfilter.layouts.feature_and_double_base,
    title='Bike Trips Dashboard',
)

# Uncomment to display the dashboard:
# d.show() #default parameter -> notebook_url="http://localhost:8888"

<img src="../images/cuxfilter_02_dashboard_1.png" width="800" height="500" style="margin-left:25%"/>

## cuxfilter example 2

In [None]:
# Rides per global week, a week-by-day heatmap of mean temperature, and a day-type filter.
weekly_rides_bar = cuxfilter.charts.bar('all_time_week', title='rides per week')
temperature_heatmap = cuxfilter.charts.heatmap(
    x='all_time_week',
    y='day',
    aggregate_col='temperature',
    aggregate_fn='mean',
    point_size=40,
    legend_position='right',
    title='mean temperature',
)
day_type_select = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)

charts = [weekly_rides_bar, temperature_heatmap, day_type_select]

d = cux_df.dashboard(charts, layout=cuxfilter.layouts.feature_and_base)

# Uncomment to display the dashboard:
# d.show() #default parameter -> notebook_url="http://localhost:8888"

<img src="../images/cuxfilter_02_dashboard_2.png" width="800" height="500" style="margin-left:25%"/>

## cuxfilter example 3

In [None]:
# Build a cuxfilter graph DataFrame from the (trips, edges) pair:
# trips supplies the per-node rows, edges the src/dst edge list.
cux_df = cuxfilter.DataFrame.load_graph((trips, edges))

In [None]:
# Station graph drawn over a map tile layer; nodes are keyed by from_station_id
# and aggregated with 'count'.
station_graph = cuxfilter.charts.graph(
    node_id='from_station_id',
    edge_source='src',
    edge_target='dst',
    node_aggregate_fn='count',
    node_pixel_shade_type='linear',
    node_point_size=35,
    edge_render_type='curved',  # other option available -> 'direct'
    edge_transparency=0.5,
    tile_provider='CARTODBPOSITRON',
    title='Graph for trip source_stations (color by count)',
)
year_select = cuxfilter.charts.multi_select('year')
day_type_select = cuxfilter.charts.multi_select('day_type', label_map=day_type_map)
from_station_bar = cuxfilter.charts.bar('from_station_id')
to_station_bar = cuxfilter.charts.bar('to_station_id')
station_table = cuxfilter.charts.view_dataframe(
    ['from_station_name', 'from_station_id'],
    drop_duplicates=True,
)

charts = [
    station_graph,
    year_select,
    day_type_select,
    from_station_bar,
    to_station_bar,
    station_table,
]

d = cux_df.dashboard(
    charts,
    layout=cuxfilter.layouts.feature_and_triple_base,
    theme=cuxfilter.themes.rapids,
)

# Uncomment to display the dashboard:
# d.show() #default parameter -> notebook_url="http://localhost:8888"

<img src="../images/cuxfilter_02_dashboard_3.png" width="800" height="500" style="margin-left:25%"/>

## cuxfilter example 4

#### Using force_atlas2 from cugraph to check important stations (non-geospatial layout)

In [None]:
# ForceAtlas2 settings. ITERATIONS is the value actually passed as max_iter;
# it is set to 100 so the constant and the call can no longer disagree.
ITERATIONS = 100
THETA = 1.0
OPTIMIZE = True

# Compute a ForceAtlas2 layout for the station graph G built earlier.
trips_force_atlas2_layout = cugraph.layout.force_atlas2(
    G,
    max_iter=ITERATIONS,
    strong_gravity_mode=False,
    outbound_attraction_distribution=True,
    lin_log_mode=False,
    barnes_hut_optimize=OPTIMIZE,
    barnes_hut_theta=THETA,
    verbose=True,
)

In [None]:
#merging force_atlas2 df with trips to get a final_df
final_df = trips_force_atlas2_layout.merge(
                trips[['from_station_id', 'from_station_name', 'year', 'hour', 'day_type', 'x', 'y']],
                left_on='vertex',
                right_on='from_station_id',
                suffixes=('', '_original')
)

final_df.head()

#### Cuxfilter Dashboard

In [None]:
# Build a cuxfilter graph DataFrame from the (final_df, edges) pair:
# final_df supplies the per-node rows, edges the src/dst edge list.
cux_df = cuxfilter.DataFrame.load_graph((final_df, edges))

In [None]:
# Two feature charts (ForceAtlas2 graph and the original map positions) plus
# filter widgets, bar charts and a station table.
charts = [
    cuxfilter.charts.graph(
        edge_source='src',
        edge_target='dst',
        edge_color_palette=['gray', 'black'],
        node_pixel_shade_type='linear',
        title='ForceAtlas2 Layout Graph',
        edge_render_type='curved',
    ),
    cuxfilter.charts.scatter(
        x='x_original',
        y='y_original',
        tile_provider='CARTODBPOSITRON',
        point_size=3,
        pixel_shade_type='linear',
        pixel_spread='spread',
        title='Original Layout',
    ),
    cuxfilter.charts.multi_select('year'),
    # Reuse the shared day_type_map (same labels as the other dashboards)
    # instead of repeating the literal here.
    cuxfilter.charts.multi_select('day_type', label_map=day_type_map),
    cuxfilter.charts.bar('hour', title='Trips per hour'),
    cuxfilter.charts.bar('from_station_id', title='Source station'),
    cuxfilter.charts.view_dataframe(['from_station_id', 'from_station_name'], drop_duplicates=True),
]

d = cux_df.dashboard(
    charts,
    layout=cuxfilter.layouts.double_feature_quad_base,
    theme=cuxfilter.themes.rapids,
)

# Uncomment to display the dashboard:
# d.show() #default parameter -> notebook_url="http://localhost:8888"

<img src="../images/cuxfilter_02_dashboard_4.png" width="800" height="500" style="margin-left:25%"/>

## Summary of interesting exploratory findings
A list of the interesting patterns we found and want to continue to explore