# Looking at Data

> Multiple ways to look at data 

- prettify: true

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/karpatic/karpatic/main?filepath=src%2Fipynb%2Fdatalabs%2F01_scooter_exploration.ipynb)
[![Binder](https://pete88b.github.io/fastpages/assets/badges/colab.svg)](https://colab.research.google.com/github/karpatic/karpatic/blob/main/src/ipynb/datalabs/01_scooter_exploration.ipynb)
[![Binder](https://pete88b.github.io/fastpages/assets/badges/github.svg)](https://github.com/karpatic/karpatic/blob/main/src/ipynb/datalabs/01_scooter_exploration.ipynb)
[![Open Source Love svg3](https://badges.frapsoft.com/os/v3/open-source.svg?v=103)](https://github.com/ellerbrock/open-source-badges/)

[![NPM License](https://img.shields.io/npm/l/all-contributors.svg?style=flat)](https://github.com/karpatic/karpatic/blob/main/LICENSE)
[![Active](http://img.shields.io/badge/Status-Active-green.svg)](https://karpatic.github.io) 
[![GitHub last commit](https://img.shields.io/github/last-commit/karpatic/karpatic.svg?style=flat)]()  

[![GitHub stars](https://img.shields.io/github/stars/karpatic/karpatic.svg?style=social&label=Star)](https://github.com/karpatic/karpatic) 
[![GitHub watchers](https://img.shields.io/github/watchers/karpatic/karpatic.svg?style=social&label=Watch)](https://github.com/karpatic/karpatic) 
[![GitHub forks](https://img.shields.io/github/forks/karpatic/karpatic.svg?style=social&label=Fork)](https://github.com/karpatic/karpatic) 
[![GitHub followers](https://img.shields.io/github/followers/karpatic.svg?style=social&label=Follow)](https://github.com/karpatic/karpatic)  

Today we will run through a few experiments that show different ways of working with data.

We will be using a library created by BNIA, among others.

In [None]:
#hide 
!pip install geopandas dataplay

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

Let's start from where we left off last time.

In [None]:
shortname = 'libcard'

In [None]:
# Assemble the ESRI FeatureServer query URL for the chosen layer.
# The query string asks for every record (where=1=1), every field and the
# geometry, returned in the 'pgeojson' format.
baseurl = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"
slug = "/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
url = "".join([baseurl, shortname, slug])

In [None]:
# Read the query result with geopandas, index the rows by CSA2010, and drop
# the columns we do not need. `columns=` already says which axis to drop
# from, so no `axis` argument is passed.
gdf = (
    gpd.read_file(url)
    .set_index('CSA2010')
    .drop(columns=['OBJECTID', 'Shape__Area', 'Shape__Length'])
)

In [None]:
import csv

# Write the table without its geometry column to "<shortname>.csv",
# quoting every field (csv.QUOTE_ALL).
gdf.drop(columns=['geometry']).to_csv(shortname + '.csv', quoting=csv.QUOTE_ALL)

In [None]:
# Attribute-only copy of the data: everything except the geometry column.
df = gdf.drop(columns=['geometry'])

In [None]:
df.head()

In [None]:
# Flip the table so every column of df becomes a row, then move the row
# labels out of the index into an ordinary column.
test = df.T.reset_index()
test.head(1)

In [None]:
# Reshape to long form: 'index' stays as the identifier and every other
# column is stacked into 'variable' / 'value' pairs. ignore_index=False
# keeps the original row labels instead of renumbering them.
value_columns = test.columns[1:].values
test = pd.melt(test, id_vars=['index'], value_vars=value_columns, ignore_index=False)

https://seaborn.pydata.org/examples/horizontal_boxplot.html

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="ticks")

# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")

# Draw one horizontal box per 'index' label of the long-form `test` frame.
# whis=[0, 100] stretches the whiskers to the 0th and 100th percentiles.
sns.boxplot(x="value", y="index", data=test,
            whis=[0, 100], width=.6, palette="vlag")

# Add in points to show each observation
sns.stripplot(x="value", y="index", data=test,
              size=4, color=".3", linewidth=0)

# Tweak the visual presentation
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

In [None]:
import seaborn as sns

sns.set_theme(style="ticks")

# Pairwise plots of the columns, coloured by CSA.
# reset_index() turns the CSA2010 index back into a column so it can be the hue.
pair_data = df.reset_index()
sns.pairplot(pair_data, hue="CSA2010")

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
from pandas import DataFrame
import seaborn as sns
%matplotlib inline
# We can change the size of our images like this:
plt.figure(figsize=(10,10))

# And heatmaps are as simple as this:
sorted_df = df.sort_values(by=['libcard19'], ascending = False)
sns.heatmap(sorted_df)

In [None]:
df.plot.line()

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from branca.colormap import linear
from dataplay import intaker 
# conditionally loaded ->  from dataplay import geoms

# Fetch the 'Biz1_' layer through dataplay's Intake helper.
u = intaker.Intake
biz1_url = 'https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson'
rdf = u.getData(biz1_url)
# rdf.set_index('CSA2010', drop=True, inplace=True)
# Remove the columns that are not needed, modifying rdf in place.
rdf.drop(columns=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], inplace=True)

# Keep only the columns whose name contains 'biz1' or 'CSA2010'.
ndf = rdf.filter(regex='biz1|CSA2010', axis=1)

# Calculate number of years available: every column of ndf except one.
# NOTE(review): assumes ndf holds CSA2010 plus one column per year - confirm.
n_periods = len(ndf.columns) - 1
# Get starting year from the last two characters of the second column name.
# NOTE(review): assumes that column ends in a two-digit year (e.g. 'biz1_10') - confirm.
startAt = "20" + ndf.columns[1][-2:]

# Create a 'YEAR' index with the assumption that all following years exist
datetime_index = pd.date_range(startAt, periods=n_periods, freq="Y")
# Convert the timestamps to whole epoch seconds. The integer width is spelled
# out as "int64" because plain `int` is platform-dependent and a datetime
# index cannot be cast to a 32-bit integer.
dt_index_epochs = datetime_index.astype("int64") // 10 ** 9
# The style dictionary built later is keyed by these epochs as strings.
dt_index = dt_index_epochs.astype("U10")

# Build one small frame per row of rdf, keyed by that row's index label.
# Each frame has a single 'color' column indexed by the epoch strings in dt_index.
styledata = {}
# For the Index of each CSA
# The slice [1:-1] skips the first and the last value of the row.
# NOTE(review): presumably the first value is CSA2010 and the last is the
# geometry, leaving exactly n_periods indicator values - confirm against
# rdf's column order.
# NOTE: this loop rebinds the notebook-level name `df`.
for idx, csa in rdf.iterrows():
    df = pd.DataFrame( { "color": csa.values[1:-1] }, index=dt_index, )
    styledata[idx] = df

# Find the bounds of the colour scale across every frame in styledata.
# Both bounds start at 0, so the resulting range always includes zero.
max_color, min_color = 0, 0
for data in styledata.values():
    max_color = max(max_color, data["color"].max())
    # Compare against the running minimum. The earlier version compared
    # against max_color, so min_color only reflected the last frame visited.
    min_color = min(min_color, data["color"].min())

# Colour ramp scaled to the value range found above.
cmap = linear.PuRd_09.scale(min_color, max_color)


def norm(x):
    """Min-max scale x: subtract its minimum and divide by its range."""
    return (x - x.min()) / (x.max() - x.min())


# Replace every raw value with the colour the ramp assigns to it and give
# each entry an opacity of 1.
for csa_id, frame in styledata.items():
    frame["color"] = frame["color"].apply(cmap)
    frame["opacity"] = 1

# Convert the frames to nested plain dicts keyed by the stringified row label.
styledict = {}
for csa_id, frame in styledata.items():
    styledict[str(csa_id)] = frame.to_dict(orient="index")

# { CSA : { timestamp: {color: value, opacity:value } }, 
#    CSA : { timestamp: {color: value, opacity:value } }, 
#    ... 
# }

import folium
from folium.plugins import TimeSliderChoropleth

# Base map centred on the given latitude/longitude, with width and height set to 75%.
m = folium.Map([39.28759453969165, -76.61278931706487], width='75%', height='75%', zoom_start=12)

# Add the time-slider layer: the shapes come from rdf as GeoJSON and the
# per-timestamp colours come from styledict.
slider_layer = TimeSliderChoropleth(rdf.to_json(), styledict=styledict)
g = slider_layer.add_to(m)

# Save the map to an HTML file, then show it as the cell output.
m.save(outfile="test.html")
m

In [None]:
# Reload the 'Biz1_' layer through dataplay's Intake helper.
u = intaker.Intake
rdf = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson')
# rdf.set_index('CSA2010', drop=True, inplace=True)
# Remove the columns that are not needed, then order the rows by 'biz1_19',
# largest first. Both operations modify rdf in place.
rdf.drop(labels=['OBJECTID_1', 'Shape__Area', 'Shape__Length'], axis=1, inplace=True)
rdf.sort_values(by=['biz1_19'], ascending=False, inplace=True)

# Keep only the columns whose name contains 'biz1' or 'CSA2010'.
vs10to19Ind = rdf.filter(regex='biz1|CSA2010', axis=1)

What we want is 1 record for every year and every CSA as a column. To do this, transpose the dataset. Set the CSA labels (first row) as our columns, relabel the index (for clarity) and cast our datatypes.

What we want is 1 record for every year and every CSA as a column. To do this, transpose the dataset. Set the CSA labels (first row) as our columns, relabel the index (for clarity) and cast our datatypes.

In [None]:
# Transpose so each original column becomes a row.
flipped = vs10to19Ind.transpose()
# The first row of the transposed table supplies the column labels...
flipped.columns = flipped.iloc[0]
# ...so it is dropped from the data once it has been used as the header.
without_header = flipped.iloc[1:]
without_header.index.name = 'variable'
# Transposing mixed columns leaves object dtype; cast the values to floats.
vs10to19Indt = without_header.astype('float64')

In [None]:
# Create a correlation matrix between the columns of the transposed table
cor_matrix = vs10to19Indt.corr()
# Show the first 5 rows
cor_matrix.head(5)

In [None]:
import matplotlib.pyplot as plt

# Work on a copy so the plotted frame can be changed without touching the source.
df = vs10to19Indt.copy()
numeric_cols = df.select_dtypes(['number']).columns

# Draw the correlation matrix as an image on a large figure.
f = plt.figure(figsize=(19, 15))
plt.matshow(df.corr(), fignum=f.number)
# Only the y axis is labelled; the x labels are left switched off.
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(range(len(numeric_cols)), numeric_cols, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
# Create a correlation matrix between the columns of the transposed table
cor_matrix = vs10to19Indt.corr()
# Show the first 5 rows
cor_matrix.head(5)

In [None]:
#extracts the indices from the correlation matrix
lblVals = cor_matrix.index.values

In [None]:
# Convert the correlation DataFrame to a plain 2-D array, which is the input
# networkx builds a graph from. np.asarray is used instead of the discouraged
# np.asmatrix.
cor_matrix = np.asarray(cor_matrix)
# Create the graph from the correlation matrix. from_numpy_array replaces
# from_numpy_matrix, which was removed in NetworkX 3.0.
G = nx.from_numpy_array(cor_matrix)

# Relabel the integer nodes with the labels taken from the correlation matrix index
G = nx.relabel_nodes(G, lambda x: lblVals[x])

# Show the first 5 edges with their attributes
# OLD: G.edges(data=True)[:5]
list(G.edges(data=True))[0:5]

In [None]:
!pip install VitalSigns

In [None]:
import VitalSigns

In [None]:
from dataplay import corr

In [None]:
corr.create_corr_network_5(G, corr_direction="positive",min_correlation=0.7)

In [None]:
corr.create_corr_network_5(G, corr_direction="negative",min_correlation=-0.7)

We want to fit a linear regression for each CSA, using {X: year, Y: value} for a given indicator.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create 3 columns: CSA2010, variable, value
wdf = vs10to19Ind.melt(id_vars='CSA2010', value_vars=vs10to19Ind.columns[1:])

# Convert indicator labels into our X (year) column, e.g. 'biz1_10' -> 10
wdf['variable'] = wdf['variable'].apply(lambda x: int(x.replace('biz1_', '')))

# Collect the intercept (B) and slope (M) of the fitted line for every CSA
findf = {'CSA': [], 'B': [], 'M': []}
for csa in wdf.CSA2010.unique():
    CsaData = wdf[wdf['CSA2010'] == csa]
    X = CsaData[['variable']]
    y = CsaData[['value']]
    regressor = LinearRegression()
    regressor.fit(X, y)
    # Predict once and reuse the result for the plot.
    y_pred = regressor.predict(X)

    # Observations in red, fitted line in blue
    plt.scatter(X, y, color='red')
    plt.plot(X, y_pred, color='blue')
    plt.title('biz1: ' + csa)
    plt.xlabel('YEAR')
    plt.ylabel('VALUE')
    # plt.show() and print() both return None, so they are called directly
    # rather than being passed to display().
    plt.show()
    # Labels match the keys stored below: M is the slope, B is the intercept.
    print('M: ', regressor.coef_, 'B: ', regressor.intercept_)

    findf['CSA'].append(csa)
    findf['B'].append(regressor.intercept_[0])
    findf['M'].append(regressor.coef_[0][0])

In [None]:
lin_reg_df = pd.DataFrame(data=findf)

In [None]:
lin_reg_df.head()

In [None]:
# Transpose so each original column becomes a row.
flipped = lin_reg_df.transpose()
# The first row of the transposed table supplies the column labels...
flipped.columns = flipped.iloc[0]
# ...so it is dropped from the data once it has been used as the header.
without_header = flipped.iloc[1:]
without_header.index.name = 'variable'
# Transposing mixed columns leaves object dtype; cast the values to floats.
lin_reg_dft = without_header.astype('float64')

In [None]:
lin_reg_dft

We may need to normalize the data for this to be usable.

In [None]:
import matplotlib.pyplot as plt

# Work on a copy so the plotted frame can be changed without touching the source.
df = lin_reg_dft.copy()
numeric_cols = df.select_dtypes(['number']).columns

# Draw the correlation matrix as an image on a large figure.
f = plt.figure(figsize=(19, 15))
plt.matshow(df.corr(), fignum=f.number)
# Only the y axis is labelled; the x labels are left switched off.
# plt.xticks(irange, labels, fontsize=14, rotation=45)
plt.yticks(range(len(numeric_cols)), numeric_cols, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
# This dataset is taken from the public database provided by BNIAJFI hosted by Esri / ArcGIS
# BNIA ArcGIS Homepage: https://data-bniajfi.opendata.arcgis.com/
final = u.getData('https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Biz1_/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson')
final.head(1)

In [None]:
final['centroid'] = final['geometry'].representative_point()

In [None]:
# Display numbers without decimal places. The option is addressed by its full
# name: the bare 'precision' is not accepted by current pandas releases.
pd.set_option('display.precision', 0)

# Settings for the GIF map built further down.
fileNames = []
labelBounds = True
specialLabelCol = False # Labels on GEOM Centroids
saveGifAs = './test.gif'
label = 'Household Poverty'
annotation = 'Source: Maryland Vital Statistics; Analysis by: Baltimore Neighborhood Indicators Alliance'
fontsize = '22'

In [None]:
# Get only the results tab
td = final.copy()
td = td.reindex(sorted(td.columns), axis=1)

In [None]:
# Pick the columns the GIF is built from: every column whose name matches the pattern.
regexMatchingColumnsToMakeTheGifWith = 'biz1'
gifCols = td.filter(regex=regexMatchingColumnsToMakeTheGifWith).columns.values

# Coerce those columns to integers so values display as whole numbers.
# Missing values are replaced with -1 first.
td[gifCols] = td[gifCols].fillna(-1).astype('int32')
td.head()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html

In [None]:
final.filter(regex=regexMatchingColumnsToMakeTheGifWith).hist(figsize=(20, 10), bins=10)

In [None]:
# Arguments for the GIF map: text shown on the frames, the output file,
# and whether boundaries are labelled.
title = 'Indicator Name'
annotation = 'Source: Baltimore Neighborhood Indicators Alliance'
fontsize = '22'
saveGifAs = './test.gif'
labelBounds = False  # 'CSA2010'

In [None]:
from dataplay import gifmap
from dataplay.gifmap import getAbsMinMax

In [None]:
import re


def _to_final_label(column_name):
    """Replace every 'biz1_' in a column label with 'final'."""
    return re.sub('biz1_', 'final', column_name)


# Relabel the columns of td with the helper above.
td = td.rename(columns=_to_final_label)

In [None]:
gifmap.createGifMap(td, saveGifAs, labelBounds, title, annotation, fontsize)