### Exploratory Data Analysis
In this notebook we will explore the datasets in more detail to try to understand the data a bit better. We will also try to find some insights that will help us build a better model. We will approach the datasets in the same order as they were introduced in the previous notebook:
1. Zurich Statistical Geospatial Data
2. Zurich Dog Dataset
3. Zurich Population Dataset
4. Zurich Income Dataset
5. Zurich Household Dataset

We start with the necessary imports and load the datasets.

### Imports and Configuration

In [1]:
from IPython.display import clear_output, display
from panel import widgets as pnw  # For widgets and formatting
import numpy as np  # For number computing
import pandas as pd  # For data manipulation
import panel as pn
from bokeh.models import FixedTicker
import holoviews as hv
from holoviews import opts
import geoviews as gv
from geoviews import tile_sources as gvt
import geopandas as gpd
import hvplot.pandas  # noqa
import spatialpandas as spd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm  # Progress bars
from wordcloud import WordCloud  # For generating word cloud visualizations

import helper_functions as hf

clear_output()

In [2]:
# Select bokeh as the plotting backend for hvplot, holoviews and geoviews.
hvplot.extension("bokeh")
hv.extension("bokeh")
gv.extension("bokeh")
# Load the Panel notebook extension.
# NOTE(review): Panel's documented keyword appears to be `template` (singular);
# confirm that `templates="fast"` has the intended effect.
pn.extension(templates="fast")
# Set Panel's global `throttled` config flag.
pn.config.throttled = True

# Remove whatever the extension calls printed/rendered in this cell.
clear_output()

#### Load in the data


In [3]:
# Spatial layers (neighborhoods, districts) and the tabular district descriptions.
neighborhood_gdf = gpd.read_file("../data/zurich_neighborhoods.geojson")
district_gdf = gpd.read_file("../data/zurich_districts.geojson")
district_desc = pd.read_csv("../data/zurich_districts.csv")

# Training split of the dog data. The data types were lost when saving to CSV,
# so both identifier columns are cast back to strings and left-padded with
# zeros (6 characters for owner ids, 3 for sub-districts).
dog_data_train = pd.read_csv("../data/processed_dog_data_train.csv").assign(
    owner_id=lambda df: df["owner_id"].astype("string").str.zfill(6),
    sub_district=lambda df: df["sub_district"].astype("string").str.zfill(3),
)

In [4]:
# Plot options shared by the neighborhood and the district polygon layers.
poly_opts = {
    "width": 600,
    "height": 600,
    "color_index": None,
    "xaxis": None,
    "yaxis": None,
    "backend_opts": {"toolbar.autohide": True},
}

# Neighborhood layer: thin sky-blue outlines, no fill, hover + tap tools.
neighborhood_poly = gv.Polygons(neighborhood_gdf).opts(
    **poly_opts,
    tools=["hover", "tap"],
    line_color="skyblue",
    line_alpha=0.5,
    line_width=2,
    fill_color="lightgray",
    fill_alpha=0,
)

# District layer: the geometries are merged with the district descriptions so
# that the description columns travel with each polygon.
district_poly = gv.Polygons(district_gdf.merge(district_desc)).opts(
    **poly_opts,
    tools=["tap", "box_select"],
    line_color="pink",
    line_alpha=0.5,
    line_width=3,
    fill_alpha=0.02,
)

# Satellite imagery tiles used as the background layer.
basemap = gvt.EsriImagery()

# Stack the layers: imagery at the bottom, districts on top.
district_neighborhood_overlay = (
    basemap * neighborhood_poly * district_poly
).opts(title="Zurich Districts and Neighborhoods")

# Render the overlay in a Panel pane.
pn.pane.HoloViews(district_neighborhood_overlay)

BokehModel(combine_events=True, render_bundle={'docs_json': {'07e9f668-4e19-4561-9c18-eb4d0fac8e96': {'version…

In [5]:
# Create a stream which selects a district from the map: a holoviews
# Selection1D stream whose source is the `district_poly` layer. Its `index`
# parameter is what the `pn.depends` callbacks in this cell react to.
select_district = hv.streams.Selection1D(source=district_poly)


@pn.depends(select_district.param.index)
def display_info(index):
    """Render an info card for the district currently selected on the map.

    Args:
        index: Selected indices reported by the `select_district` stream.
            Only the first entry is used.

    Returns:
        A `pn.pane.Markdown` pane: a placeholder message when nothing is
        selected, otherwise a 300 px wide HTML card with the district number,
        name, description and a link to the source.
    """
    if index:
        info_columns = ["district", "district_name", "desc", "link"]
        district_row = (
            district_poly.iloc[index[0]].data[info_columns].drop_duplicates()
        )
        # First value of every info column, in the order listed above.
        dnum, dname, ddesc, link = (
            district_row[column].values[0] for column in info_columns
        )
        card_html = f"""
            <div style="
            border: 2px solid #4a4a4a;
            border-radius: 10px;
            padding: 20px 20px 20px 20px;
            background-color: #f9f9f9;
            box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
            word-wrap: break-word;
            ">
            <h2 style='color: #008080;'>{dnum}</h2>
            <h1 style='color: #000080;'>{dname}</h1>
            <h3 style='color: #708090;'>{ddesc}</h3>
            <a href="{link}" >Source</a>
            </div>
            
            """
        return pn.pane.Markdown(card_html, width=300)

    return pn.pane.Markdown("No district selected")


def _district_mask(polygon, image_width=800, margin=0.1):
    """Rasterise a shapely (Multi)Polygon into a mask array for `WordCloud`.

    Args:
        polygon: Shapely Polygon or MultiPolygon in any planar coordinates.
        image_width: Width of the mask in pixels; the height follows from the
            aspect ratio of the padded bounding box.
        margin: Total padding added to each axis as a fraction of the
            polygon's extent, split evenly between both sides.

    Returns:
        2-D integer array that is 0 inside the polygon and 255 outside
        (`WordCloud` only draws where the mask is not white).
    """
    minx, miny, maxx, maxy = polygon.bounds
    span_x = maxx - minx
    span_y = maxy - miny

    # Pad by margin / 2 of the polygon's own extent on every side. Shifting by
    # a fraction of the already padded extent would leave the shape off-centre.
    minx -= span_x * margin / 2
    miny -= span_y * margin / 2
    width = span_x * (1 + margin)
    height = span_y * (1 + margin)

    # Image with the same aspect ratio as the padded bounding box.
    image_height = int(image_width * height / width)
    canvas = Image.new("1", (image_width, image_height), 0)
    draw = ImageDraw.Draw(canvas)

    # A MultiPolygon has no `exterior`; draw each of its parts instead.
    parts = list(polygon.geoms) if polygon.geom_type == "MultiPolygon" else [polygon]
    for part in parts:
        # Keep x/y only so that 3-D coordinates do not break the arithmetic.
        coords = np.array(list(part.exterior.coords), dtype=float)[:, :2]
        coords -= [minx, miny]
        coords *= [image_width / width, image_height / height]
        # Image rows grow downwards while map y grows upwards.
        coords[:, 1] = image_height - coords[:, 1]
        draw.polygon(list(map(tuple, coords)), outline=1, fill=1)

    # Invert so that the polygon interior is the drawable (non-white) area.
    return ~np.array(canvas) * 255


@pn.depends(select_district.param.index)
def display_wordcloud(index):
    """Display a word cloud for the selected district.

    The cloud is generated from the district number, name and description and
    is drawn inside the outline of the district polygon.

    Args:
        index: Selected indices reported by the `select_district` stream.
            Only the first entry is used.

    Returns:
        An `hv.RGB` element: a placeholder cloud when nothing is selected,
        otherwise the district-shaped cloud.
    """
    # `not index` also covers None and matches the check in `display_info`.
    if not index:
        text = "district select on map"
        wordcloud = WordCloud(width=800, height=500, background_color="white").generate(
            text
        )
        return hv.RGB(np.array(wordcloud)).opts(
            width=500, height=500, active_tools=["box_zoom"]
        )

    selection = district_poly.iloc[index[0]].data
    selected_district = selection[
        ["district", "district_name", "desc"]
    ].drop_duplicates()
    dname = selected_district["district_name"].values[0]
    dnum = selected_district["district"].values[0]
    ddesc = selected_district["desc"].values[0]
    text = f"{dnum} {dname} {ddesc}"

    polygon = selection["geometry"].iloc[0]

    wordcloud = WordCloud(
        mask=_district_mask(polygon),
        include_numbers=True,
        margin=20,
        contour_width=5,
        width=800,
        height=500,
        background_color="white",
    ).generate(text)
    return hv.RGB(np.array(wordcloud)).opts(
        width=800,
        height=500,
        tools=["box_zoom"],
        active_tools=["box_zoom"],
        xaxis=None,
        yaxis=None,
        backend_opts={"toolbar.autohide": True},
    )


# Word cloud on top; the map and the info card side by side underneath.
# `display_wordcloud` and `display_info` are `pn.depends` callbacks, so both
# re-render whenever the district selection on the map changes.
district_layout = pn.Column(
    pn.pane.HoloViews(display_wordcloud),
    pn.Row(
        neighborhood_poly * district_poly,
        pn.panel(display_info, width=300),
    ),
    sizing_mode="stretch_width",
)

# Wrap everything in a titled card (title was truncated to "District Descript").
district_layout_card = pn.Card(
    district_layout,
    title="District Description",
    sizing_mode="stretch_width",
)
district_layout_card

BokehModel(combine_events=True, render_bundle={'docs_json': {'591d71e3-a375-4910-80de-186b47a7567f': {'version…

In [6]:

# A single row from the dog data, transposed so every column is readable.
# The summary statistics that used to be computed here were never rendered
# (only the last expression of a cell is displayed), so that discarded
# computation is dropped. The fixed seed keeps the displayed row stable
# across re-runs.
dog_data_train.sample(random_state=42).T

Unnamed: 0,5290
roster,2015
owner_id,119421
dog_size,K
dog_age,1
age_group_10,40
age_group_20,40
mixed_type,PB
is_pure_breed,True
is_male_owner,False
is_male_dog,False


In [7]:
def update_xaxis(plot, element):
    """Plot hook that pins the x-axis ticks to the years 2015 through 2022.

    Args:
        plot: Plot object handed to the hook; its `state.xaxis.ticker` is
            replaced.
        element: Element being rendered (unused).
    """
    year_ticks = [*range(2015, 2023)]
    plot.state.xaxis.ticker = FixedTicker(ticks=year_ticks)


# Number of dogs per roster year.
dogs_total_by_roster = dog_data_train.groupby("roster").size()
print(f"Total number of dogs per year:\n{dogs_total_by_roster}")

# Yearly totals as a bar chart (note: the variable is named `_line`).
total_dogs_line = dogs_total_by_roster.hvplot(kind="bar").opts(
    title="Total Dogs Registered Each Year",
    show_legend=False,
    active_tools=["box_zoom"],
)

# Year-over-year change in percent. The first year has no predecessor and is
# reported as 0. Drawn as a line chart (note: the variable is named `_bar`).
dog_count_yoy_pct_change = dogs_total_by_roster.pct_change().fillna(0).mul(100)
total_dogs_yoy_bar = dog_count_yoy_pct_change.hvplot.line().opts(
    title="YOY % Change in Dog Count",
    ylabel="%",
    active_tools=["box_zoom"],
    hooks=[update_xaxis],
)

# Stack both charts in a single column with independent axes.
(total_dogs_line + total_dogs_yoy_bar).cols(1).opts(shared_axes=False)

Total number of dogs per year:
roster
2015    6964
2016    6917
2017    7145
2018    7395
2019    7643
2020    7840
dtype: int64


In [8]:
scaler = StandardScaler()

# Dog counts as a table with one row per sub-district and one column per roster year.
dog_count_by_sub_d_roster = (
    dog_data_train.groupby(["roster", "sub_district"], as_index=False)
    .size()
    .pivot(index="sub_district", columns="roster", values="size")
)

# Standardise the counts column by column (i.e. per roster year) and restore
# the sub-district index and roster columns that the scaler strips.
dog_count_df_std = pd.DataFrame(
    data=scaler.fit_transform(dog_count_by_sub_d_roster),
    index=dog_count_by_sub_d_roster.index,
    columns=dog_count_by_sub_d_roster.columns,
)

# Year-over-year percent change per sub-district (missing values become 0),
# standardised per roster year in the same way.
dog_count_pct_change_std = pd.DataFrame(
    data=scaler.fit_transform(
        dog_count_by_sub_d_roster.pct_change(axis=1).fillna(0).mul(100)
    ),
    index=dog_count_by_sub_d_roster.index,
    columns=dog_count_by_sub_d_roster.columns,
)

# Scatter of standardised count against standardised percent change, one
# colour per roster year, with dashed reference lines through the origin.
(
    dog_count_df_std.unstack()
    .reset_index(name="count_std")
    .merge(dog_count_pct_change_std.unstack().reset_index(name="pct_change_std"))
    .hvplot.scatter(
        x="pct_change_std",
        y="count_std",
        by="roster",
        width=600,
        height=600,
        xlim=(-3, 3),
        ylim=(-3, 3),
    )
    * hv.VLine(0).opts(color="lightgray", line_dash="dashed")
    * hv.HLine(0).opts(color="lightgray", line_dash="dashed")
)

In [9]:
# Same plot as the previous cell, but the percent change is used as-is:
# no fillna(0) and no standardisation.
dog_count_pct_change_long = (
    dog_count_by_sub_d_roster.pct_change(axis=1)
    .unstack()
    .reset_index(name="pct_change")
)
dog_count_long = (
    dog_count_by_sub_d_roster.unstack().reset_index(name="count").dropna()
)

# Standardise all counts together as one pooled column. The scaler is fitted
# on the rows that survived dropna(), so the result always has the same length
# as `dog_count_long`. Fitting on the full table and assigning it to the
# filtered frame raises a length mismatch as soon as a sub-district/year
# combination is missing.
dog_count_std = scaler.fit_transform(dog_count_long[["count"]].to_numpy())
dog_count_long["count_std"] = dog_count_std.ravel()

(
    dog_count_long.merge(dog_count_pct_change_long).hvplot.scatter(
        by="roster",
        y="count_std",
        x="pct_change",
        height=600,
        width=600,
    )
    * hv.VLine(0).opts(color="lightgray", line_dash="dashed")
    * hv.HLine(0).opts(color="lightgray", line_dash="dashed")
)



In [10]:
# Display a sample of the training data, its column overview and summary
# statistics. Only the last expression of a cell is rendered automatically,
# so the sample is passed to display() explicitly; the fixed seed keeps the
# shown rows stable across re-runs.
display(dog_data_train.sample(3, random_state=42))
dog_data_train.info()

(
    dog_data_train.describe(include="all")
    .T.round(2)
    .infer_objects()
    .sort_values(by="unique")
    .fillna("")
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43904 entries, 0 to 43903
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   roster             43904 non-null  int64 
 1   owner_id           43904 non-null  string
 2   dog_size           43904 non-null  object
 3   dog_age            43904 non-null  int64 
 4   age_group_10       43904 non-null  int64 
 5   age_group_20       43904 non-null  int64 
 6   mixed_type         43904 non-null  object
 7   is_pure_breed      43904 non-null  bool  
 8   is_male_owner      43904 non-null  bool  
 9   is_male_dog        43904 non-null  bool  
 10  dog_color_en       43904 non-null  object
 11  standard           43904 non-null  object
 12  standard_2         43904 non-null  object
 13  pet_count          43904 non-null  int64 
 14  district           43904 non-null  int64 
 15  sub_district       43904 non-null  string
 16  age_group_missing  43904 non-null  int64

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
is_pure_breed,43904.0,2.0,True,31443.0,,,,,,,
is_male_owner,43904.0,2.0,False,30290.0,,,,,,,
is_male_dog,43904.0,2.0,False,22190.0,,,,,,,
dog_size,43904.0,3.0,K,27609.0,,,,,,,
mixed_type,43904.0,4.0,PB,31443.0,,,,,,,
sub_district,43904.0,34.0,092,3458.0,,,,,,,
standard_2,43904.0,101.0,none,31443.0,,,,,,,
dog_color_en,43904.0,178.0,black,4627.0,,,,,,,
standard,43904.0,239.0,unknown,5480.0,,,,,,,
owner_id,43904.0,10945.0,105585,71.0,,,,,,,


In [None]:
# Number of dogs per roster year, sub-district and breed ("standard").
# After reset_index the rows are ordered by roster first, so the per-group
# diff() calls below run in chronological order.
yearly_counts = (
    dog_data_train.groupby(["roster", "sub_district", "standard"])
    .size()
    .reset_index(name="size")
)

# Change in count relative to the previous row of the same breed/sub-district.
# NOTE(review): combinations with no dogs in a year have no row at all, so the
# difference may span more than one year and a breed's first appearance counts
# as 0 change. Confirm whether missing years should be filled with 0 first.
yearly_counts["yearly_change"] = (
    yearly_counts.groupby(["sub_district", "standard"])["size"].diff().fillna(0)
)

# Change of the change (second difference).
yearly_counts["change_of_change"] = (
    yearly_counts.groupby(["sub_district", "standard"])["yearly_change"]
    .diff()
    .fillna(0)
)

# z-score of the second difference within each breed/sub-district series;
# series with zero or undefined spread produce NaN and are set to 0.
yearly_counts["change_of_change_normalized"] = (
    yearly_counts.groupby(["sub_district", "standard"])["change_of_change"]
    .transform(lambda x: (x - x.mean()) / x.std())
    .fillna(0)
    .round(2)
)

# Rank the breeds within every roster year and sub-district (1 = strongest
# acceleration). Ties receive their average rank.
yearly_counts["rank"] = yearly_counts.groupby(["roster", "sub_district"])[
    "change_of_change_normalized"
].rank(ascending=False, na_option="bottom")

# Target variable: the breed holding rank 1 on its own.
yearly_counts["is_emerging"] = (yearly_counts["rank"] == 1).astype(int)

# A second difference needs two preceding years, so all 2015 and 2016 records
# are set to non-emerging.
yearly_counts.loc[yearly_counts["roster"].isin([2015, 2016]), "is_emerging"] = 0

# Show the emerging breeds of the years that can actually be labelled. The
# other sorted views that used to precede this line were computed but never
# rendered, because only the last expression of a cell is displayed.
(
    yearly_counts.query("roster > 2016")
    .sort_values(by=["is_emerging", "roster"], ascending=False)
    .head(50)
)

In [None]:
# Group by roster and sub-district and count the number of dogs of each breed:
# rows are (roster, sub_district), one column per breed, 0 where a breed is absent.
breed_count_by_roster_sub_d = (
    dog_data_train.groupby(["roster", "sub_district", "standard"])
    .size()
    .unstack(level=2)
    .fillna(0)
)

grouped_breed_count = breed_count_by_roster_sub_d.groupby(level="sub_district")

# Year-over-year change per breed within each sub-district. These results used
# to be computed and thrown away; they are kept under explicit names so they
# can be inspected or reused.
breed_count_yoy_diff = (
    breed_count_by_roster_sub_d.sort_index(level="roster")
    .groupby(level="sub_district")
    .diff()
    .fillna(0)
)
# NOTE(review): a change starting from a count of 0 yields inf, which
# fillna(0) does not replace. Decide how such cases should be treated.
breed_count_yoy_pct_change = grouped_breed_count.pct_change().fillna(0)

# Total number of dogs per breed and sub-district over all roster years.
total_breed_count = grouped_breed_count.sum()

# TODO: filter the breed counts down to the breeds in `breeds_more_than_threshold`
# (that list is not defined in this part of the notebook).
breed_count_by_roster_sub_d.head()