### Exploratory Data Exploration
In this notebook we will explore the datasets in more details toi try to underestand the data a bit better. We will also try to find some insights that will help us to build a better model.We will approach the dataseets i the same order as they werre intrroduced in the previous notebook.
1. Zurich Statistical Geospatial Data
2. Zurich Dog Dataset
3. Zurich Population Dataset
4. Zurich Income Dataset
5. Zurich Household Dataset

We start with the ncessary imports and load the datasets.

### Imports and Configuration

In [74]:
from IPython.display import clear_output
from panel import widgets as pnw  # For widgets and formatting
import numpy as np  # For number computing
import pandas as pd  # For data manipulation
import panel as pn
from bokeh.models import FixedTicker
import holoviews as hv
from holoviews import opts
import geoviews as gv
from geoviews import tile_sources as gvt
import geopandas as gpd
import hvplot.pandas  # noqa
import spatialpandas as spd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import tqdm  # Progress bars
from wordcloud import WordCloud  # For generating word cloud visualizations

import helper_functions as hf

clear_output()

In [26]:
hvplot.extension("bokeh")
hv.extension("bokeh")
gv.extension("bokeh")
pn.extension(templates="fast")
pn.config.throttled = True

clear_output()

#### Load in the data


In [40]:
neighborhood_gdf = gpd.read_file("../data/zurich_neighborhoods.geojson")
district_gdf = gpd.read_file("../data/zurich_districts.geojson")
district_desc = pd.read_csv("../data/zurich_districts.csv")
dog_data_train = pd.read_csv("../data/processed_dog_data_train.csv")
# Fix data types as they were lost when saving to csv
dog_data_train["owner_id"] = dog_data_train["owner_id"].astype("string").str.zfill(6)
dog_data_train["sub_district"] = (
    dog_data_train["sub_district"].astype("string").str.zfill(3)
)

In [41]:
poly_opts = dict(
    width=600,
    height=600,
    color_index=None,
    xaxis=None,
    yaxis=None,
    backend_opts={"toolbar.autohide": True},
)
# Neighborhood polygons
neighborhood_poly = gv.Polygons(neighborhood_gdf).opts(
    tools=["hover", "tap"],
    **poly_opts,
    line_color="skyblue",
    line_width=2,
    fill_color="lightgray",
    fill_alpha=0,
    line_alpha=0.5,
)
# district polygons
district_poly = gv.Polygons(district_gdf.merge(district_desc)).opts(
    **poly_opts,
    line_color="pink",
    fill_alpha=0.02,
    tools=["tap", "box_select"],
    line_width=3,
    line_alpha=0.5,
)

# add a basemap
basemap = gv.tile_sources.EsriImagery()  # .opts(alpha=0.5, bgcolor="black")


district_neighborhood_overlay = (basemap * neighborhood_poly * district_poly).opts(
    title="Zurich Districts and Neighborhoods"
)

# Create a panel for the map
pn.pane.HoloViews(district_neighborhood_overlay)

BokehModel(combine_events=True, render_bundle={'docs_json': {'0f6f8357-c542-420f-8c58-de03ec0429fb': {'version…

In [42]:
# create a stream which selects a district from the map
select_district = hv.streams.Selection1D(source=district_poly)


@pn.depends(select_district.param.index)
def display_info(index):
    """Displays a brief description of the selected district"""
    if not index:
        return pn.pane.Markdown("No district selected")
    else:
        selected_district = (
            district_poly.iloc[index[0]]
            .data[["district", "district_name", "desc", "link"]]
            .drop_duplicates()
        )
        dname = selected_district["district_name"].values[0]
        dnum = selected_district["district"].values[0]
        ddesc = selected_district["desc"].values[0]
        link = selected_district["link"].values[0]
        return pn.pane.Markdown(
            f"""
            <div style="
            border: 2px solid #4a4a4a;
            border-radius: 10px;
            padding: 20px 20px 20px 20px;
            background-color: #f9f9f9;
            box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
            word-wrap: break-word;
            ">
            <h2 style='color: #008080;'>{dnum}</h2>
            <h1 style='color: #000080;'>{dname}</h1>
            <h3 style='color: #708090;'>{ddesc}</h3>
            <a href="{link}" >Source</a>
            </div>
            
            """,
            width=300,
        )


@pn.depends(select_district.param.index)
def display_wordcloud(index):
    """Displays a wordcloud of the selected district based on the description
    of the district in the shape of the district polygon"""
    if len(index) == 0:
        text = "district select on map"
        wordcloud = WordCloud(width=800, height=500, background_color="white").generate(
            text
        )
        return hv.RGB(np.array(wordcloud)).opts(
            width=500, height=500, active_tools=["box_zoom"]
        )
    else:
        selected_district = (
            district_poly.iloc[index[0]]
            .data[["district", "district_name", "desc"]]
            .drop_duplicates()
        )
        dname = selected_district["district_name"].values[0]
        dnum = selected_district["district"].values[0]
        ddesc = selected_district["desc"].values[0]
        text = f"{dnum} {dname} {ddesc}"

        polygon = district_poly.iloc[index[0]].data["geometry"].iloc[0]

        # Get the bounding box of the polygon
        minx, miny, maxx, maxy = polygon.bounds

        # Calculate the width and height of the bounding box
        margin = 0.1
        width = (maxx - minx) * (1 + margin)
        height = (maxy - miny) * (1 + margin)
        # Calculate the new minimum x and y coordinates
        minx -= width * margin / 2
        miny -= height * margin / 2

        # Create a new image with the same aspect ratio as the bounding box
        image_width = 800
        image_height = int(image_width * height / width)
        test = Image.new("1", (image_width, image_height), 0)

        # Convert the coordinates to a numpy array
        coords = np.array(list(polygon.exterior.coords))
        coords -= [minx, miny]
        coords *= [image_width / width, image_height / height]
        coords[:, 1] = image_height - coords[:, 1]
        # Convert the coordinates back to a list of tuples
        scaled_coords = list(map(tuple, coords))

        # Draw the scaled polygon onto the image
        ImageDraw.Draw(test).polygon(scaled_coords, outline=1, fill=1)

        wordcloud = WordCloud(
            mask=~np.array(test) * 255,
            # color_func=lambda *args, **kwargs: breed_color,
            include_numbers=True,
            margin=20,
            # contour_color=breed_color,
            contour_width=5,
            width=800,
            height=500,
            background_color="white",
        ).generate(text)
        return hv.RGB(np.array(wordcloud)).opts(
            width=800,
            height=500,
            tools=["box_zoom"],
            active_tools=["box_zoom"],
            xaxis=None,
            yaxis=None,
            backend_opts={"toolbar.autohide": True},
        )


district_layout = pn.Column(
    pn.pane.HoloViews(display_wordcloud),
    # pn.bind(display_wordcloud, select_district.param.index),
    pn.Row(
        neighborhood_poly * district_poly,
        pn.panel(display_info, width=300),
        # pn.bind(display_info, select_district.param.index),
    ),
    sizing_mode="stretch_width",
)

district_layout_card = pn.Card(
    district_layout,
    title="District Descript",
    sizing_mode="stretch_width",
)
district_layout_card

BokehModel(combine_events=True, render_bundle={'docs_json': {'80f954a7-6d71-49b2-a471-4b24b484163c': {'version…

In [43]:

# A single row from the dog data
(
    dog_data_train.describe(include="all")
    .T.infer_objects()
    .sort_values(by="unique")
    .fillna("")
)
dog_data_train.sample().T

Unnamed: 0,22315
roster,2018
owner_id,086923
dog_size,I
dog_age,13
age_group_10,50
age_group_20,40
mixed_type,BU
is_pure_breed,False
is_male_owner,False
is_male_dog,False


In [45]:
def update_xaxis(plot, element):
    """Hook to update the x-axis ticker on the plot."""
    plot.state.xaxis.ticker = FixedTicker(ticks=list(range(2015, 2023)))


dogs_total_by_roster = dog_data_train.groupby("roster").size()
print(f"Total number of dogs per year:\n{dogs_total_by_roster}")

total_dogs_line = dogs_total_by_roster.hvplot.bar().opts(
    show_legend=False,
    title="Total Dogs Registered Each Year",
    active_tools=["box_zoom"],
    # height=500,
    # width=400,
)

dog_count_yoy_pct_change = dogs_total_by_roster.pct_change().fillna(0) * 100
total_dogs_yoy_bar = dog_count_yoy_pct_change.hvplot(kind="line").opts(
    hooks=[update_xaxis],
    active_tools=["box_zoom"],
    title="YOY % Change in Dog Count",
    ylabel="%",
)

(total_dogs_line + total_dogs_yoy_bar).cols(1).opts(shared_axes=False)

Total number of dogs per year:
roster
2015    6974
2016    6925
2017    7152
2018    7397
dtype: int64


In [123]:
scaler.fit_transform(dog_count_by_sub_d_roster)
scaler.fit_transform(dog_count_by_sub_d_roster.pct_change().fillna(0) * 100)

array([[-0.23341569, -0.23717248, -0.25538832, -0.22748386],
       [-0.5194446 , -0.53061546, -0.61620692, -0.4768262 ],
       [ 0.66757537,  0.73881313,  0.87612616,  0.29257304],
       [-0.38631115, -0.39048298, -0.37882626, -0.35130693],
       [ 5.54111795,  5.51390386,  5.3874891 ,  5.58704114],
       [-0.42141993, -0.44085643, -0.50306316, -0.41126465],
       [-0.09410713, -0.07856612,  0.01222782, -0.01372967],
       [-0.0705042 , -0.05700511, -0.14998974, -0.14377792],
       [-0.37607513, -0.38336899, -0.41351797, -0.35519509],
       [-0.08780098, -0.11966251, -0.08241795, -0.08489972],
       [-0.47506365, -0.48773607, -0.57090234, -0.45233937],
       [ 0.41055284,  0.50966868,  0.77326121,  0.55996613],
       [-0.29056101, -0.30853902, -0.34807981, -0.3004743 ],
       [-0.33955037, -0.33974713, -0.34876927, -0.32921643],
       [-0.22317716, -0.19167606, -0.20254027, -0.09244493],
       [ 0.27623582,  0.24426232,  0.27649876,  0.08069801],
       [-0.27744544, -0.

In [132]:
scaler = StandardScaler()


dog_count_by_sub_d_roster = (
    dog_data_train.groupby(["roster", "sub_district"], as_index=False)
    .size()
    .pivot(index="sub_district", columns="roster", values="size")
)

dog_count_df_std = pd.DataFrame(
    scaler.fit_transform(dog_count_by_sub_d_roster),
    columns=dog_count_by_sub_d_roster.columns,
    index=dog_count_by_sub_d_roster.index,
)
dog_count_pct_change_std = pd.DataFrame(
    scaler.fit_transform(
        dog_count_by_sub_d_roster.pct_change(axis=1).fillna(0) * 100),
    columns=dog_count_by_sub_d_roster.columns,
    index=dog_count_by_sub_d_roster.index,
)
dog_count_df_std.unstack().reset_index(name="count_std").merge(
    dog_count_pct_change_std.unstack().reset_index(name="pct_change_std")
).hvplot.scatter(
    by="roster",
    y="count_std",
    x="pct_change_std",
    height=600,
    width=600,
    xlim=(-3, 3),
    ylim=(-3, 3),
) * hv.VLine(
    0
).opts(
    color="lightgray", line_dash="dashed"
) * hv.HLine(
    0
).opts(
    color="lightgray", line_dash="dashed"
)

In [None]:




dog_count_pct_change_long = (
    dog_count_by_sub_d_roster.pct_change(axis=1).unstack()
    # .dropna()
    .reset_index(name="pct_change")
)
dog_count_long = dog_count_by_sub_d_roster.unstack().reset_index(name="count").dropna()

dog_count_std = scaler.fit_transform(
    dog_count_by_sub_d_roster.unstack().values.reshape(-1, 1)
)


dog_count_long["count_std"] = dog_count_std

(
    dog_count_long.merge(dog_count_pct_change_long).hvplot.scatter(
        by="roster",
        y="count_std",
        x="pct_change",
        height=600,
        width=600,
    )
    * hv.VLine(0).opts(color="lightgray", line_dash="dashed")
    * hv.HLine(0).opts(color="lightgray", line_dash="dashed")
)


# dog_count_long.merge(dog_count_pct_change_long).hvplot.scatter(
#     by="roster",
#     y="count_std",
#     x="pct_change",
# ) * hv.Labels(
#     dog_count_long.merge(dog_count_pct_change_long),
#     ["pct_change", "count_std"],
#     ["sub_district"],
# ).opts(
#     text_font_size="8pt",
# )

In [None]:
('cleaned_data.csv')

# Display the first few rows of the dataframe
print(df.head())

# Display the summary statistics of the dataframe
print(df.describe())

# Display the information of the dataframe
print(df.info())

# Plotting the distribution of numerical variables
df.hist(bins=50, figsize=(20,15))
plt.show()

# Checking for correlation among variables
corr_matrix = df.corr()
print(corr_matrix)

# Visualizing the correlation matrix using a heatmap
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

# Boxplot for understanding the distributions and to observe the outliers
numerical_features = df.select_dtypes(include=[np.number]).columns
for i in numerical_features:
    plt.figure(figsize=(8,6))
    sns.boxplot(df[i])
    plt.show()

# Countplot for understanding the distribution of categorical variables
categorical_features = df.select_dtypes(include=[np.object]).columns
for i in categorical_features:
    plt.figure(figsize=(8,6))
    sns.countplot(df[i])
    plt.show()
```

# Feature Selection
Select the most relevant features for the model.

In [None]:
```python
# Import necessary libraries for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Separate the target variable and rest of the variables using column locations
X = df.iloc[:,0:20]  #independent columns
y = df.iloc[:,-1]    #target column

# Apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)

# Get the scores for each feature
dfscores = pd.DataFrame(fit.scores_)

# Get the column names
dfcolumns = pd.DataFrame(X.columns)

# Concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

# Print the top 10 best features
print(featureScores.nlargest(10,'Score'))  
```

# Model Selection
Choose the appropriate model for the task.

In [None]:
```python
# Import necessary libraries for model selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Define a function to select the best model
def select_best_model(X, y):
    # Split the dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the models
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('SVM', SVC()),
        ('KNN', KNeighborsClassifier())
    ]

    # Iterate over the models and calculate the accuracy
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{name} Accuracy: {accuracy * 100}%')

# Call the function
select_best_model(X, y)
```

# Model Training
Train the selected model using the cleaned and processed data.

In [None]:
```python
# Import necessary libraries for model training
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Assuming that the best model selected was Logistic Regression
model = LogisticRegression()

# Split the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict the test set results
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))
```

# Model Evaluation
Evaluate the performance of the trained model.

In [None]:
```python
# Import necessary libraries for model evaluation
from sklearn.metrics import roc_curve, auc

# Calculate the probabilities of the predictions
y_pred_proba = model.predict_proba(X_test)[::,1]

# Calculate the false positive rate, true positive rate and thresholds
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Calculate the Area Under the Curve (AUC)
auc = auc(fpr, tpr)

# Plot the ROC curve
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
```

# Model Optimization
Optimize the model parameters for better performance.

In [None]:
```python
# Import necessary libraries for model optimization
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 2500, 5000]
}

# Create a GridSearchCV object
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=3)

# Fit the model with the combinations of different parameters
grid.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print(grid.best_params_)

# Print the best score found by GridSearchCV
print(grid.best_score_)

# Use the best estimator for prediction on test set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

# Calculate the probabilities of the predictions
y_pred_proba = best_model.predict_proba(X_test)[::,1]

# Calculate the false positive rate, true positive rate and thresholds
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Calculate the Area Under the Curve (AUC)
auc = auc(fpr, tpr)

# Plot the ROC curve
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
```

# Model Deployment
Deploy the trained and optimized model for predictions.

In [None]:
```python
# Import necessary libraries for model deployment
import pickle

# Save the trained model as a pickle string.
saved_model = pickle.dumps(best_model)

# Load the pickled model
model_from_pickle = pickle.loads(saved_model)

# Use the loaded pickled model to make predictions
model_from_pickle.predict(X_test)
```