In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from shapely.geometry import Point
import json
import plotly.express as px
import geopandas as gpd
import plotly.graph_objects as go

# Filter For Sweden Data Only

In [None]:
# Load the raw stop observations.
df = pd.read_csv('stops_data.csv')

# Tag every row by whether it carries a key: a missing key means 'Unlabeled'.
df['label'] = np.where(df['key'].isna(), 'Unlabeled', 'Labeled')

# Country outline used to keep only the points that lie inside Sweden.
sweden_geojson = 'sweden_geodata/se.json'
sweden = gpd.read_file(sweden_geojson)

# Build the point geometries from lon/lat in one vectorized call (the same
# approach the administrative-region cell further down uses) instead of
# creating one shapely Point per row in a Python-level loop.
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326")

# The spatial join needs both layers in the same CRS; reproject the outline if they differ.
if sweden.crs != gdf.crs:
    sweden = sweden.to_crs(gdf.crs)

# Inner join with predicate 'within' keeps only the points inside the outline.
sweden_gdf = gpd.sjoin(gdf, sweden, how='inner', predicate='within')

# Back to a plain pandas DataFrame without the geometry column.
sweden_df = pd.DataFrame(sweden_gdf.drop(columns='geometry'))

# Persist the filtered rows. 'index_right' is added by the join; 'source', 'id'
# and 'name' are presumably attributes of the outline file (TODO confirm).
sweden_df.drop(['index_right', 'source', 'id', 'name'], axis=1).to_csv('sweden_df.csv', sep=',', index=False, encoding='utf-8')

# Geo Scatter (Map Plot)

In [None]:
# Reload the Sweden-only rows that the filtering cell above wrote to 'sweden_df.csv'.
df = pd.read_csv('sweden_df.csv')

In [None]:
def geo_scatter(df, samplesize, title=None, random_state=None):
    """Plot a random sample of points on an OpenStreetMap background.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; the columns "lat", "lon" and "value" are read.
    samplesize : int
        Number of rows to draw at random. Capped at ``len(df)`` so asking for
        more rows than exist plots everything instead of raising.
    title : str, optional
        Figure title. Defaults to no title.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for a reproducible sample.

    Returns
    -------
    The figure returned by ``px.scatter_mapbox`` (marker size 5, 1000x1000 px).
    """
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (without replacement), so never request more than is available.
    n_rows = min(samplesize, len(df))
    sample = df.sample(n=n_rows, random_state=random_state).copy()

    # Credentials do not belong in the notebook. The 'open-street-map' style
    # used below works without a Mapbox token, so one is only registered when
    # it is supplied through the environment.
    token = os.environ.get('MAPBOX_ACCESS_TOKEN')
    if token:
        px.set_mapbox_access_token(token)

    fig = px.scatter_mapbox(
        sample,
        lat="lat",
        lon="lon",
        color="value",  # can also color by "label"
        zoom=4,
        mapbox_style='open-street-map',
        title=title,
    )
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(width=1000, height=1000)

    return fig

# Unlabeled vs Labeled Pie Chart & Label Counts of 'value'

In [None]:
# Count the rows per label once, so slice sizes and slice names come from the same Series.
pie_counts = df['label'].value_counts()
total = pie_counts.values.sum()

def fmt(x):
    """Format a pie slice as its percentage plus the absolute row count it represents."""
    return '{:.1f}%\n{:.0f}'.format(x, total*x/100)

fig = plt.figure(figsize=(7, 5))
# Name the slices with the index of value_counts(): it is ordered by frequency,
# whereas unique() is ordered by first appearance, so pairing the two could
# attach a label to the wrong slice.
plt.pie(pie_counts, labels=pie_counts.index, colors=['lightblue','darksalmon'], autopct=fmt)

# show plot
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows carry each distinct entry of the 'value' column.
plt.figure(figsize=(5, 4))
df['value'].value_counts().plot.bar()
plt.title('Label counts of value')
plt.ylabel('Count')
plt.xlabel('value', size=12)
# keep the category names horizontal
plt.xticks(rotation=0)
# To export the figure, call plt.tight_layout() and plt.savefig('LabelCounts.png') before showing it.
plt.show()

# Choropleth of the Unlabeled/Labeled Ratio for Swedish Administrative Regions

In [None]:
# Read the administrative-region layer used by the per-region join and choropleth below.
admin_areas = gpd.read_file('sweden_geodata/se_adminregions.json')

Source of the administrative-region data: https://simplemaps.com/gis/country/se#admin1

In [None]:
# Turn the stop rows into point geometries (lon/lat, EPSG:4326).
gdf_points = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat), crs='EPSG:4326')

# ensure administrative areas are in the same CRS as points
admin_areas = admin_areas.to_crs('EPSG:4326')

# assign each point to the administrative area that contains it
gdf_joined = gpd.sjoin(gdf_points, admin_areas, how='inner', predicate='within')

# Per-area counts of each label. The reindex guarantees that both columns exist
# (filled with 0) even when one label never occurs after the join, so the ratio
# below cannot fail with a KeyError.
label_counts = (
    gdf_joined.groupby('name')['label']
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=['Labeled', 'Unlabeled'], fill_value=0)
)
# share of unlabeled points among all points that fall in the area
label_counts['ratio'] = label_counts['Unlabeled'] / (label_counts['Unlabeled'] + label_counts['Labeled'])
label_counts = label_counts.reset_index()

# Left merge back onto the areas: areas without any point keep NaN counts and ratio.
admin_areas_ratios = admin_areas.merge(label_counts, on='name', how='left')

# Features are matched to rows through the positional index of admin_areas_ratios.
fig = px.choropleth_mapbox(admin_areas_ratios, geojson=admin_areas_ratios.geometry.__geo_interface__,
                           locations=admin_areas_ratios.index, color='ratio',
                           color_continuous_scale="thermal", hover_name="name",
                           range_color=(0, admin_areas_ratios['ratio'].max()),
                           mapbox_style="carto-positron",
                           zoom=5, center={"lat": 60.128161, "lon": 18.643501},  # Center on Sweden
                           opacity=0.5)
fig.update_layout(width=500, height=900)
fig.show()

# Sweden Administrative Regions and Unlabeled Ratio Correlation

In [None]:
from io import StringIO

import requests

# fetch the population table from wikipedia
url = 'https://en.wikipedia.org/wiki/Counties_of_Sweden'
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status stops an HTTP error page from being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
# read_html is given a file-like object: passing the literal HTML string is
# deprecated in recent pandas releases.
tables = pd.read_html(StringIO(response.text))

# NOTE(review): the table is picked by its position on the page (index 4), which
# breaks if the article layout changes; confirm it is still the population table.
table = tables[4].drop(['ISO', 'NUTS', 'CoA', 'Governor', 'Administrative centre'], axis=1)  # pop. data table

# Shorten the population header and name the county column 'name' so it can be
# merged with the administrative-area frame.
table = table.rename(columns={'Population (2021-12-31)[1]': 'Population', 'County (Län)': 'name'})

In [None]:
# Combine the per-region label ratios with the county statistics; the inner
# merge keeps only regions whose 'name' appears in both frames.
counties = pd.merge(admin_areas_ratios, table, on='name', how='inner').drop(['source'], axis=1)

# NOTE(review): no visible cell creates 'centroid_lon' / 'centroid_lat'. If the
# region file does not already provide them, derive them from the region
# geometries so the cell runs on a fresh kernel. Confirm this matches how the
# columns were originally produced.
if 'centroid_lon' not in counties.columns or 'centroid_lat' not in counties.columns:
    region_centroids = counties.geometry.centroid
    counties['centroid_lon'] = region_centroids.x
    counties['centroid_lat'] = region_centroids.y

# Select the metrics from `counties`. The previous version indexed `corr_cols`
# itself before it was ever assigned, which raises NameError on a fresh kernel.
corr_cols = counties[['ratio', 'Area (km2)', 'Population', 'Density',
       'centroid_lon', 'centroid_lat']]
# Display names, in the same order as the selected columns.
rename = ['unlabeled ratio', 'area (km2)', 'population', 'pop. density',
       'reg. centroid lon', 'reg. centroid lat']
corr_cols.columns = rename

correlation_matrix = corr_cols.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between the regional metrics.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    linewidths=.5,
    square=True,
)
heatmap_ax.set_title('Correlation Matrix for Swedish Regions')
# tilt the tick labels on both axes by 45 degrees
plt.yticks(rotation=45)
plt.xticks(rotation=45)
plt.show()