In [None]:
%pip install seaborn

In [1]:
import pandas as pd
import geopandas as gpd
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import silhouette_score

In [2]:
# Load the administrative-unit geometries (GeoJSON) into a GeoDataFrame.
gdf = gpd.read_file("data/administrative_units/gradovi_opcine_zupanije.geojson")
# Build the two join keys used by every merge below from the 'text_right' /
# 'text_left' fields, stripping surrounding whitespace from the strings.
gdf['Županija'] = gdf['text_right'].str.strip()
gdf['Grad/općina'] = gdf['text_left'].str.strip()

In [3]:
def create_static_map(df, cluster_colors):
    """Draw ``df`` with per-row colours and show it with a cluster legend.

    Parameters
    ----------
    df : frame with a ``plot`` method and ``'Color'`` / ``'Cluster'`` columns
        Each row is drawn in the colour stored in its ``'Color'`` value.
    cluster_colors : mapping
        Cluster label -> colour; looked up for every label present in
        ``df['Cluster']`` to build the legend patches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    figure, axis = plt.subplots(1, 1, figsize=(12, 12))

    # Colours come straight from the 'Color' column, one per row.
    df.plot(color=df['Color'], ax=axis)

    # One legend patch per cluster label found in the data, ascending order.
    handles = []
    for cluster_label in sorted(df['Cluster'].unique()):
        handles.append(
            mpatches.Patch(
                color=cluster_colors[cluster_label],
                label=f"Cluster {cluster_label}",
            )
        )

    axis.legend(handles=handles, title="Clusters", loc="upper left")

    plt.show()

In [4]:
def determine_number_of_clusters(df, features, k_values=range(2, 9), random_state=42):
    """Plot the elbow (inertia) and silhouette curves for several values of k.

    Parameters
    ----------
    df : DataFrame
        Table holding the feature columns.
    features : list of str
        Columns of ``df`` passed to KMeans.
    k_values : iterable of int, optional
        Cluster counts to evaluate. Defaults to 2..8, the range that was
        previously hard-coded.
    random_state : int, optional
        Seed passed to every KMeans fit so the curves are reproducible.

    Returns
    -------
    None
        The two-panel figure is displayed with ``plt.show()``.
    """
    data = df[features].copy()
    # Materialise once: the values are iterated for fitting and reused as the
    # x-axis of both plots.
    k_values = list(k_values)

    inertia = []
    silhouette_scores = []

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state, n_init=50)
        kmeans.fit(data)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(data, kmeans.labels_))

    # Explicit axes instead of the pyplot state machine.
    fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(8, 4))

    elbow_ax.plot(k_values, inertia, marker='o')
    elbow_ax.set_title('Elbow Method')
    elbow_ax.set_xlabel('Number of clusters (k)')
    elbow_ax.set_ylabel('Inertia')

    silhouette_ax.plot(k_values, silhouette_scores, marker='o')
    silhouette_ax.set_title('Silhouette Score')
    silhouette_ax.set_xlabel('Number of clusters (k)')
    silhouette_ax.set_ylabel('Silhouette Score')

    fig.tight_layout()
    plt.show()

In [5]:
def cluster(df, features, k, random_state=42):
    """Run KMeans on ``df[features]``, draw the cluster map, and return the results.

    Parameters
    ----------
    df : DataFrame
        Input table; it is copied, never modified in place.
    features : list of str
        Columns of ``df`` used as clustering features.
    k : int
        Number of clusters.
    random_state : int, optional
        Seed for KMeans. Fixed by default so repeated runs give the same
        labels and colours (the k-selection helper also fits with seed 42).

    Returns
    -------
    (Styler, DataFrame)
        The styled cluster-centre table (features as rows, clusters as
        columns, six decimals) and a copy of ``df`` with added ``'Cluster'``
        and ``'Color'`` columns.
    """
    data = df[features].copy()
    kmeans = KMeans(n_clusters=k, n_init=50, random_state=random_state)
    kmeans.fit(data)

    # Work on a copy so the caller's frame is left untouched.
    df_copy = df.copy()
    df_copy['Cluster'] = kmeans.labels_

    # Transposed so each feature is a row and each cluster a column.
    cluster_centers_df = pd.DataFrame(kmeans.cluster_centers_, columns=features).T
    formatted_df = cluster_centers_df.style.format("{:.6f}")

    # Size the palette by k, not by the number of labels actually present:
    # labels run 0..k-1, so every possible label gets a colour even if some
    # label does not occur in the data.
    palette = sns.color_palette("Set2", n_colors=k)
    cluster_colors = {label: mcolors.to_hex(color) for label, color in enumerate(palette)}
    df_copy['Color'] = df_copy['Cluster'].map(cluster_colors)

    create_static_map(df_copy, cluster_colors)

    return formatted_df, df_copy


IZBORNI REZULTATI

In [None]:
# Election results table.
election_results = pd.read_csv("data/election_results/kombinirani_rezultati.csv")

# Features are selected by position (columns 34..60); presumably the
# per-party percentage columns - confirm against the printed names.
party_percentages = list(election_results.columns[34:61])
print(party_percentages)

In [7]:
# Normalise the join keys: strip surrounding whitespace from 'Županija' and
# derive 'Grad/općina' from the 'Grad/općina/država' column so both key
# columns exist under the same names as in gdf.
election_results['Županija'] = election_results['Županija'].str.strip()  # Strip any extra spaces
election_results['Grad/općina'] = election_results['Grad/općina/država'].str.strip()

# Inner join: only units present in both the geometry and the results survive.
df = gdf.merge(election_results, on=['Županija', 'Grad/općina'], how='inner')

In [None]:
# Elbow and silhouette curves (k = 2..8) for the election feature columns.
determine_number_of_clusters(df, party_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm);
# the call also draws the map. The last line displays the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, party_percentages, 4)
formatted_cluster_table

BRAČNI STATUS

In [None]:
# Marital-status census table.
census_df = pd.read_csv("data/census/bračni_status.csv")

# Features are taken by position (columns 16..25); presumably the percentage
# columns - confirm against the printed names.
census_percentages = list(census_df.columns[16:26])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Starost' is 'Ukupno'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Starost'] == 'Ukupno']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=3 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 3)
formatted_cluster_table

BROJ ŽIVOROĐENE DJECE

In [None]:
# Number-of-live-born-children census table.
census_df = pd.read_csv("data/census/broj_živorođene_djece.csv")

# Features by position: column 17 plus columns 19..29.
census_percentages = list(census_df.columns[[17, *range(19, 30)]])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Starost' is 'Ukupno'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Starost'] == 'Ukupno']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=3 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 3)
formatted_cluster_table

DOB

In [None]:
# Age census table.
census_df = pd.read_csv("data/census/dob.csv")

# Features by position: columns 14, 18, 23 and 25.
census_percentages = list(census_df.columns[[14, 18, 23, 25]])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.'
# (presumably the both-sexes total - confirm).
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Spol'] == 'sv.']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

DRŽAVLJANSTVO

In [None]:
# Citizenship census table.
census_df = pd.read_csv("data/census/državljanstvo.csv")

# Derived share: ('Hrvatsko (ukupno)' - 'Hrvatsko i drugo') as a percentage
# of 'Ukupno'. The new column is appended after the existing ones, so the
# positional selection below is unaffected.
census_df['Samo Hrvatsko %'] = (
    (census_df['Hrvatsko (ukupno)'] - census_df['Hrvatsko i drugo'])
    / census_df['Ukupno']
    * 100
)

# Features: columns 7, 9 and 11 by position, plus the derived share.
census_percentages = list(census_df.columns[[7, 9, 11]]) + ['Samo Hrvatsko %']
print(census_percentages)

df = gdf.merge(census_df, on=['Županija', 'Grad/općina'], how='inner')

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

EKONOMSKA AKTIVNOST

In [None]:
# Economic-activity census table.
census_df = pd.read_csv("data/census/ekonomska_aktivnost.csv")

# Features by position: columns 16, 17 and 19..24.
census_percentages = list(census_df.columns[[16, 17, *range(19, 25)]])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.' and
# 'Starost' is 'Ukupno'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: (frame['Spol'] == 'sv.') & (frame['Starost'] == 'Ukupno')]
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=5 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 5)
formatted_cluster_table

MATERINSKI JEZIK

In [None]:
# Mother-tongue census table.
census_df = pd.read_csv("data/census/materinski_jezik.csv")

# Features: every column among positions 5..55 whose name ends with '%'.
census_percentages = [name for name in census_df.columns[5:56] if name.endswith('%')]
print(census_percentages)

df = gdf.merge(census_df, on=['Županija', 'Grad/općina'], how='inner')

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=5 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 5)
formatted_cluster_table

MIGRACIJSKA OBILJEŽJA

In [None]:
# Migration-characteristics census table.
census_df = pd.read_csv("data/census/migracijska_obilježja.csv")

# Derived share: the two source columns summed, as a percentage of
# 'Ukupan broj stanovnika'. Appended after the existing columns, so the
# positional selection below is unaffected.
census_df['Od rođenja u istom gradu/općini %'] = (
    (
        census_df['Od rođenja stanuju u istom naselju']
        + census_df['Iz drugog naselja istoga grada ili općine1)']
    )
    / census_df['Ukupan broj stanovnika']
    * 100
)

# Features: '%'-suffixed columns among positions 20..21 and 23..29, plus the
# derived share.
candidate_positions = [*range(20, 22), *range(23, 30)]
census_percentages = [
    name for name in census_df.columns[candidate_positions] if name.endswith('%')
] + ['Od rođenja u istom gradu/općini %']
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Spol'] == 'sv.']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

MJESTO ROĐENJA

In [None]:
# Place-of-birth census table.
census_df = pd.read_csv("data/census/mjesto_rođenja_stanovanja.csv")

# Derived share: the two source columns summed, as a percentage of
# 'Ukupan broj stanovnika'. Appended after the existing columns, so the
# positional selection below is unaffected.
census_df['Rođeni u gradu/općini stanovanja %'] = (
    (
        census_df['U mjestu stanovanja']
        + census_df['U drugom naselju istoga grada/općine2)']
    )
    / census_df['Ukupan broj stanovnika']
    * 100
)

# Features: '%'-suffixed columns among positions 22..24 and 26..32, plus the
# derived share.
candidate_positions = [*range(22, 25), *range(26, 33)]
census_percentages = [
    name for name in census_df.columns[candidate_positions] if name.endswith('%')
] + ['Rođeni u gradu/općini stanovanja %']
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Spol'] == 'sv.']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=3 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 3)
formatted_cluster_table

NARODNOST

In [None]:
# Ethnicity census table.
census_df = pd.read_csv("data/census/narodnost.csv")

# Features: every column among positions 5..61 whose name ends with '%'.
census_percentages = [name for name in census_df.columns[5:62] if name.endswith('%')]
print(census_percentages)

df = gdf.merge(census_df, on=['Županija', 'Grad/općina'], how='inner')

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

PODRUČJE DJELATNOSTI

In [None]:
# Sector-of-activity census table.
census_df = pd.read_csv("data/census/područje_djelatnosti.csv")

# Features: every column whose name ends with '%'.
census_percentages = [name for name in census_df.columns if name.endswith('%')]
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Starost' is 'Ukupno'
# and 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: (frame['Starost'] == 'Ukupno') & (frame['Spol'] == 'sv.')]
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

POHAĐANJE ŠKOLE

In [None]:
# School-attendance census table.
census_df = pd.read_csv("data/census/pohađanje_škole.csv")

# Features by position: 22, 24, 25, 27..30, 32, 33 and 35..39.
feature_positions = [22, 24, 25, *range(27, 31), 32, 33, *range(35, 40)]
census_percentages = list(census_df.columns[feature_positions])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Spol'] == 'sv.']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

In [None]:
# Same school-attendance table as the previous analysis, re-read from disk;
# this run uses a narrower feature set.
census_df = pd.read_csv("data/census/pohađanje_škole.csv")

# Features by position: columns 27..30 only.
census_percentages = list(census_df.columns[27:31])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: frame['Spol'] == 'sv.']
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the narrower column set above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

POLOŽAJ U ZAPOSLENJU

In [None]:
# Status-in-employment census table.
census_df = pd.read_csv("data/census/položaj_u_zaposlenju.csv")

# Features by position: column 12 plus columns 14..18.
census_percentages = list(census_df.columns[[12, *range(14, 19)]])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.' and
# 'Starost' is 'Ukupno'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: (frame['Spol'] == 'sv.') & (frame['Starost'] == 'Ukupno')]
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

VJERA

In [None]:
# Religion census table.
census_df = pd.read_csv("data/census/vjera.csv")

# Features: every column among positions 5..27 whose name ends with '%'.
census_percentages = [name for name in census_df.columns[5:28] if name.endswith('%')]
print(census_percentages)

df = gdf.merge(census_df, on=['Županija', 'Grad/općina'], how='inner')

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

ZAPOSLENOST PREMA ZANIMANJU

In [None]:
# Employment-by-occupation census table.
census_df = pd.read_csv("data/census/zaposlenost_prema_zanimanju.csv")

# Features: every column whose name ends with '%'.
census_percentages = [name for name in census_df.columns if name.endswith('%')]
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Starost' is 'Ukupno'
# and 'Spol' is 'sv.'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: (frame['Starost'] == 'Ukupno') & (frame['Spol'] == 'sv.')]
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=5 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 5)
formatted_cluster_table

ZAVRŠENA ŠKOLA

In [None]:
# Completed-education census table.
census_df = pd.read_csv("data/census/završena_škola.csv")

# Features by position: columns 15..19 and 21..24.
census_percentages = list(census_df.columns[[*range(15, 20), *range(21, 25)]])
print(census_percentages)

# Join onto the geometries, then keep only rows where 'Spol' is 'sv.' and
# 'Starost' is 'Ukupno'.
df = (
    gdf
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
    .loc[lambda frame: (frame['Spol'] == 'sv.') & (frame['Starost'] == 'Ukupno')]
)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the columns selected above.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=4 groups (presumably chosen from the curves above - confirm),
# draw the map, then display the cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 4)
formatted_cluster_table

VIŠE PODATAKA IZ POPISA

In [None]:
# Build one wide census table by joining ten census topics on the
# municipality keys, and collect the feature columns of every topic in
# `census_percentages` (same topic order as the joins).
merge_keys = ['Županija', 'Grad/općina']

# Each topic: read the CSV, keep the rows used for the analysis, then pick
# the feature columns (by position or by the '%' name suffix).
bračni_status = pd.read_csv("data/census/bračni_status.csv")
bračni_status = bračni_status[bračni_status['Starost'] == 'Ukupno']
bračni_status_percentages = list(bračni_status.columns[16:26])

br_djece = pd.read_csv("data/census/broj_živorođene_djece.csv")
br_djece = br_djece[br_djece['Starost'] == 'Ukupno']
br_djece_percentages = list(br_djece.columns[[17] + list(range(19, 30))])

dob = pd.read_csv("data/census/dob.csv")
dob = dob[dob['Spol'] == 'sv.']
dob_percentages = list(dob.columns[[14, 18, 23, 25]])

ekonomska_aktivnost = pd.read_csv("data/census/ekonomska_aktivnost.csv")
ekonomska_aktivnost = ekonomska_aktivnost[
    (ekonomska_aktivnost['Spol'] == 'sv.') & (ekonomska_aktivnost['Starost'] == 'Ukupno')
]
ekonomska_aktivnost_percentages = list(
    ekonomska_aktivnost.columns[[16, 17] + list(range(19, 25))]
)

mjesto_rođenja = pd.read_csv("data/census/mjesto_rođenja_stanovanja.csv")
# .copy(): a column is assigned right below; assigning onto a filtered slice
# would raise pandas' SettingWithCopyWarning.
mjesto_rođenja = mjesto_rođenja[mjesto_rođenja['Spol'] == 'sv.'].copy()
mjesto_rođenja['Rođeni u gradu/općini stanovanja %'] = (
    (
        mjesto_rođenja['U mjestu stanovanja']
        + mjesto_rođenja['U drugom naselju istoga grada/općine2)']
    )
    / mjesto_rođenja['Ukupan broj stanovnika']
    * 100
)
mjesto_rođenja_percentages = [
    col
    for col in mjesto_rođenja.columns[list(range(22, 25)) + list(range(26, 33))]
    if col.endswith('%')
]
mjesto_rođenja_percentages.append('Rođeni u gradu/općini stanovanja %')

narodnost = pd.read_csv("data/census/narodnost.csv")
narodnost_percentages = [col for col in narodnost.columns[5:62] if col.endswith('%')]

područje_djelatnosti = pd.read_csv("data/census/područje_djelatnosti.csv")
područje_djelatnosti = područje_djelatnosti[
    (područje_djelatnosti['Starost'] == 'Ukupno') & (područje_djelatnosti['Spol'] == 'sv.')
]
područje_djelatnosti_percentages = [
    col for col in područje_djelatnosti.columns if col.endswith('%')
]

položaj_u_zaposlenju = pd.read_csv("data/census/položaj_u_zaposlenju.csv")
položaj_u_zaposlenju = položaj_u_zaposlenju[
    (položaj_u_zaposlenju['Spol'] == 'sv.') & (položaj_u_zaposlenju['Starost'] == 'Ukupno')
]
položaj_u_zaposlenju_percentages = list(
    položaj_u_zaposlenju.columns[[12] + list(range(14, 19))]
)

vjera = pd.read_csv("data/census/vjera.csv")
vjera_percentages = [col for col in vjera.columns[5:28] if col.endswith('%')]

završena_škola = pd.read_csv("data/census/završena_škola.csv")
završena_škola = završena_škola[
    (završena_škola['Spol'] == 'sv.') & (završena_škola['Starost'] == 'Ukupno')
]
završena_škola_percentages = list(
    završena_škola.columns[list(range(15, 20)) + list(range(21, 25))]
)

# Join the topics one by one. The left side always keeps its column names
# (suffix ''), and only the newly joined table's clashing columns receive the
# topic suffix. The first join previously used a non-empty left suffix
# ('bračni_status'), which renamed left-hand columns unlike every other join.
census_df = bračni_status
census_percentages = list(bračni_status_percentages)
for table, table_percentages, suffix in [
    (br_djece, br_djece_percentages, '_br_djece'),
    (dob, dob_percentages, '_dob'),
    (ekonomska_aktivnost, ekonomska_aktivnost_percentages, '_ekonomska_aktivnost'),
    (mjesto_rođenja, mjesto_rođenja_percentages, '_mjesto_rođenja'),
    (narodnost, narodnost_percentages, '_narodnost'),
    (područje_djelatnosti, područje_djelatnosti_percentages, '_područje_djelatnosti'),
    (položaj_u_zaposlenju, položaj_u_zaposlenju_percentages, '_položaj_u_zaposlenju'),
    (vjera, vjera_percentages, '_vjera'),
    (završena_škola, završena_škola_percentages, '_završena_škola'),
]:
    # Non-key columns present on both sides are the ones merge() will suffix.
    clashing = (set(census_df.columns) & set(table.columns)) - set(merge_keys)
    census_df = census_df.merge(table, on=merge_keys, how='inner', suffixes=('', suffix))
    # Record each feature under the name it actually has after the join.
    # Otherwise a clashing feature name would select the earlier table's
    # column a second time and silently drop this table's column.
    census_percentages += [
        col + suffix if col in clashing else col for col in table_percentages
    ]

df = gdf.merge(census_df, on=merge_keys, how='inner')

print(census_percentages)

In [None]:
# Elbow and silhouette curves (k = 2..8) for the combined census feature set.
determine_number_of_clusters(df, census_percentages)

In [None]:
# Cluster into k=8 groups (the largest k covered by the curves above;
# presumably chosen from them - confirm), draw the map, then display the
# cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, census_percentages, 8)
formatted_cluster_table

PODATCI IZ POPISA + REZULTATI IZBORA

In [None]:
# Re-read the election results and select the feature columns by position
# (columns 34..60).
election_results = pd.read_csv("data/election_results/kombinirani_rezultati.csv")

party_percentages = list(election_results.columns[34:61])
print(party_percentages)

# Normalise the join keys so they match the ones defined on gdf.
election_results['Županija'] = election_results['Županija'].str.strip()
election_results['Grad/općina'] = election_results['Grad/općina/država'].str.strip()

# NOTE: census_df and census_percentages are not defined in this cell; it uses
# whatever the most recently executed cell assigned to those names. Several
# earlier cells assign the same names, so run the combined-census cell
# immediately before this one.
df = (
    gdf
    .merge(election_results, on=['Županija', 'Grad/općina'], how='inner')
    .merge(census_df, on=['Županija', 'Grad/općina'], how='inner')
)
features = party_percentages + census_percentages
print(features)

In [None]:
# Elbow and silhouette curves (k = 2..8) for election + census features.
determine_number_of_clusters(df, features)

In [None]:
# Cluster into k=8 groups (the largest k covered by the curves above;
# presumably chosen from them - confirm), draw the map, then display the
# cluster-centre table.
formatted_cluster_table, clustered_df = cluster(df, features, 8)
formatted_cluster_table