In [59]:

from pathlib import Path

import geopandas as gpd
import pandas as pd

# Root folder holding every input dataset. All files are resolved from this
# single location so the cell no longer mixes absolute paths with paths that
# depend on the kernel's current working directory.
DATA_DIR = Path('/workspaces/Datathon_2024/data')

# Load GeoJSON data
small_areas_gdf = gpd.read_file(DATA_DIR / 'smasvaedi' / 'smasvaedi_2021.json')
city_lane_gdf = gpd.read_file(DATA_DIR / 'geojson_files' / 'cityline_2025.geojson')

# Load CSV data
employed_df = pd.read_csv(DATA_DIR / 'num_of_people_working' / 'fjoldi_starfandi.csv')
income_decile_df = pd.read_csv(DATA_DIR / 'income_data' / 'tekjutiundir.csv')
population_df = pd.read_csv(DATA_DIR / 'num_of_residents' / 'ibuafjoldi.csv')
dwellings_df = pd.read_csv(DATA_DIR / 'dwellings' / 'ibudir.csv')
# Original author's note: waiting for new data for this file.
construction_sites_df = pd.read_csv(DATA_DIR / 'hms_lodir' / 'lodir.csv')



In [60]:
# Preview the first rows of every loaded dataset: the two GeoJSON layers
# first, then the CSV tables, in the order they were loaded.
for frame in (
    small_areas_gdf,
    city_lane_gdf,
    employed_df,
    income_decile_df,
    population_df,
    dwellings_df,
    construction_sites_df,
):
    print(frame.head())

# Check data types and non-null counts.
# `.info()` writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line
# after each report.
small_areas_gdf.info()
employed_df.info()


                   id  fid        dataset nuts3       nuts3_label  \
0  smasvaedi_2021.126    1  Smásvæði 2021   001  Höfuðborgarsvæði   
1  smasvaedi_2021.127    2  Smásvæði 2021   001  Höfuðborgarsvæði   
2  smasvaedi_2021.128    3  Smásvæði 2021   001  Höfuðborgarsvæði   
3  smasvaedi_2021.129    4  Smásvæði 2021   001  Höfuðborgarsvæði   
4  smasvaedi_2021.287  161  Smásvæði 2021   002        Landsbyggð   

  nuts3_label_short  nuts3_label_en  hxsv  hxsv_label hxsv_label_short  ...  \
0  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
1  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
2  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
3  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
4        Landsbyggð   Other regions  0021  Suðursvæði       Suðursvæði  ...   

  tlsv                   tlsv_label            tlsv_label_short  \
0   01  Reykjavík: Vesturbær norður        

In [61]:
# Show the distinct values found in the 'svfnr' and 'smasvaedi' columns of
# dwellings_df, one array per column.
for code_column in ('svfnr', 'smasvaedi'):
    print(dwellings_df[code_column].unique())

[   0. 1400. 1300. 1000. 1100. 1606. 1604.]
[  1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.  28.
  29.  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.  42.
  43.  44.  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.  56.
  57.  58.  59.  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.  70.
  71.  72.  73.  74.  75.  76.  77.  78.  79.  80.  81.  82.  83.  84.
  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.  98.
  99. 158. 166. 167. 168. 169. 170. 171. 172. 173. 174. 175. 176. 177.
 178. 179. 180. 181. 182. 183. 184. 185. 186. 192. 193. 194. 195. 196.
 197. 198. 199. 200. 202. 204.]


In [62]:
# Turn the 'smasvaedi' code into a plain digit string. The column shows up as
# floats (1.0, 2.0, ...) when inspected, so its string form ends in '.0'.
# The pattern is anchored to the end of the value so that ONLY that suffix is
# stripped; an unanchored literal replace would also delete a '.0' that occurs
# in the middle of a value (e.g. '10.05' -> '105').
dwellings_df['smasvaedi'] = (
    dwellings_df['smasvaedi'].astype(str).str.replace(r'\.0$', '', regex=True)
)

# Sum 'Fjöldi' per 'smasvaedi', using only the rows whose 'framvinda' is
# "Fullbúið".
all_dwellings = (
    dwellings_df.loc[dwellings_df['framvinda'] == "Fullbúið"]
    .groupby('smasvaedi')['Fjöldi']
    .sum()
    .reset_index()
)

# The join key on the GeoDataFrame side must be a string as well, otherwise
# it cannot match the string codes built above.
small_areas_gdf['fid'] = small_areas_gdf['fid'].astype(str)

# Left join: every row of small_areas_gdf is kept, whether or not a dwelling
# total exists for it.
all_dwellings_smallarea = pd.merge(
    small_areas_gdf,
    all_dwellings,
    left_on='fid',
    right_on='smasvaedi',
    how='left',
)

# Rows without a match come out of the join with a missing 'smasvaedi'; fall
# back to the row's own 'fid' so the column is populated everywhere, and keep
# it as a string for consistency with the other string identifiers.
all_dwellings_smallarea['smasvaedi'] = (
    all_dwellings_smallarea['smasvaedi']
    .fillna(all_dwellings_smallarea['fid'])
    .astype(str)
)

# Inspect the merged dataset
print(all_dwellings_smallarea.head())




                   id  fid        dataset nuts3       nuts3_label  \
0  smasvaedi_2021.126    1  Smásvæði 2021   001  Höfuðborgarsvæði   
1  smasvaedi_2021.127    2  Smásvæði 2021   001  Höfuðborgarsvæði   
2  smasvaedi_2021.128    3  Smásvæði 2021   001  Höfuðborgarsvæði   
3  smasvaedi_2021.129    4  Smásvæði 2021   001  Höfuðborgarsvæði   
4  smasvaedi_2021.287  161  Smásvæði 2021   002        Landsbyggð   

  nuts3_label_short  nuts3_label_en  hxsv  hxsv_label hxsv_label_short  ...  \
0  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
1  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
2  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
3  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
4        Landsbyggð   Other regions  0021  Suðursvæði       Suðursvæði  ...   

             tlsv_label_short                  tlsv_label_en  smsv  \
0            Rvk: Vesturbær n    Reykjav

In [63]:
# Print the first ten distinct join keys from each side of the merge so their
# formats can be compared by eye.
fid_keys = small_areas_gdf['fid'].unique()
print("Unique values in small_areas_gdf['fid']: ", fid_keys[:10])

smasvaedi_keys = all_dwellings['smasvaedi'].unique()
print("Unique values in all_dwellings['smasvaedi']: ", smasvaedi_keys[:10])

Unique values in small_areas_gdf['fid']:  ['1' '2' '3' '4' '161' '162' '163' '5' '6' '7']
Unique values in all_dwellings['smasvaedi']:  ['1' '10' '11' '12' '13' '14' '15' '158' '16' '166']


In [81]:

def _normalize_area_code(codes, width=4):
    """Return `codes` as strings left-padded with zeros to `width` characters.

    Parameters
    ----------
    codes : pandas.Series
        Identifier column; values are converted with ``astype(str)``.
    width : int, default 4
        Minimum length of each resulting string.

    Returns
    -------
    pandas.Series
        String codes of at least `width` characters.

    A trailing '.0' is removed before padding. Without that step a column
    that was read as floats would be padded to values such as '01.0' instead
    of '0001'. Strings that do not end in '.0' are padded exactly as a plain
    ``astype(str).str.zfill(width)`` would pad them.
    """
    return (
        codes.astype(str)
        .str.replace(r'\.0$', '', regex=True)
        .str.zfill(width)
    )


# Bring the 'smasvaedi' identifier to the same 4-character, zero-padded form
# in every frame that carries it. On small_areas_gdf the code is derived from
# 'fid' and stored in a new 'smasvaedi' column.
all_dwellings['smasvaedi'] = _normalize_area_code(all_dwellings['smasvaedi'])
small_areas_gdf['smasvaedi'] = _normalize_area_code(small_areas_gdf['fid'])
income_decile_df['smasvaedi'] = _normalize_area_code(income_decile_df['smasvaedi'])
population_df['smasvaedi'] = _normalize_area_code(population_df['smasvaedi'])

# NOTE(review): all_dwellings_smallarea was produced by a merge in an earlier
# cell, so its own 'smasvaedi' column is not re-formatted by the lines above.
# Confirm whether that is intended.

# Replace missing dwelling counts with the median of the available counts.
median_fjoldi = all_dwellings_smallarea['Fjöldi'].median()
all_dwellings_smallarea['Fjöldi'] = all_dwellings_smallarea['Fjöldi'].fillna(median_fjoldi)


In [82]:
# Show the first five rows of the merged small-area / dwellings frame.
print(all_dwellings_smallarea.head(n=5))

                   id  fid        dataset nuts3       nuts3_label  \
0  smasvaedi_2021.126    1  Smásvæði 2021   001  Höfuðborgarsvæði   
1  smasvaedi_2021.127    2  Smásvæði 2021   001  Höfuðborgarsvæði   
2  smasvaedi_2021.128    3  Smásvæði 2021   001  Höfuðborgarsvæði   
3  smasvaedi_2021.129    4  Smásvæði 2021   001  Höfuðborgarsvæði   
4  smasvaedi_2021.287  161  Smásvæði 2021   002        Landsbyggð   

  nuts3_label_short  nuts3_label_en  hxsv  hxsv_label hxsv_label_short  ...  \
0  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
1  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
2  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
3  Höfuðborgarsvæði  Capital Region  0011   Reykjavík        Reykjavík  ...   
4        Landsbyggð   Other regions  0021  Suðursvæði       Suðursvæði  ...   

             tlsv_label_short                  tlsv_label_en  smsv  \
0            Rvk: Vesturbær n    Reykjav

Fill the missing values in `Fjöldi` with 0 to highlight which areas are missing dwelling data.

In [None]:
# Fill NaN values in 'Fjöldi' with 0.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selected with df[...] is chained assignment, which pandas
# deprecates and which does not modify the parent frame when Copy-on-Write is
# enabled.
# NOTE(review): an earlier cell already fills missing 'Fjöldi' values with the
# median, in which case nothing is left to fill here. Confirm which of the two
# imputations is intended.
all_dwellings_smallarea['Fjöldi'] = all_dwellings_smallarea['Fjöldi'].fillna(0)

In [None]:
# Give the identifier column the same name, 'smsv', in every CSV-backed frame
# so that it matches the GeoJSON layer. Four frames call it 'smasvaedi'; the
# construction-sites frame calls it 'Smásvæði'.
for csv_frame in (employed_df, income_decile_df, population_df, dwellings_df):
    csv_frame.rename(columns={'smasvaedi': 'smsv'}, inplace=True)
construction_sites_df.rename(columns={'Smásvæði': 'smsv'}, inplace=True)

# Cast the identifier to str in the GeoDataFrame and in each renamed frame.
# construction_sites_df is deliberately left out: the original author marked
# that cast as currently not useful.
for keyed_frame in (
    small_areas_gdf,
    employed_df,
    income_decile_df,
    population_df,
    dwellings_df,
):
    keyed_frame['smsv'] = keyed_frame['smsv'].astype(str)


In [None]:
# Disabled draft: everything below is a single triple-quoted string literal,
# so none of the statements inside it are executed.
# The draft zero-pads the 'smasvaedi' codes and 'fid' to four characters
# before aggregating the "Fullbúið" rows and left-merging them onto
# small_areas_gdf.
# NOTE(review): the preceding cell renames 'smasvaedi' to 'smsv' on
# population_df and dwellings_df, so this draft would need its column names
# updated before it could be re-enabled.
"""population_df['smasvaedi'] = population_df['smasvaedi'].astype(str).str.zfill(4)
dwellings_df['smasvaedi'] = dwellings_df['smasvaedi'].astype(str).str.zfill(4)
all_dwellings = dwellings_df[dwellings_df['framvinda'] == "Fullbúið"].groupby('smasvaedi')['Fjöldi'].sum().reset_index()


# Ensure that both 'fid' and 'smasvaedi' are strings and zero-padded to match each other.
# Convert 'fid' to string and pad with zeros if necessary to match the format of 'smasvaedi'.
small_areas_gdf['fid'] = small_areas_gdf['fid'].astype(str).str.zfill(4)

# Now you can proceed with merging
all_dwellings_smallarea = pd.merge(
    small_areas_gdf, 
    all_dwellings, 
    left_on='fid', 
    right_on='smasvaedi', 
    how='left'
)

# Inspect the result to verify the merge
print(all_dwellings_smallarea.head()) """