In [None]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from scripts.utils import split_column_at
from scripts.accent_cleaner import AccentCleaner
#from deep_translator import GoogleTranslator

In [None]:
# Load the raw schools layer (the path is relative to the current working
# directory).
gdf = gpd.read_file('../data/raw/spanish_schools.geojson')

# Expose the coordinates as plain numeric columns.
# NOTE: `.x` / `.y` are only defined for Point geometries, so this presumes
# every feature in the file is a point.
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Build the working table in a single non-mutating chain: drop the geometry
# and the columns we do not use, then switch to English snake_case names.
# "Municipio" is dropped because we don't have the municipality code, and it
# may not be needed since distances can be computed from the coordinates.
df_schools = (
    gdf
    .drop(columns=["geometry", "Direccion", "CodigoPostal", "Fuente", "Municipio"])
    .rename(columns={
        "OBJECTID": "school_id",
        "Gestion": "public_private",
        "Nombre": "name",
        "Tipo": "type",
    })
)

df_schools.head()

In [None]:
# Distinct values of the public_private column.
ownership_labels = df_schools["public_private"].unique()
ownership_labels

In [None]:
# Show the table (the notebook renders a cell's last expression).
df_schools

In [None]:
# Spanish ownership label -> English equivalent.
translation_map = dict([
    ('Público', 'Public'),
    ('Concertado', 'Charter'),
    ('Privado', 'Private'),
])

# Swap every mapped label; values absent from the map are left untouched.
translated_ownership = df_schools["public_private"].replace(translation_map)
df_schools["public_private"] = translated_ownership

# Print the resulting labels as a sanity check.
print(df_schools["public_private"].unique())


In [None]:
# Distinct values of the "type" column.
school_types = df_schools["type"].unique()
school_types


In [None]:
# Normalise casing: first character upper-case, the rest lower-case.
capitalized_types = df_schools['type'].str.capitalize()
df_schools['type'] = capitalized_types

In [None]:
# Show the table again after the "type" values were capitalized.
df_schools

In [None]:
# Persist the cleaned table. The target folder is created first so the export
# does not fail in case it does not exist yet.
output_path = "../data/processed/filtered_schools.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_schools.to_csv(output_path, index=False)

In [None]:
title = "Schools in Spain"

# One marker per school. px.scatter_map draws on the MapLibre-based `map`
# subplot, so the basemap is selected with `map_style`; `mapbox_*` layout keys
# and `update_geos` target other subplot types and have no effect here.
# No `color` or `size` column is mapped, so colour-scale and marker-size
# options are omitted (they would be ignored).
fig = px.scatter_map(
    df_schools,
    lat="latitude",
    lon="longitude",
    zoom=5,
    map_style="open-street-map",
    title=title,
)

fig.update_layout(
    height=1000,
    width=1000,
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
)

fig.show()

# NOTE: the map suggests there might be missing data in CLM
# (presumably Castilla-La Mancha -- TODO confirm).

In [None]:
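# Optional machine translation of the school types (currently disabled; the
# deep_translator import in the first cell is commented out as well).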

# def translate_text(text):
#     if pd.isna(text):  # Handle NaN values
#         return text
#     return GoogleTranslator(source='auto', target='en').translate(text)

# # Apply translation to unique values
# df_schools["type"] = df_schools["type"].apply(translate_text)

# # Check unique values after translation
# print(df_schools["type"].unique())
