In [12]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Load the raw air-quality measurements from the Excel workbook.
source_workbook = 'data/world_air_quality.xlsx'
df = pd.read_excel(source_workbook)

In [13]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Country Code,City,Location,Coordinates,Pollutant,Source Name,Unit,Value,Last Updated,Country Label
0,DE,Moselle,FR22054,"49.1874042196733, 6.91086524487126",PM10,EEA France,µg/m³,6.6,2023-10-31T02:00:00+00:00,Germany
1,DE,Niedersachsen,DENI051,"51.75816, 10.61248",NO,EEA Germany,µg/m³,0.14721,2023-10-31T04:00:00+00:00,Germany
2,DE,Sachsen,DESN001,"50.570872, 12.997278",NO,EEA Germany,µg/m³,1.0,2023-10-31T04:00:00+00:00,Germany
3,DE,Baden-Württemberg,DEBW052,"47.664361, 9.169289",PM2.5,EEA Germany,µg/m³,2.0,2023-10-31T04:00:00+00:00,Germany
4,DE,Baden-Württemberg,DEBW080,"49.00796, 8.38719",PM10,EEA Germany,µg/m³,4.3,2023-10-31T04:00:00+00:00,Germany


In [14]:
# Data cleaning: inspect the raw frame before transforming it.
null_counts = df.isnull().sum()
print(null_counts)  # number of null values per column
column_types = df.dtypes
print(column_types)  # TODO: convert 'Last Updated' to datetime and 'Coordinates' to geometry

Country Code         0
City             23180
Location             2
Coordinates        219
Pollutant            0
Source Name          0
Unit                 0
Value                0
Last Updated         0
Country Label      115
dtype: int64
Country Code      object
City              object
Location          object
Coordinates       object
Pollutant         object
Source Name       object
Unit              object
Value            float64
Last Updated      object
Country Label     object
dtype: object


In [15]:
# Discard every row whose coordinates are missing; without them no point
# geometry can be built later on.
required_columns = ['Coordinates']
df = df.dropna(axis=0, how='any', subset=required_columns)

In [16]:
# Fill the remaining nulls in the descriptive columns with placeholder values.
#
# The frame is rebound from a single DataFrame.fillna call instead of calling
# fillna(inplace=True) on a column selected with df[...]: that form is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves `df`
# untouched once Copy-on-Write is active.
df = df.fillna({
    'City': 'Não informado',
    'Location': 'DESC000',
    'Country Label': 'Não informado',
})

In [17]:
# Parse the 'Last Updated' column (ISO-8601 strings in the preview above)
# into pandas datetimes.
last_updated = pd.to_datetime(df['Last Updated'])
df['Last Updated'] = last_updated

In [18]:
# Split the "lat, lon" text in 'Coordinates' into numeric Latitude and
# Longitude columns.
def _coordinate_part(text, position):
    """Return the comma-separated field at `position` of `text` as a float."""
    fields = text.split(',')
    return float(fields[position])


df['Latitude'] = df['Coordinates'].apply(lambda text: _coordinate_part(text, 0))
df['Longitude'] = df['Coordinates'].apply(lambda text: _coordinate_part(text, 1))

In [19]:
# Create the geometry column: one point per row, with x = Longitude and
# y = Latitude.
#
# points_from_xy builds every point in a single vectorized call. This replaces
# a Python-level iterrows() loop that wrote one cell at a time through df.loc,
# which is very slow on large frames and only correct when the index labels
# are unique.
df['geometry'] = gpd.points_from_xy(df['Longitude'], df['Latitude'])

In [20]:
# Keep only the columns needed downstream, in export order.
export_columns = [
    'City',
    'Pollutant',
    'Value',
    'Unit',
    'Last Updated',
    'Country Label',
    'Latitude',
    'Longitude',
    'geometry',
]
df = df[export_columns]

In [21]:
# Convert to a GeoDataFrame, using the 'geometry' column as the active geometry.
#
# The CRS is declared explicitly so the layer can be reprojected or combined
# with other layers; without it the frame has no spatial reference at all.
# NOTE(review): EPSG:4326 (WGS 84 lon/lat) is assumed because the source
# coordinates look like decimal degrees — confirm against the data provider.
df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

In [22]:
# Export the processed layer as a GeoJSON file.
output_path = 'data/processado/geo_data.geojson'
df.to_file(output_path, driver='GeoJSON')