In [73]:
import pandas as pd

# Load the raw listings CSV.
file_path = 'dados/portugal_listings.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)

# Print the column names to verify what the file actually contains.
print("Columns in the DataFrame:", df.columns)

# The columns used below ('Price', 'District', 'City', 'Town') already carry
# their final names, so no rename step is required.

# Coerce Price to numeric; any value that cannot be parsed becomes NaN.
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Keep only rows with a usable price. NaN is deliberately NOT replaced by 0:
# a zero sentinel would pass this filter and skew any price statistics
# computed later on.
df_filtered = df[df['Price'].notna()]

# NOTE(review): the file also yields 'Unnamed: N' columns (see the printed
# column list); they look like empty trailing fields — confirm before dropping.

# Display the cleaned DataFrame
print(df_filtered.head())


Columns in the DataFrame: Index(['Price', 'District', 'City', 'Town', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')
      Price   District                  City  \
0  780000.0  Vila Real              Valpaços   
1  223000.0       Faro  São Brás de Alportel   
2  228000.0       Faro  São Brás de Alportel   
3  250000.0       Faro  São Brás de Alportel   
4  250000.0       Faro  São Brás de Alportel   

                               Town  Unnamed: 4  Unnamed: 5  Unnamed: 6  \
0  Carrazedo de Montenegro e Curros         NaN         NaN         NaN   
1              São Brás de Alportel         NaN         NaN         NaN   
2              São Brás de Alportel         NaN         NaN         NaN   
3              São Brás de Alportel         NaN         NaN         NaN   
4              São Brás de Alportel         NaN         NaN         NaN   

   Unname

In [7]:
import xml.etree.ElementTree as ET

# Read the XML document and take its top-level element.
tree = ET.parse('concelhos.up.xml')
root = tree.getroot()

# One record per <Object> child of the root: every <Property> child adds an
# entry keyed by its "Name" attribute whose value is the element's text.
objects_data = [
    {prop.get("Name"): prop.text for prop in obj.findall("Property")}
    for obj in root.findall("Object")
]

# Each record becomes a row; the property names become the columns.
df = pd.DataFrame(objects_data)

print(df)

               Concelho                       Address    Latitude   Longitude
0              Abrantes            Abrantes, Portugal  39.4630563  -8.1995808
1                Águeda              Águeda, Portugal  40.5754246  -8.4464368
2       Aguiar da Beira     Aguiar da Beira, Portugal   40.816453  -7.5454104
3             Alandroal           Alandroal, Portugal  38.7021331  -7.4036488
4    Albergaria-a-Velha  Albergaria-a-Velha, Portugal  40.6894236  -8.4796655
..                  ...                           ...         ...         ...
300             Vimioso             Vimioso, Portugal  41.5844451  -6.5291211
301             Vinhais             Vinhais, Portugal  41.8307225  -7.0093005
302               Viseu               Viseu, Portugal  40.6565861  -7.9124712
303              Vizela              Vizela, Portugal  41.3764108  -8.3098348
304             Vouzela             Vouzela, Portugal  40.7231807  -8.1120232

[305 rows x 4 columns]


In [63]:
import json
import pandas as pd

# Load the JSON file
file_path = 'dados/densidadealojamentosm2.json'  # Replace with the actual file path
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract the 2021 data: first top-level entry -> 'Dados' -> '2021'.
dados_2021 = data[0]['Dados']['2021']

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(dados_2021)

# Print the column names to verify
print("Columns in the DataFrame:", df.columns)

# Rename the source columns to descriptive names.
df = df.rename(columns={
    'geocod': 'RegionCode',
    'geodsg': 'RegionName',
    'ind_string': 'DensityString',
    'valor': 'DensityValue'  # Ensure this matches the actual column name
})

# Convert DensityValue to numeric. errors='coerce' turns every unparseable
# entry (such as a "-" placeholder) into NaN, so no separate replace step is
# needed; a string replace applied after this conversion could never match.
df['DensityValue'] = pd.to_numeric(df['DensityValue'], errors='coerce')

# Filter out rows with missing or non-applicable data
df_filtered = df[df['DensityValue'].notna()]

# Display the cleaned DataFrame
print(df_filtered.head())

Columns in the DataFrame: Index(['geocod', 'geodsg', 'ind_string', 'valor'], dtype='object')
  RegionCode                                         RegionName DensityString  \
0     070513                                Torre de Coelheiros           2,1   
1     070601                                            Cabrela           2,2   
2     031003                                     Campo do Gerês           2,3   
3     120302                                       Aldeia Velha           2,4   
4     070525  União das freguesias de Nossa Senhora da Toure...           2,5   

   DensityValue  
0           2.1  
1           2.2  
2           2.3  
3           2.4  
4           2.5  


In [57]:
import json
import pandas as pd

# Load the JSON file
file_path = 'dados/densidadePopulacional.json'  # Replace with the actual file path
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract the 2022 data: first top-level entry -> 'Dados' -> '2022'.
dados_2022 = data[0]['Dados']['2022']

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(dados_2022)

# Print the column names to verify
print("Columns in the DataFrame:", df.columns)

# Rename the source columns to descriptive names.
df = df.rename(columns={
    'geocod': 'RegionCode',
    'geodsg': 'RegionName',
    'ind_string': 'DensityString',
    'valor': 'DensityValue'  # Ensure this matches the actual column name
})

# Convert DensityValue to numeric. errors='coerce' turns every unparseable
# entry (such as a "-" placeholder) into NaN, so no separate replace step is
# needed; a string replace applied after this conversion could never match.
df['DensityValue'] = pd.to_numeric(df['DensityValue'], errors='coerce')

# Filter out rows with missing or non-applicable data
df_filtered = df[df['DensityValue'].notna()]

# Display the cleaned DataFrame
print(df_filtered.head())

Columns in the DataFrame: Index(['geocod', 'geodsg', 'ind_string', 'valor'], dtype='object')
  RegionCode     RegionName DensityString  DensityValue
0    1500802       Alcoutim           4,3           4.3
1    1840209        Mértola           4,8           4.8
2    16H0505  Idanha-a-Nova           5,9           5.9
3    1861203           Avis           6,2           6.2
4    1861211       Monforte           7,1           7.1


In [53]:
import json
import pandas as pd

# Load the JSON file
file_path = 'dados/rendasm2.json'  # Replace with the actual file path
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract the 2023 data: first top-level entry -> 'Dados' -> '2023'.
dados_2023 = data[0]['Dados']['2023']

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(dados_2023)

# Print the column names to verify
print("Columns in the DataFrame:", df.columns)

# Rename the source columns to descriptive names.
df = df.rename(columns={
    'geocod': 'RegionCode',
    'geodsg': 'RegionName',
    'ind_string': 'RentString',
    'valor': 'RentValue',  # Ensure this matches the actual column name
    'sinal_conv': 'SignalCode',
    'sinal_conv_desc': 'SignalDescription'
})

# Convert RentValue to numeric. errors='coerce' turns every unparseable
# entry (such as a "-" placeholder) into NaN, so no separate replace step is
# needed; a string replace applied after this conversion could never match.
df['RentValue'] = pd.to_numeric(df['RentValue'], errors='coerce')

# Filter out rows with missing or non-applicable data
df_filtered = df[df['RentValue'].notna()]

# Display the cleaned DataFrame
print(df_filtered.head())

Columns in the DataFrame: Index(['geocod', 'geodsg', 'ind_string', 'valor', 'sinal_conv',
       'sinal_conv_desc'],
      dtype='object')
  RegionCode            RegionName RentString  RentValue SignalCode  \
0    11D0914  Vila Nova de Foz Côa       2,08       2.08        NaN   
1    11E0410             Vila Flor       2,29       2.29        NaN   
2    1960913              Trancoso       2,41       2.41        NaN   
3    1960501              Belmonte       2,61       2.61        NaN   
4    11C1302                 Baião       2,63       2.63        NaN   

  SignalDescription  
0               NaN  
1               NaN  
2               NaN  
3               NaN  
4               NaN  
