In [None]:
import pandas as pd
import json
import numpy as np

## Carregar os dados

In [None]:
# Load the House Price Index (HPI) records from JSON. The encoding is passed
# explicitly so decoding does not depend on the platform's default locale.
with open('../datasets/HPI_master.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

data_json = pd.DataFrame(json_data)

# Load the population-size CSV. read_csv already returns a DataFrame, so no
# additional pd.DataFrame(...) wrapping is needed.
data_csv_ps = pd.read_csv("../datasets/US_Population.csv")

----
## Visualizar os dados

In [None]:
# Preview the first rows of the House Price Index DataFrame.
data_json.head()

In [None]:
# Preview the first rows of the population-size DataFrame.
data_csv_ps.head()

#### **Tipos de dados do dataset: `Population Size`**

In [None]:
# Print column dtypes and non-null counts for the population-size DataFrame.
data_csv_ps.info()

#### **Tipos de dados do dataset: `House Price Index`**

In [None]:
# Show the dtype of every column of the House Price Index DataFrame.
data_json.dtypes

#### **Verificar a existência de Missing Values nos Dataframes**

In [None]:
def _count_nan_strings(column):
    """Return how many entries of `column` are the literal string 'NaN'."""
    return column.value_counts().get('NaN', 0)


# Real missing values (as detected by isnull) per column.
print("Number of missing values in House Price dataset:")
json_MV = data_json.isnull().sum()
print(json_MV)

# Occurrences of the literal string 'NaN' per column; these are ordinary
# string values and are therefore not counted by isnull() above.
print("Number of nan values in House Price dataset:")
nan_counts = data_json.apply(_count_nan_strings)
print(nan_counts)

In [None]:
# Count missing values per column of the population-size DataFrame and
# print the label followed by the counts on the next line.
populationSize_MV = data_csv_ps.isnull().sum()
print("Number of missing values in Population Size dataset:", populationSize_MV, sep="\n")

----
## Tratamento dos Dados

#### **Filtrar apenas as linhas com `year` igual ou superior a 1975**
Uma vez que nem todos os datasets contêm informação de anos anteriores

In [None]:
# Report both frames before the year filter is applied.
# Note: DataFrame.count() yields the number of non-null values per column,
# so each print shows one count per column rather than a single row total.
print("[Dataframes size before]:\n\n")
print("PS: ", data_csv_ps.count(), "\n")
print("HPI: ", data_json.count())

In [None]:
# Earliest year kept in both datasets.
MIN_YEAR = 1975

# Boolean indexing is followed by .copy() so that each filtered frame is an
# independent DataFrame. Later cells assign columns on these frames, which
# would otherwise raise pandas' SettingWithCopyWarning.
data_csv_ps = data_csv_ps[data_csv_ps['year'] >= MIN_YEAR].copy()
data_json = data_json[data_json['yr'] >= MIN_YEAR].copy()

Verificar a correção do tratamento

In [None]:
# Sanity check for the year filter: neither dataset may still contain a row
# with a year below 1975. Each dataset is checked exactly once through its own
# year column ('year' for the population data, 'yr' for the HPI data).
for dataset_label, frame, year_column in (
    ("PS", data_csv_ps, 'year'),
    ("HPI", data_json, 'yr'),
):
    # Index labels of the rows that violate the filter (empty when all is well).
    failed_index = frame.index[frame[year_column] < 1975]
    for index in failed_index:
        print("Falha detectada na linha", index, "do dataset", dataset_label)

In [None]:
# Preview the first rows of the HPI DataFrame after the year filter.
data_json.head()

In [None]:
# Report both frames after the year filter. DataFrame.count() yields the
# number of non-null values per column, not a single row total.
print("[Dataframe size after]: PS: ", data_csv_ps.count(), "HPI: ", data_json.count())

#### **Remover a coluna `index_sa`**

In [None]:
# Discard the `index_sa` column from the HPI frame.
data_json = data_json.drop(columns='index_sa')

#### **Transformar `period` em dados numéricos**

In [None]:
# astype returns a new Series, so the result must be assigned back for
# `period` to actually become an integer column in the DataFrame.
data_json['period'] = data_json['period'].astype(str).astype(int)

# Display the converted column to confirm its dtype.
data_json['period']

#### **Extrair `State` da localização no dataset `json` (House Price Index)**

In [None]:
import re

# Postal abbreviation -> full name for the 50 states plus the District of
# Columbia. The DC entry is required: "District of Columbia" is listed in
# special_names and has an id in state_id_map, so without it those rows would
# be mapped to NaN by `.map(state_names)` and lost in the later groupby.
_STATE_ABBREVIATIONS = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California",
    "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia",
    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
    "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
    "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont",
    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
    "DC": "District of Columbia",
}

# Places that only appear under their full name (territory, nation, divisions).
_OTHER_PLACE_NAMES = [
    "Puerto Rico", "United States",
    "East North Central Division", "East South Central Division",
    "Middle Atlantic Division", "Mountain Division", "New England Division",
    "South Atlantic Division", "West North Central Division",
    "West South Central Division", "Pacific Division",
]

# Lookup used to normalise an extracted value to a full place name:
# abbreviations map to the full name, full names map to themselves.
state_names = {
    **_STATE_ABBREVIATIONS,
    **{name: name for name in _STATE_ABBREVIATIONS.values()},
    **{name: name for name in _OTHER_PLACE_NAMES},
}


# Place names that are already at state / division / national level and must
# be returned unchanged by extract_state.
special_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona", "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "East North Central Division", "East South Central Division", "Middle Atlantic Division",
    "Mountain Division", "New England Division", "Pacific Division", "South Atlantic Division",
    "West North Central Division", "West South Central Division", "Florida", "Georgia", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan",
    "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska",
    "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
    "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "United States", "Utah",
    "Virginia", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"
]

def extract_state(place_name):
    """Return the state code embedded in a place name.

    Parameters
    ----------
    place_name : object
        Value from the `place_name` column.

    Returns
    -------
    object
        The first standalone two-letter uppercase token found in
        `place_name` (e.g. "TX" for "Abilene, TX"). The input is returned
        unchanged when it is listed in `special_names`, when it contains no
        such token, or when it is not a string (e.g. None / NaN), so missing
        values pass through instead of raising a TypeError.
    """
    # Non-string values cannot be searched; names in special_names are
    # already at the target granularity.
    if not isinstance(place_name, str) or place_name in special_names:
        return place_name
    match = re.search(r'\b([A-Z]{2})\b', place_name)
    return match.group(1) if match else place_name

# Derive each row's state in two steps: pull the code or name out of
# `place_name`, then normalise it to a full name via the state_names lookup
# (values without an entry in state_names become NaN).
extracted_places = data_json["place_name"].map(extract_state)
data_json["state"] = extracted_places.map(state_names)

#### **Tratar a coluna `index_nsa`**

In [None]:
# Round the HPI values to two decimal places.
data_json = data_json.assign(index_nsa=data_json['index_nsa'].round(2))

#### **Tratar a coluna `State`**

In [None]:
# Expand the abbreviations 'DC' and 'US' in the population dataset's `state`
# column to their full names in a single replace call.
state_aliases = {'DC': 'District of Columbia', 'US': 'United States'}
data_csv_ps['state'] = data_csv_ps['state'].replace(state_aliases)

#### **Renomear colunas**

É necessário renomear estas colunas para que seja possível efetuar o merge.

In [None]:
# Align column names across both datasets. rename() returns a new frame, which
# is rebound here instead of mutating the existing object with inplace=True.
data_csv_ps = data_csv_ps.rename(columns={'all': 'populationSize'})
data_json = data_json.rename(columns={'yr': 'year'})

In [None]:
# List the distinct values of the `level` column (dropped in a later cell).
data_json['level'].unique()

#### **Remover colunas**

In [None]:
# Remove the columns that are not used further. drop() returns a new frame,
# which is rebound here instead of mutating in place with inplace=True.
data_json = data_json.drop(columns=['hpi_type', 'hpi_flavor', 'frequency', 'place_id', 'level'])

#### **Agrupar dataset `HPI` (json) por estado e ano**

In [None]:
# Lookup from place name (as stored in the `state` column) to a numeric id.
# NOTE(review): the per-state values look like US Census FIPS state codes,
# while 0 (United States) and 80-88 (divisions) look like project-specific
# ids — confirm against the population dataset's `state_id` values.
state_id_map = {
    'United States': 0,
    'Alabama': 1,
    'Alaska': 2,
    'Arizona': 4,
    'Arkansas': 5,
    'California': 6,
    'Colorado': 8,
    'Connecticut': 9,
    'Delaware': 10,
    'District of Columbia': 11,
    'Florida': 12,
    'Georgia': 13,
    'Hawaii': 15,
    'Idaho': 16,
    'Illinois': 17,
    'Indiana': 18,
    'Iowa': 19,
    'Kansas': 20,
    'Kentucky': 21,
    'Louisiana': 22,
    'Maine': 23,
    'Maryland': 24,
    'Massachusetts': 25,
    'Michigan': 26,
    'Minnesota': 27,
    'Mississippi': 28,
    'Missouri': 29,
    'Montana': 30,
    'Nebraska': 31,
    'Nevada': 32,
    'New Hampshire': 33,
    'New Jersey': 34,
    'New Mexico': 35,
    'New York': 36,
    'North Carolina': 37,
    'North Dakota': 38,
    'Ohio': 39,
    'Oklahoma': 40,
    'Oregon': 41,
    'Pennsylvania': 42,
    'Rhode Island': 44,
    'South Carolina': 45,
    'South Dakota': 46,
    'Tennessee': 47,
    'Texas': 48,
    'Utah': 49,
    'Vermont': 50,
    'Virginia': 51,
    'Washington': 53,
    'West Virginia': 54,
    'Wisconsin': 55,
    'Wyoming': 56,
    'Puerto Rico': 72,
    'Virgin Islands': 78,
    'East North Central Division': 80,
    'East South Central Division': 81,
    'West North Central Division': 82,
    'West South Central Division': 83,
    'Middle Atlantic Division': 84,
    'South Atlantic Division': 85,
    'Pacific Division': 86,
    'Mountain Division': 87,
    'New England Division': 88
}

# Average `index_nsa` per (state, year), round the mean to two decimals and
# turn the group keys back into regular columns.
grouped_data = (
    data_json
    .groupby(['state', 'year'])['index_nsa']
    .mean()
    .round(2)
    .reset_index()
)

#### **Adicionar `state_id` ao dataset `HPI` (json) agrupado por estado e ano**

In [None]:
# Look up each state's numeric id; names missing from state_id_map get the
# placeholder -1 so the column can be stored as integers.
state_ids = grouped_data['state'].map(state_id_map).fillna(-1).astype(int)

# Attach the ids and place `state_id` first in the column order.
grouped_data = grouped_data.assign(state_id=state_ids)[['state_id', 'state', 'year', 'index_nsa']]

#### **Visualizar resultados do tratamento**

In [None]:
# Save the processed datasets to new CSV files (without the DataFrame index).
csv_exports = (
    (data_csv_ps, 'newPS.csv'),
    (grouped_data, 'newHPI.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

----
## Converter para Parquet

In [None]:
# DataFrame.to_parquet returns None when a path is given, so its result is not
# bound to a name; the files are read back from disk with Spark afterwards.
data_csv_ps.to_parquet('../parquetFiles/data_ps.parquet')
grouped_data.to_parquet('../parquetFiles/data_json.parquet')

## Realizar o Merge dos datasets

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse) the Spark session, configured with the MongoDB connector
# package and the driver/executor resources.
spark = (
    SparkSession.builder
    .appName("Conexao ao MongoDB Atlas")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

# Read back the Parquet files written by the pandas cells.
pq_population = spark.read.parquet('../parquetFiles/data_ps.parquet')  # population size
pq_housing = spark.read.parquet('../parquetFiles/data_json.parquet')  # housing (HPI)

# Inner join: keep only (state_id, state, year) combinations present in both.
join_keys = ['state_id', 'state', 'year']
df_merged = pq_population.join(pq_housing, on=join_keys, how='inner')

#### **Redução do tamanho do dataset**

- Remover colunas desnecessárias resultantes do merge
- Remover valores nulos e duplicados

In [None]:
# Drop the "__index_level_0__" column from the merged frame.
# NOTE(review): presumably this is the pandas index serialised by to_parquet
# for the filtered population frame — confirm.
df_merged = df_merged.drop("__index_level_0__")
# Show the first 40 rows of the merged data
df_merged.show(40)

Extrair uma amostra do dataset

In [None]:
# No sampling is applied here: df_final is bound to the full merged DataFrame.
df_final = df_merged

----
## **Armazenar os dados no MongoDB**

### Enviar os dados para o MongoDB

In [None]:
import pymongo
import re

# Connect to the MongoDB server on localhost (port 27017) and select the
# "bigdata" database.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["bigdata"]
