In [None]:
import pandas as pd
import json
import numpy as np

## Carregar os dados

In [None]:
# Load the JSON dataset - House Price Index (HPI).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('../datasets/HPI_master.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

data_json = pd.DataFrame(json_data)

# Load the CSV dataset - Population size.
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapping is needed.
data_csv_ps = pd.read_csv("../datasets/cu.data.19.PopulationSize.csv")

# Load the CSV dataset - Food & Beverage.
data_csv_fb = pd.read_csv("../datasets/cu.data.11.USFoodBeverage.csv")


## Visualizar os dados

In [None]:
# Preview the first rows of the HPI DataFrame.
data_json.head()

In [None]:
# Preview the first rows of the Population Size DataFrame.
data_csv_ps.head()

In [None]:
# Preview the first rows of the Food & Beverage DataFrame.
data_csv_fb.head()

#### **Tipos de dados do dataset: Food & Beverage**

In [None]:
# Print column dtypes and non-null counts for the Food & Beverage DataFrame.
data_csv_fb.info()

#### **Tipos de dados do dataset: Population Size**

In [None]:
# Print column dtypes and non-null counts for the Population Size DataFrame.
data_csv_ps.info()

Conseguimos observar a existência de dados do tipo `object`. Poderá ser necessário tratar este tipo de dados.

#### **Tipos de dados do dataset Json**

In [None]:
# Show the dtype of every column of the HPI DataFrame.
data_json.dtypes

### Remover linhas com ano anterior a 1977, uma vez que o dataset FoodBeverage começa nesse ano

In [None]:
# Report how many rows each DataFrame holds before the year filter.
# len() gives the row count; DataFrame.count() would instead print a Series of
# per-column non-null counts, which is not the "size" the label announces.
print("[Dataframes size before]:\n\n")
print("FB: ", len(data_csv_fb), "\n")
print("PS: ", len(data_csv_ps), "\n")
print("HPI: ", len(data_json))

#### Filtrar apenas as linhas com o ano igual ou superior a 1977
Uma vez que nem todos os datasets contêm informação de anos anteriores

In [None]:
# Keep only the rows from FIRST_YEAR onwards in all three DataFrames.
# .copy() makes each result an independent frame, so the column assignments
# done in later cells do not raise SettingWithCopyWarning.
FIRST_YEAR = 1977

data_csv_fb = data_csv_fb[data_csv_fb['year'] >= FIRST_YEAR].copy()
data_csv_ps = data_csv_ps[data_csv_ps['year'] >= FIRST_YEAR].copy()
data_json = data_json[data_json['yr'] >= FIRST_YEAR].copy()


In [None]:
# Sanity check: report any row that survived the filter with a year before 1977.
# Each of the three DataFrames is checked against its own year column
# (the HPI frame names it `yr`); the label identifies the dataset in the message.
for label, frame, year_col in (
    ("FB", data_csv_fb, 'year'),
    ("PS", data_csv_ps, 'year'),
    ("HPI", data_json, 'yr'),
):
    # Vectorised comparison instead of iterating over every row.
    for index in frame.index[frame[year_col] < 1977]:
        print(f"[{label}]", "Falha detectada na linha", index)


In [None]:
# Preview the first rows of the HPI DataFrame at this point in the notebook.
data_json.head()

In [None]:
# Report the row count of each DataFrame after the year filter.
# len() is used because DataFrame.count() returns per-column non-null counts.
print("[Dataframe size after]: FB: ", len(data_csv_fb), "PS: ", len(data_csv_ps), "HPI: ", len(data_json))

#### **Verificar a existência de Missing Values nos Dataframes**

In [None]:
# Count the missing values in each column of the HPI DataFrame and print them.
json_MV = data_json.isna().sum()
print("Number of missing values in House Price dataset:", json_MV, sep="\n")

In [None]:
# Count the missing values in each column of the Food & Beverage DataFrame and print them.
foodBeverage_MV = data_csv_fb.isna().sum()
print("Number of missing values in FoodBeverage dataset:", foodBeverage_MV, sep="\n")

In [None]:
# Count the missing values in each column of the Population Size DataFrame and print them.
populationSize_MV = data_csv_ps.isna().sum()
print("Number of missing values in Population Size dataset:", populationSize_MV, sep="\n")

#### **Verificar os valores da coluna `period`**

In [None]:
# List the distinct values found in the `period` column of the Population Size DataFrame.
period_values = pd.unique(data_csv_ps['period'])
print("The 'period' column contains the following values:", period_values, sep="\n")

## Tratamento dos Dados

#### **Transformar `index_sa` em dados numéricos**

In [None]:
# Cast the `index_sa` column to float; all other columns keep their dtype.
data_json = data_json.astype({'index_sa': float})

#### **Transformar `period` em dados numéricos**

In [None]:
# Map the period codes to integers: M01..M13 -> 1..13 and S01..S03 -> 14..16.
period_map = {f"M{month:02d}": month for month in range(1, 14)}
period_map.update({'S01': 14, 'S02': 15, 'S03': 16})

data_csv_ps['period'] = data_csv_ps["period"].replace(period_map)
data_csv_fb['period'] = data_csv_fb["period"].replace(period_map)
# Assign the converted column back: without the assignment the cast result is
# discarded and `period` in the HPI DataFrame keeps its original dtype.
data_json['period'] = data_json['period'].astype(str).astype(int)

#### Usar apenas a média anual

In [None]:
# Keep the rows whose period is 13, plus every row from 2017 regardless of period.
keep_fb = data_csv_fb['period'].eq(13) | data_csv_fb['year'].eq(2017)
data_csv_fb = data_csv_fb.loc[keep_fb]

keep_ps = data_csv_ps['period'].eq(13) | data_csv_ps['year'].eq(2017)
data_csv_ps = data_csv_ps.loc[keep_ps]

#### Tratar o ano 2017

In [None]:
def _mean_rows_2017(frame):
    """Return one row per series_id with the mean `value` of its 2017 rows.

    The result has the columns series_id, year (2017), period (13) and value.
    """
    return (
        frame[frame['year'] == 2017]
        .groupby('series_id', as_index=False)['value']
        .mean()
        .assign(year=2017, period=13)
        [['series_id', 'year', 'period', 'value']]
    )


# Build the aggregated 2017 rows for each dataset.
new_data_fb = _mean_rows_2017(data_csv_fb)
new_data_ps = _mean_rows_2017(data_csv_ps)

# Put the aggregated 2017 rows in front of the rows from every other year.
data_csv_fb = pd.concat(
    [new_data_fb, data_csv_fb.loc[data_csv_fb['year'] != 2017]],
    ignore_index=True,
)
data_csv_ps = pd.concat(
    [new_data_ps, data_csv_ps.loc[data_csv_ps['year'] != 2017]],
    ignore_index=True,
)


#### Tratar dataset json

In [None]:
# Group the data by place_id, yr, hpi_flavor and level
groups = data_json.groupby(['place_id', 'yr', 'hpi_flavor', 'level'])

# Replace the documents of each year with the means of index_nsa and index_sa
def replace_docs(group):
    """Collapse one group of HPI rows into a single-row DataFrame.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group. Must contain the columns hpi_type,
        hpi_flavor, frequency, level, place_name, place_id, yr, index_nsa
        and index_sa, and at least one row.

    Returns
    -------
    pandas.DataFrame
        One row: the descriptive columns are taken from the first row of
        `group`; index_nsa and index_sa hold the group means.
    """
    carried_columns = [
        'hpi_type', 'hpi_flavor', 'frequency', 'level',
        'place_name', 'place_id', 'yr',
    ]

    # Descriptive fields are copied from the first row of the group.
    doc = {column: group[column].iloc[0] for column in carried_columns}

    # Series.mean() skips NaN like np.nanmean, but returns NaN for an all-NaN
    # group without emitting the "Mean of empty slice" RuntimeWarning.
    doc['index_nsa'] = group['index_nsa'].mean()
    doc['index_sa'] = group['index_sa'].mean()

    return pd.DataFrame([doc])

# Apply the function to each group and combine the results.
# The groups are iterated explicitly instead of using GroupBy.apply: iteration
# always yields the full sub-frame, grouping columns included (replace_docs
# reads them), whereas apply's handling of grouping columns is deprecated
# since pandas 2.2.
yearly_docs = [replace_docs(group) for _, group in groups]

# pd.concat rejects an empty list, so fall back to an empty frame if there are no groups.
result = pd.concat(yearly_docs, ignore_index=True) if yearly_docs else data_json.iloc[0:0]

data_json = result

#### Remover a coluna `period`

In [None]:
# Drop the `period` column from both CSV-derived DataFrames.
data_csv_fb = data_csv_fb.drop(columns='period')
data_csv_ps = data_csv_ps.drop(columns='period')

#### **Remover a coluna `footnote_codes`**

In [None]:
# Drop the `footnote_codes` column from both CSV-derived DataFrames.
data_csv_ps = data_csv_ps.drop(columns="footnote_codes")
data_csv_fb = data_csv_fb.drop(columns="footnote_codes")

#### Tratar da coluna `value`

In [None]:

# Round every value/index column to two decimal places.
for frame, column in (
    (data_csv_fb, 'value'),
    (data_csv_ps, 'value'),
    (data_json, 'index_nsa'),
    (data_json, 'index_sa'),
):
    frame[column] = frame[column].round(2)

#### **Tratar Nan Values**

In [None]:
# Fill missing `index_sa` values with the median of the whole column.
index_sa = data_json['index_sa']
median_value = index_sa.median()
data_json['index_sa'] = index_sa.fillna(value=median_value)

#### **Renomear colunas**

É necessário renomear estas colunas para ser possível efetuar o merge

In [None]:
# Rename columns so the three datasets share the `year` key and keep
# distinguishable value/id column names.
data_json = data_json.rename(columns={'yr': 'year'})
data_csv_fb = data_csv_fb.rename(
    columns={'value': 'valueFoodBeverage', 'series_id': 'idFoodBeverage'}
)
# The Population Size id must be renamed on data_csv_ps itself; renaming it on
# data_csv_fb is a no-op because that frame's `series_id` is already renamed.
data_csv_ps = data_csv_ps.rename(
    columns={'value': 'valuePopSize', 'series_id': 'idPopSize'}
)

#### Visualizar resultados do tratamento

In [None]:
# Save the results to new CSV files (without the index column).
for csv_name, frame in (
    ('newFB.csv', data_csv_fb),
    ('newPS.csv', data_csv_ps),
    ('newHPI.csv', data_json),
):
    frame.to_csv(csv_name, index=False)

## Converter para Parquet

In [None]:
# Write each cleaned DataFrame to a Parquet file.
# to_parquet returns None when given a path, so its result is not stored.
# index=False keeps the pandas index out of the files.
data_csv_fb.to_parquet('../parquetFiles/data_fb.parquet', index=False)
data_csv_ps.to_parquet('../parquetFiles/data_ps.parquet', index=False)
data_json.to_parquet('../parquetFiles/data_json.parquet', index=False)

# Read every file back and print its shape, so all three round-trips are
# visible rather than only the last expression of the cell.
for parquet_path in (
    '../parquetFiles/data_fb.parquet',
    '../parquetFiles/data_ps.parquet',
    '../parquetFiles/data_json.parquet',
):
    print(parquet_path, pd.read_parquet(parquet_path).shape)

## Realizar o Merge dos datasets

In [None]:
# NOTE(review): the two star imports below pull every public name of
# pyspark.sql.functions and pyspark.sql.types into the notebook namespace.
# This presumably shadows Python built-ins of the same name (e.g. round, sum,
# max) - confirm, and prefer `from pyspark.sql import functions as F`.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse) a Spark session configured with the MongoDB Spark connector
# package and explicit driver/executor memory and core settings.
spark = SparkSession.builder \
    .appName("Conexao ao MongoDB Atlas") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

# Load the three Parquet files as Spark DataFrames
# (pq1 = Food & Beverage, pq2 = Population Size, pq3 = HPI).
pq1 = spark.read.format("parquet").load('../parquetFiles/data_fb.parquet')
pq2 = spark.read.format("parquet").load('../parquetFiles/data_ps.parquet')
pq3 = spark.read.format("parquet").load('../parquetFiles/data_json.parquet')

# Merge: inner join of Population Size (pq2) with HPI (pq3) on `year`.
# pq1 is loaded but not part of the active join; the three-way join is kept
# commented out below.
#df4 = pq1.join(pq2, on=['year'], how='inner').join(pq3, on=['year'], how='inner')
df4 = pq2.join(pq3, on=['year'], how='inner')

#### **Redução do tamanho do dataset**

Remover colunas desnecessárias resultantes do merge

In [None]:
# Drop the pandas index column that may have been carried into the Parquet files.
df4 = df4.drop("__index_level_0__")

In [None]:
# Fetch the first 10 rows of the joined Spark DataFrame.
df4.head(10)

Remover valores nulos e duplicados 

In [None]:
# Remove rows containing nulls, then remove exact duplicate rows.
df4 = df4.dropna().dropDuplicates()

Extrair uma amostra do dataset (a amostragem está atualmente desativada: é usado o dataset completo)

In [None]:
# Sampling is currently disabled: df5 is the full df4. The commented-out lines
# would take a sample with fraction 0.1 and print its row count.
#df5 = df4.sample(0.1)
df5 = df4
#print("df5 - Number of rows: ", df5.count())

----
## **Armazenar os dados no MongoDB**

### Enviar os dados para o MongoDB

- Teste com dataset menor: Criar as coleções para cada ano

In [None]:
# Working version, kept disabled: the code below is wrapped in a string literal,
# so it is not executed. It reads newFB.csv with Spark and inserts the rows into
# one MongoDB collection per year.
"""
import pymongo

# Load CSV file into a Spark DataFrame
df7 = spark.read.csv("newFB.csv", header=True, inferSchema=True)

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["bigdata"]

# Loop through each year and insert the data into the corresponding collection
for year in df7.select('year').distinct().rdd.flatMap(lambda x: x).collect():
    collection_name = f"data_{year}"
    collection = db[collection_name]
    year_data = df7.filter(df7.year == year).toJSON().map(lambda x: json.loads(x)).collect()
    collection.insert_many(year_data)
"""

- Teste com dataset maior

In [None]:
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["bigdata"]

print("Starting the loop...")

# Collect the distinct years on the driver; sorting makes the progress output
# and insertion order deterministic.
years = sorted(row['year'] for row in df5.select('year').distinct().collect())

# Years whose insertion failed, reported at the end.
failed_years = []

# Loop through each year and insert the data into the corresponding collection.
# Errors are handled per year, so one failing year neither hides which year
# failed nor prevents the remaining years from being stored.
for year in years:
    print("Year: ", year)
    try:
        year_rows = df5.filter(df5.year == year).toJSON().collect()
        year_data = [json.loads(doc) for doc in year_rows]
        # insert_many rejects an empty list, so skip years with no rows.
        if year_data:
            db[f"data_{year}"].insert_many(year_data)
    except Exception as e:
        failed_years.append(year)
        print("Error:", e)

if failed_years:
    print("Years that could not be stored:", failed_years)