In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json

## Carregar os dados

In [2]:
# Carregar o dataset JSON
with open('HPI_master.json', 'r') as f:
    json_data = json.load(f)

# Converter o dataset JSON para um DataFrame do pandas
data_json = pd.DataFrame(json_data)

# Carregar o dataset CSV - Population size
data_csv_ps = pd.read_csv("cu.data.19.PopulationSize.csv")

# Carregar o dataset CSV - 
data_csv_fb = pd.read_csv("cu.data.11.USFoodBeverage.csv")

## Converter para Parquet

## Visualizar os dados

In [3]:
data_json.head()

Unnamed: 0,hpi_type,hpi_flavor,frequency,level,place_name,place_id,yr,period,index_nsa,index_sa
0,traditional,purchase-only,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,1,100.0,100.0
1,traditional,purchase-only,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,2,100.91,100.96
2,traditional,purchase-only,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,3,101.3,100.91
3,traditional,purchase-only,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,4,101.69,100.98
4,traditional,purchase-only,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,5,102.32,101.36


In [4]:
data_csv_ps.head()

Unnamed: 0,series_id,year,period,value,footnote_codes
0,CUURA000AA0,1986,M12,100.0,
1,CUURA000AA0,1987,M01,100.6,
2,CUURA000AA0,1987,M02,101.1,
3,CUURA000AA0,1987,M03,101.6,
4,CUURA000AA0,1987,M04,102.2,


In [5]:
data_csv_fb.head()

Unnamed: 0,series_id,year,period,value,footnote_codes
0,CUSR0000SAF,1967,M01,34.8,
1,CUSR0000SAF,1967,M02,34.7,
2,CUSR0000SAF,1967,M03,34.7,
3,CUSR0000SAF,1967,M04,34.6,
4,CUSR0000SAF,1967,M05,34.6,


#### **Tipos de dados do dataset: Population Size**

In [6]:
data_csv_ps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84760 entries, 0 to 84759
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   series_id       84760 non-null  object 
 1   year            84760 non-null  int64  
 2   period          84760 non-null  object 
 3   value           84760 non-null  float64
 4   footnote_codes  0 non-null      float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.2+ MB


Conseguimos observar a existência de dados do tipo `object`. Poderá ser necessário tratar este tipo de dados.

#### **Tipos de dados do dataset Json**

In [7]:
data_json.dtypes

hpi_type       object
hpi_flavor     object
frequency      object
level          object
place_name     object
place_id       object
yr              int64
period          int64
index_nsa     float64
index_sa       object
dtype: object

#### **Verificar os valores da coluna `period`**

In [8]:
# List all the different possible values in the 'period' column
period_values = data_csv_ps['period'].unique()
print("The 'period' column contains the following values:")
print(period_values)

The 'period' column contains the following values:
['M12' 'M01' 'M02' 'M03' 'M04' 'M05' 'M06' 'M07' 'M08' 'M09' 'M10' 'M11'
 'M13' 'S01' 'S02' 'S03']


## Tratamento dos Dados

### **Transformar `index_sa` em dados numéricos**

In [9]:
data_json['index_sa'] = data_json['index_sa'].astype(float)

#### **Transformar `period` em dados numéricos**

In [10]:
# Dicionário para mapear o período número ao valor da string
period_map = {
    'M01': 1,
    'M02': 2,
    'M03': 3,
    'M04': 4,
    'M05': 5,
    'M06': 6,
    'M07': 7,
    'M08': 8,
    'M09': 9,
    'M10': 10,
    'M11': 11,
    'M12': 12,
    'M13': 13,
    'S01': 14,
    'S02': 15,
    'S03': 16
}

data_csv_ps['period'] = data_csv_ps["period"].replace(period_map)
data_csv_fb['period'] = data_csv_fb["period"].replace(period_map)
data_json['period'].astype(str).astype(int)

0         1
1         2
2         3
3         4
4         5
         ..
121457    4
121458    1
121459    2
121460    3
121461    4
Name: period, Length: 121462, dtype: int64

#### **Remover a coluna `footnote_codes`**

In [11]:
data_csv_ps.drop(columns=["footnote_codes"], inplace=True)
data_csv_fb.drop(columns=["footnote_codes"], inplace=True)

In [12]:
data_csv_ps.head()

Unnamed: 0,series_id,year,period,value
0,CUURA000AA0,1986,12,100.0
1,CUURA000AA0,1987,1,100.6
2,CUURA000AA0,1987,2,101.1
3,CUURA000AA0,1987,3,101.6
4,CUURA000AA0,1987,4,102.2


In [13]:
data_csv_fb.head()

Unnamed: 0,series_id,year,period,value
0,CUSR0000SAF,1967,1,34.8
1,CUSR0000SAF,1967,2,34.7
2,CUSR0000SAF,1967,3,34.7
3,CUSR0000SAF,1967,4,34.6
4,CUSR0000SAF,1967,5,34.6


## Tratar Nan Values

In [14]:
median_value = data_json['index_sa'].median()
data_json['index_sa'] = data_json['index_sa'].fillna(median_value)

## Converter para Parquet

In [15]:
data_json.rename(columns = {'yr':'year'}, inplace = True)

# Convert to PyArrow table
table1 = pa.Table.from_pandas(data_json)
table2 = pa.Table.from_pandas(data_csv_ps)
table3 = pa.Table.from_pandas(data_csv_fb)

# Write to Parquet file
pq.write_table(table1, 'data_json.parquet')
pq.write_table(table2, 'data_ps.parquet')
pq.write_table(table3, 'data_fb.parquet')

## Realizar o Merge dos datasets

In [23]:
# Read Parquet files

table_json = pq.read_table('data_json.parquet')
table_ps = pq.read_table('data_ps.parquet')
table_fb = pq.read_table('data_fb.parquet')

# Join tables based on 'year' and 'period' columns
merged_table = table_json.join(table_ps, on=['year', 'period']).join(table_fb, on=['year', 'period'])

# Write merged table to Parquet file
pq.write_table(merged_table, 'merged_data.parquet')

TypeError: join() takes at least 2 positional arguments (1 given)

## Parquet

In [None]:
# Testar se o parquet está a funcionar
table = pq.read_table('merged_data.parquet')

# Convert to Pandas DataFrame
df = table.to_pandas()

df.info()