# Imports

In [2]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Helper Functions

In [24]:
def load_data(folder_from_parent:str, filename:str) -> pd.DataFrame:
    """Load a tabular file from a folder that sits next to the working directory.

    The file is looked up at ``<parent of cwd>/<folder_from_parent>/<filename>``.

    Parameters
    ----------
    folder_from_parent : str
        Folder name (or relative path) resolved against the parent of the
        current working directory.
    filename : str
        Name of the file to read. A ``.xlsx`` suffix (case-insensitive) is
        read with ``pd.read_excel``; anything else is read with ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.

    Raises
    ------
    FileNotFoundError
        If the resolved path does not exist. The message contains the full
        path that was tried.
    """
    path_ = Path.cwd().parent / folder_from_parent / filename

    try:
        # Decide on the real file suffix rather than a substring match, so a
        # name such as 'export.xlsx.csv' is not sent to the Excel reader.
        if path_.suffix.lower() == '.xlsx':
            return pd.read_excel(path_)
        return pd.read_csv(path_)
    except FileNotFoundError as exc:
        # Previously this branch only printed and then fell through to
        # `return df` with `df` unbound, hiding the real cause behind an
        # UnboundLocalError. Surface the missing path instead.
        raise FileNotFoundError(f'File is missing: {path_}') from exc

# Load data

In [25]:
# Load the raw dataset from <parent of cwd>/data/small_juro_pre.xlsx.
df = load_data('data', 'small_juro_pre.xlsx')

In [26]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Data,Abertura,Máxima,Mínima,Fechamento,DATA,PRÉ 10YRS
0,2022-03-02,2319.07,2540.46,2178.15,2532.16,2022-03-31,11.72
1,2022-02-01,2446.03,2468.0,2244.38,2319.07,2022-02-25,11.63
2,2022-01-03,2365.28,2449.23,2074.96,2446.03,2022-01-31,11.361909
3,2021-12-01,2278.72,2476.86,2211.72,2365.24,2021-12-31,10.71318
4,2021-11-01,2332.01,2628.9,2225.65,2278.71,2021-11-30,11.349279


# Exploratory Data Analysis

## Data Overview

This section gives a quick overview of the dataset.

It summarises the dataframe: column types, non-null counts, missing values and the number of unique values per column.

In [27]:
# Print column names, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Data        163 non-null    datetime64[ns]
 1   Abertura    163 non-null    float64       
 2   Máxima      163 non-null    float64       
 3   Mínima      163 non-null    float64       
 4   Fechamento  163 non-null    float64       
 5   DATA        163 non-null    datetime64[ns]
 6   PRÉ 10YRS   163 non-null    float64       
dtypes: datetime64[ns](2), float64(5)
memory usage: 9.0 KB


In [30]:
# missing values per column
df.isna().sum()

Data          0
Abertura      0
Máxima        0
Mínima        0
Fechamento    0
DATA          0
PRÉ 10YRS     0
dtype: int64

In [31]:
# distinct values per column
df.nunique()

Data          163
Abertura      163
Máxima        163
Mínima        163
Fechamento    163
DATA          163
PRÉ 10YRS     162
dtype: int64

## Data Cleaning

In this section we handle null/missing values, fix incorrect types and rename columns.

In [None]:
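# TODO: drop duplicated column (not implemented yet; presumably one of the two date columns 'Data' / 'DATA' — confirm which to keep)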


In [43]:
# Give the two upper-case labels snake_case names. `errors='raise'` makes the
# cell fail loudly if either label is absent from `df`.
# NOTE(review): the result holds both 'Data' and 'data', which differ only by
# case — confirm that this is intended.
df_cleaned = df.rename(
    {'DATA': 'data', 'PRÉ 10YRS': 'pre_10yr'},
    axis='columns',
    errors='raise',
)
df_cleaned.head()

KeyError: "['DATA', 'PRÉ 10YRS'] not found in axis"

## Summary Statistics

This section gives a statistical summary of the numerical columns in the dataset.

It calculates statistical measures such as the percentiles, mean and standard deviation of the numerical columns of the DataFrame.

In [36]:
# Select the numeric columns explicitly: on pandas >= 2.0 `describe()` also
# summarises datetime columns, which this section does not intend to cover.
df_cleaned.select_dtypes(include='number').describe()

Unnamed: 0,Abertura,Máxima,Mínima,Fechamento,PRÉ 10YRS
count,163.0,163.0,163.0,163.0,163.0
mean,1499.683742,1575.750245,1425.761104,1509.850798,11.215889
std,604.214394,640.979235,575.990837,607.514758,2.095075
min,469.32,492.85,405.71,469.32,6.749
25%,1139.485,1180.96,1079.415,1141.215,9.866491
50%,1329.2,1402.17,1278.91,1332.39,11.471032
75%,1730.195,1775.885,1641.33,1745.885,12.410128
max,3144.75,3232.89,3104.51,3144.68,17.45


**Correlation:**

Note that the 'Fechamento' attribute has a strong negative correlation (about -0.76) with 'pre_10yr' (the 'PRÉ 10YRS' column).

In [38]:
# correlation between numeric attributes
# Select the numeric columns explicitly: on pandas >= 2.0 `DataFrame.corr()`
# defaults to numeric_only=False and no longer drops the datetime columns.
df_cleaned.select_dtypes(include='number').corr()

Unnamed: 0,Abertura,Máxima,Mínima,Fechamento,PRÉ 10YRS
Abertura,1.0,0.994262,0.981098,0.979408,-0.728551
Máxima,0.994262,1.0,0.980592,0.986654,-0.743624
Mínima,0.981098,0.980592,1.0,0.99239,-0.741874
Fechamento,0.979408,0.986654,0.99239,1.0,-0.760568
PRÉ 10YRS,-0.728551,-0.743624,-0.741874,-0.760568,1.0
