In [1]:
import pandas as pd
from datetime import datetime as dt

# Rocinha - CLIMA DIARIO

In [2]:
# Data Frame + Dimensáo
df_clima_rc = pd.read_excel('Rocinha-2015-2023_Diario.xlsx')
print ('Numero de linhas: {}'.format(df_clima_rc.shape[0]))
print ('Numero de colunas: {}'.format(df_clima_rc.shape[1]))

Numero de linhas: 3288
Numero de colunas: 11


In [3]:
df_clima_rc.head()

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2015-07-01,24.1,21.0,28.0,,,292.0,11.8,,,
1,2015-07-02,22.0,20.0,24.0,,,106.0,9.6,,,
2,2015-07-03,22.6,21.0,25.0,,,20.0,5.9,,,
3,2015-07-04,23.2,18.0,27.0,,,271.0,13.8,,,
4,2015-07-05,18.3,17.0,20.0,,,240.0,7.9,,,


In [4]:
# Renomeando as colunas
df_clima_rc = df_clima_rc.rename(columns={'time': 'data', 'tavg': 'temperaturaMedia', 'tmin': 'temperaturaMinima',
                                         'tmax': 'temperaturaMaxima', 'prcp': 'chuva', 'snow': 'neve',
                                          'wdir': 'direcaoVento', 'wspd': 'velocidadeVento', 'wpgt': 'rajadaVento',
                                          'pres': 'pressaoAtmosferica', 'tsun': 'luzSolar'})

In [5]:
# Dropando as colunas que não serão utilizadas
# Lista de colunas a serem removidas
colunas_para_dropar = ['neve', 'luzSolar', 'rajadaVento']

# Dropando as colunas
df_clima_rc = df_clima_rc.drop(columns=colunas_para_dropar)

In [6]:
# Colunas do dataset
df_clima_rc.columns

Index(['data', 'temperaturaMedia', 'temperaturaMinima', 'temperaturaMaxima',
       'chuva', 'direcaoVento', 'velocidadeVento', 'pressaoAtmosferica'],
      dtype='object')

In [7]:
# Tipo dos dados
df_clima_rc.dtypes

data                  datetime64[ns]
temperaturaMedia             float64
temperaturaMinima            float64
temperaturaMaxima            float64
chuva                        float64
direcaoVento                 float64
velocidadeVento              float64
pressaoAtmosferica           float64
dtype: object

In [8]:
#Check NA
df_clima_rc.isna().sum()

data                     0
temperaturaMedia         9
temperaturaMinima        9
temperaturaMaxima        9
chuva                 2013
direcaoVento            46
velocidadeVento         10
pressaoAtmosferica     329
dtype: int64

In [9]:
# Calcular a porcentagem de valores ausentes em cada coluna
missing_percentages = (df_clima_rc.isnull().sum() / len(df_clima_rc)) * 100

missing_percentages

data                   0.000000
temperaturaMedia       0.273723
temperaturaMinima      0.273723
temperaturaMaxima      0.273723
chuva                 61.222628
direcaoVento           1.399027
velocidadeVento        0.304136
pressaoAtmosferica    10.006083
dtype: float64

In [10]:
#Visualizar algumas linhas dos valores faltantes
df_clima_rc[df_clima_rc.isnull().any(axis=1)].head(10)

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica
0,2015-07-01,24.1,21.0,28.0,,292.0,11.8,
1,2015-07-02,22.0,20.0,24.0,,106.0,9.6,
2,2015-07-03,22.6,21.0,25.0,,20.0,5.9,
3,2015-07-04,23.2,18.0,27.0,,271.0,13.8,
4,2015-07-05,18.3,17.0,20.0,,240.0,7.9,
5,2015-07-06,19.7,18.0,22.0,,42.0,8.2,
6,2015-07-07,21.6,19.0,25.0,,78.0,6.9,
7,2015-07-08,22.5,20.0,25.9,,354.0,6.5,
8,2015-07-09,23.4,21.0,28.0,,248.0,11.0,
9,2015-07-10,22.0,19.0,26.0,,86.0,9.4,


In [11]:
# Descobrindo a Média Móvel das colunas e completando os valores faltantes

window_size = 52

df_clima_rc['temperaturaMedia'].fillna(df_clima_rc['temperaturaMedia'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_rc['temperaturaMinima'].fillna(df_clima_rc['temperaturaMinima'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_rc['temperaturaMaxima'].fillna(df_clima_rc['temperaturaMaxima'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_rc['velocidadeVento'].fillna(df_clima_rc['velocidadeVento'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_rc['direcaoVento'].fillna(df_clima_rc['direcaoVento'].mean(), inplace=True)
df_clima_rc['pressaoAtmosferica'].fillna(df_clima_rc['pressaoAtmosferica'].mean(), inplace=True)
df_clima_rc['chuva'].fillna(0, inplace=True)

In [12]:
# Quantidade de valores faltantes do Dataset após alteração
df_clima_rc.isnull().sum()

data                  0
temperaturaMedia      0
temperaturaMinima     0
temperaturaMaxima     0
chuva                 0
direcaoVento          0
velocidadeVento       0
pressaoAtmosferica    0
dtype: int64

In [13]:
df_clima_rc.head()

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica
0,2015-07-01,24.1,21.0,28.0,0.0,292.0,11.8,1015.162893
1,2015-07-02,22.0,20.0,24.0,0.0,106.0,9.6,1015.162893
2,2015-07-03,22.6,21.0,25.0,0.0,20.0,5.9,1015.162893
3,2015-07-04,23.2,18.0,27.0,0.0,271.0,13.8,1015.162893
4,2015-07-05,18.3,17.0,20.0,0.0,240.0,7.9,1015.162893


## REALIZAR O AGRUPAMENTO A PARTIR DA SEMANA

In [14]:
# week of year
df_clima_rc['semana_num'] = df_clima_rc['data'].dt.strftime('%Y-%U')
df_clima_rc

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica,semana_num
0,2015-07-01,24.1,21.0,28.0,0.0,292.0,11.8,1015.162893,2015-26
1,2015-07-02,22.0,20.0,24.0,0.0,106.0,9.6,1015.162893,2015-26
2,2015-07-03,22.6,21.0,25.0,0.0,20.0,5.9,1015.162893,2015-26
3,2015-07-04,23.2,18.0,27.0,0.0,271.0,13.8,1015.162893,2015-26
4,2015-07-05,18.3,17.0,20.0,0.0,240.0,7.9,1015.162893,2015-27
...,...,...,...,...,...,...,...,...,...
3283,2024-06-26,23.0,21.7,25.6,0.2,60.0,14.5,1015.200000,2024-25
3284,2024-06-27,23.4,22.6,26.2,0.1,199.0,20.4,1013.800000,2024-25
3285,2024-06-28,22.9,21.4,24.6,0.0,58.0,15.7,1013.000000,2024-25
3286,2024-06-29,23.4,21.9,25.1,1.7,249.0,21.4,1014.400000,2024-25


In [15]:
# Crie um intervalo de datas que cubra todos os anos e semanas desejados
start_date = pd.to_datetime('2015-01-01')  # Data inicial desejada
end_date = pd.to_datetime('2024-06-30')    # Data final desejada
date_range = pd.date_range(start=start_date, end=end_date, freq='W-SAT')  # Intervalo de datas semanais

In [16]:
# Crie um DataFrame com todas as semanas desejadas
all_weeks = pd.DataFrame({'data': date_range})
all_weeks['semana_num'] = all_weeks['data'].dt.strftime('%Y-%U')

In [17]:
# Junte os dados existentes com o DataFrame de todas as semanas
df_merged = all_weeks.merge(df_clima_rc, on='semana_num', how='left')

In [18]:
# Junta os dados usando a data do domingo como representação da semana
df_clima_final = df_merged.groupby('semana_num').agg({
    'chuva': 'sum',
    'temperaturaMedia': 'mean',
    'temperaturaMaxima': 'max',
    'temperaturaMinima': 'min',
    'direcaoVento': 'mean',
    'velocidadeVento': 'mean',
    'pressaoAtmosferica': 'mean'
}).reset_index()

In [19]:
# Crie uma coluna 'ano' e 'semana_num' a partir da coluna 'semana'
df_clima_final['ano'] = df_clima_final['semana_num'].str[:4]
df_clima_final['semana'] = df_clima_final['semana_num'].str[5:].astype(int)

In [20]:
def get_sunday_date(row):
    year = int(row['semana_num'][:4])
    week_num = int(row['semana_num'][5:])
    sunday = pd.to_datetime(f'{year}-W{week_num}-0', format='%Y-W%W-%w')
    return sunday

In [21]:
df_clima_final['data'] = df_clima_final.apply(get_sunday_date, axis=1)
df_clima_final.head(60)

Unnamed: 0,semana_num,chuva,temperaturaMedia,temperaturaMaxima,temperaturaMinima,direcaoVento,velocidadeVento,pressaoAtmosferica,ano,semana,data
0,2015-00,0.0,,,,,,,2015,0,2015-01-04
1,2015-01,0.0,,,,,,,2015,1,2015-01-11
2,2015-02,0.0,,,,,,,2015,2,2015-01-18
3,2015-03,0.0,,,,,,,2015,3,2015-01-25
4,2015-04,0.0,,,,,,,2015,4,2015-02-01
5,2015-05,0.0,,,,,,,2015,5,2015-02-08
6,2015-06,0.0,,,,,,,2015,6,2015-02-15
7,2015-07,0.0,,,,,,,2015,7,2015-02-22
8,2015-08,0.0,,,,,,,2015,8,2015-03-01
9,2015-09,0.0,,,,,,,2015,9,2015-03-08


In [22]:
# Dropando as colunas que não serão utilizadas
# Lista de colunas a serem removidas
colunas_para_dropar2 = ['semana_num', 'ano']

# Dropando as colunas
df_clima_final = df_clima_final.drop(columns=colunas_para_dropar2)
df_clima_final

Unnamed: 0,chuva,temperaturaMedia,temperaturaMaxima,temperaturaMinima,direcaoVento,velocidadeVento,pressaoAtmosferica,semana,data
0,0.0,,,,,,,0,2015-01-04
1,0.0,,,,,,,1,2015-01-11
2,0.0,,,,,,,2,2015-01-18
3,0.0,,,,,,,3,2015-01-25
4,0.0,,,,,,,4,2015-02-01
...,...,...,...,...,...,...,...,...,...
491,26.6,21.842857,24.0,19.8,152.571429,22.428571,1021.028571,21,2024-05-26
492,9.9,22.085714,23.9,20.1,86.000000,16.157143,1022.085714,22,2024-06-02
493,0.0,22.814286,25.7,20.1,51.714286,15.142857,1018.785714,23,2024-06-09
494,0.0,23.414286,26.7,21.4,84.142857,15.571429,1016.800000,24,2024-06-16


In [23]:
#Check NA
df_clima_final.isna().sum()

chuva                  0
temperaturaMedia      26
temperaturaMaxima     26
temperaturaMinima     26
direcaoVento          26
velocidadeVento       26
pressaoAtmosferica    26
semana                 0
data                   0
dtype: int64

In [24]:
# Descobrindo a Média Móvel das colunas e completando os valores faltantes

window_size = 30

df_clima_final['temperaturaMedia'].fillna(df_clima_final['temperaturaMedia'].mean(), inplace=True)
df_clima_final['temperaturaMinima'].fillna(df_clima_final['temperaturaMinima'].mean(), inplace=True)
df_clima_final['temperaturaMaxima'].fillna(df_clima_final['temperaturaMaxima'].mean(), inplace=True)
df_clima_final['velocidadeVento'].fillna(df_clima_final['velocidadeVento'].mean(), inplace=True)
df_clima_final['direcaoVento'].fillna(df_clima_final['direcaoVento'].mean(), inplace=True)
df_clima_final['pressaoAtmosferica'].fillna(df_clima_final['pressaoAtmosferica'].mean(), inplace=True)
df_clima_final['chuva'].fillna(0, inplace=True)

In [25]:
#Check NA
df_clima_final.isna().sum()

chuva                 0
temperaturaMedia      0
temperaturaMaxima     0
temperaturaMinima     0
direcaoVento          0
velocidadeVento       0
pressaoAtmosferica    0
semana                0
data                  0
dtype: int64