In [1]:
import pandas as pd
from datetime import datetime as dt

# Cidade de Deus - CLIMA DIARIO

In [2]:
# Data Frame + Dimensáo
df_clima_cd = pd.read_excel('CidadeDeDeus-2015-2023_Diario.xlsx')
print ('Numero de linhas: {}'.format(df_clima_cd.shape[0]))
print ('Numero de colunas: {}'.format(df_clima_cd.shape[1]))

Numero de linhas: 3469
Numero de colunas: 11


In [3]:
df_clima_cd.head()

Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2015-01-01,31.0,24.0,40.0,,,47.0,9.9,,,
1,2015-01-02,32.0,24.0,40.0,,,224.0,7.8,,,
2,2015-01-03,29.1,26.0,33.0,,,209.0,15.3,,,
3,2015-01-04,27.1,24.0,34.0,,,165.0,8.6,,,
4,2015-01-05,25.1,22.0,33.0,,,345.0,5.2,,,


In [4]:
# Renomeando as colunas
df_clima_cd = df_clima_cd.rename(columns={'time': 'data', 'tavg': 'temperaturaMedia', 'tmin': 'temperaturaMinima',
                                         'tmax': 'temperaturaMaxima', 'prcp': 'chuva', 'snow': 'neve',
                                          'wdir': 'direcaoVento', 'wspd': 'velocidadeVento', 'wpgt': 'rajadaVento',
                                          'pres': 'pressaoAtmosferica', 'tsun': 'luzSolar'})

In [5]:
# Dropando as colunas que não serão utilizadas
# Lista de colunas a serem removidas
colunas_para_dropar = ['neve', 'luzSolar', 'rajadaVento']

# Dropando as colunas
df_clima_cd = df_clima_cd.drop(columns=colunas_para_dropar)

In [6]:
# Colunas do dataset
df_clima_cd.columns

Index(['data', 'temperaturaMedia', 'temperaturaMinima', 'temperaturaMaxima',
       'chuva', 'direcaoVento', 'velocidadeVento', 'pressaoAtmosferica'],
      dtype='object')

In [7]:
# Tipo dos dados
df_clima_cd.dtypes

data                  datetime64[ns]
temperaturaMedia             float64
temperaturaMinima            float64
temperaturaMaxima            float64
chuva                        float64
direcaoVento                 float64
velocidadeVento              float64
pressaoAtmosferica           float64
dtype: object

In [8]:
#Check NA
df_clima_cd.isna().sum()

data                     0
temperaturaMedia         0
temperaturaMinima        3
temperaturaMaxima        3
chuva                 1235
direcaoVento            36
velocidadeVento          3
pressaoAtmosferica     298
dtype: int64

In [9]:
# Calcular a porcentagem de valores ausentes em cada coluna
missing_percentages = (df_clima_cd.isnull().sum() / len(df_clima_cd)) * 100

missing_percentages

data                   0.000000
temperaturaMedia       0.000000
temperaturaMinima      0.086480
temperaturaMaxima      0.086480
chuva                 35.601038
direcaoVento           1.037763
velocidadeVento        0.086480
pressaoAtmosferica     8.590372
dtype: float64

In [10]:
#Visualizar algumas linhas dos valores faltantes
df_clima_cd[df_clima_cd.isnull().any(axis=1)].head(10)

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica
0,2015-01-01,31.0,24.0,40.0,,47.0,9.9,
1,2015-01-02,32.0,24.0,40.0,,224.0,7.8,
2,2015-01-03,29.1,26.0,33.0,,209.0,15.3,
3,2015-01-04,27.1,24.0,34.0,,165.0,8.6,
4,2015-01-05,25.1,22.0,33.0,,345.0,5.2,
5,2015-01-06,26.8,22.0,33.0,,103.0,7.1,
6,2015-01-07,30.2,24.0,36.0,,95.0,6.3,
7,2015-01-08,30.7,22.0,38.0,,79.0,7.5,
8,2015-01-09,31.2,23.0,38.0,,47.0,9.8,
9,2015-01-10,30.5,21.0,40.0,,103.0,8.1,


In [11]:
# Descobrindo a Média Móvel das colunas e completando os valores faltantes

window_size = 52

df_clima_cd['temperaturaMedia'].fillna(df_clima_cd['temperaturaMedia'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_cd['temperaturaMinima'].fillna(df_clima_cd['temperaturaMinima'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_cd['temperaturaMaxima'].fillna(df_clima_cd['temperaturaMaxima'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_cd['velocidadeVento'].fillna(df_clima_cd['velocidadeVento'].rolling(window=window_size, min_periods=1).mean(), inplace=True)
df_clima_cd['direcaoVento'].fillna(df_clima_cd['direcaoVento'].mean(), inplace=True)
df_clima_cd['pressaoAtmosferica'].fillna(df_clima_cd['pressaoAtmosferica'].mean(), inplace=True)
df_clima_cd['chuva'].fillna(0, inplace=True)

In [12]:
# Quantidade de valores faltantes do Dataset após alteração
df_clima_cd.isnull().sum()

data                  0
temperaturaMedia      0
temperaturaMinima     0
temperaturaMaxima     0
chuva                 0
direcaoVento          0
velocidadeVento       0
pressaoAtmosferica    0
dtype: int64

In [13]:
df_clima_cd.head()

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica
0,2015-01-01,31.0,24.0,40.0,0.0,47.0,9.9,1015.521917
1,2015-01-02,32.0,24.0,40.0,0.0,224.0,7.8,1015.521917
2,2015-01-03,29.1,26.0,33.0,0.0,209.0,15.3,1015.521917
3,2015-01-04,27.1,24.0,34.0,0.0,165.0,8.6,1015.521917
4,2015-01-05,25.1,22.0,33.0,0.0,345.0,5.2,1015.521917


## REALIZAR O AGRUPAMENTO A PARTIR DA SEMANA

In [14]:
# week of year
df_clima_cd['semana_num'] = df_clima_cd['data'].dt.strftime('%Y-%U')
df_clima_cd

Unnamed: 0,data,temperaturaMedia,temperaturaMinima,temperaturaMaxima,chuva,direcaoVento,velocidadeVento,pressaoAtmosferica,semana_num
0,2015-01-01,31.0,24.0,40.0,0.0,47.0,9.9,1015.521917,2015-00
1,2015-01-02,32.0,24.0,40.0,0.0,224.0,7.8,1015.521917,2015-00
2,2015-01-03,29.1,26.0,33.0,0.0,209.0,15.3,1015.521917,2015-00
3,2015-01-04,27.1,24.0,34.0,0.0,165.0,8.6,1015.521917,2015-01
4,2015-01-05,25.1,22.0,33.0,0.0,345.0,5.2,1015.521917,2015-01
...,...,...,...,...,...,...,...,...,...
3464,2024-06-26,25.1,19.9,33.0,0.0,338.0,6.2,1014.600000,2024-25
3465,2024-06-27,23.6,21.4,27.0,1.8,258.0,9.2,1013.600000,2024-25
3466,2024-06-28,25.3,19.3,34.0,0.0,20.0,7.5,1012.500000,2024-25
3467,2024-06-29,24.4,20.0,31.0,0.9,281.0,9.9,1014.200000,2024-25


In [15]:
# Crie um intervalo de datas que cubra todos os anos e semanas desejados
start_date = pd.to_datetime('2015-01-01')  # Data inicial desejada
end_date = pd.to_datetime('2024-06-30')    # Data final desejada
date_range = pd.date_range(start=start_date, end=end_date, freq='W-SAT')  # Intervalo de datas semanais

In [16]:
# Crie um DataFrame com todas as semanas desejadas
all_weeks = pd.DataFrame({'data': date_range})
all_weeks['semana_num'] = all_weeks['data'].dt.strftime('%Y-%U')

In [17]:
# Junte os dados existentes com o DataFrame de todas as semanas
df_merged = all_weeks.merge(df_clima_cd, on='semana_num', how='left')

In [18]:
# Junta os dados usando a data do domingo como representação da semana
df_clima_final = df_merged.groupby('semana_num').agg({
    'chuva': 'sum',
    'temperaturaMedia': 'mean',
    'temperaturaMaxima': 'max',
    'temperaturaMinima': 'min',
    'direcaoVento': 'mean',
    'velocidadeVento': 'mean',
    'pressaoAtmosferica': 'mean'
}).reset_index()

In [19]:
# Crie uma coluna 'ano' e 'semana_num' a partir da coluna 'semana'
df_clima_final['ano'] = df_clima_final['semana_num'].str[:4]
df_clima_final['semana'] = df_clima_final['semana_num'].str[5:].astype(int)

In [20]:
def get_sunday_date(row):
    year = int(row['semana_num'][:4])
    week_num = int(row['semana_num'][5:])
    sunday = pd.to_datetime(f'{year}-W{week_num}-0', format='%Y-W%W-%w')
    return sunday

In [21]:
df_clima_final['data'] = df_clima_final.apply(get_sunday_date, axis=1)
df_clima_final.head(60)

Unnamed: 0,semana_num,chuva,temperaturaMedia,temperaturaMaxima,temperaturaMinima,direcaoVento,velocidadeVento,pressaoAtmosferica,ano,semana,data
0,2015-00,0.0,30.7,40.0,24.0,160.0,11.0,1015.521917,2015,0,2015-01-04
1,2015-01,0.0,28.8,40.0,21.0,133.857143,7.514286,1015.521917,2015,1,2015-01-11
2,2015-02,0.0,30.1,40.0,22.0,218.428571,10.871429,1015.521917,2015,2,2015-01-18
3,2015-03,0.0,28.157143,38.0,22.0,221.136948,9.985714,1015.521917,2015,3,2015-01-25
4,2015-04,0.0,28.257143,37.0,21.0,222.851234,9.742857,1015.521917,2015,4,2015-02-01
5,2015-05,0.0,25.514286,34.0,20.0,204.285714,8.442857,1015.521917,2015,5,2015-02-08
6,2015-06,0.0,28.842857,38.0,20.0,178.0,9.028571,1015.521917,2015,6,2015-02-15
7,2015-07,0.0,27.785714,37.0,21.0,204.428571,7.8,1015.521917,2015,7,2015-02-22
8,2015-08,0.0,28.342857,36.0,21.0,161.279805,10.285714,1015.521917,2015,8,2015-03-01
9,2015-09,0.0,27.057143,36.0,21.0,148.422662,10.557143,1015.521917,2015,9,2015-03-08


In [22]:
# Dropando as colunas que não serão utilizadas
# Lista de colunas a serem removidas
colunas_para_dropar2 = ['semana_num', 'ano']

# Dropando as colunas
df_clima_final = df_clima_final.drop(columns=colunas_para_dropar2)
df_clima_final

Unnamed: 0,chuva,temperaturaMedia,temperaturaMaxima,temperaturaMinima,direcaoVento,velocidadeVento,pressaoAtmosferica,semana,data
0,0.0,30.700000,40.0,24.0,160.000000,11.000000,1015.521917,0,2015-01-04
1,0.0,28.800000,40.0,21.0,133.857143,7.514286,1015.521917,1,2015-01-11
2,0.0,30.100000,40.0,22.0,218.428571,10.871429,1015.521917,2,2015-01-18
3,0.0,28.157143,38.0,22.0,221.136948,9.985714,1015.521917,3,2015-01-25
4,0.0,28.257143,37.0,21.0,222.851234,9.742857,1015.521917,4,2015-02-01
...,...,...,...,...,...,...,...,...,...
491,22.8,21.200000,33.0,16.3,195.000000,9.728571,1020.828571,21,2024-05-26
492,7.9,22.285714,31.0,16.2,230.714286,7.185714,1021.585714,22,2024-06-02
493,0.0,23.414286,34.0,15.4,173.285714,7.500000,1018.357143,23,2024-06-09
494,0.0,23.885714,35.0,14.0,237.000000,7.971429,1016.342857,24,2024-06-16
