In [None]:
# ## Descrição

 
# Este case consiste em um exercício prático de construção de um modelo e de extração de insights a partir dos dados.

# Desenhamos o case para que você possa mostrar as suas habilidades como cientista de dados.

# O conjunto de dados que fornecemos contém observações diárias do clima de algumas estações meteorológicas na Austrália.
 
# Os dados estão organizados em duas tabelas:

# - `rain_data_aus.csv`: Contém a maior parte das informações, já centralizadas, de todas as estações.

# - `wind_table_01.csv` a `wind_table_08.csv`: Contêm informações sobre velocidade e orientação de ventos.


# As tabelas estão apartadas, pois são originadas de um outro instrumento e salvas em um sistema apartado.

 

# A descrição das colunas se encontra no arquivo `data_dictionary.md`.

# Submeta os arquivos em um repositório no git e nos envie o link para avaliação.

# Idealmente, queremos poder replicar sua análise a partir dos códigos enviados. Portanto, especifique as versões das ferramentas e pacotes que você está usando.


# ## Objetivo:

# Construir um modelo preditivo para determinar se vai ou não chover amanhã. 

In [41]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import precision_score, recall_score, roc_auc_score, plot_roc_curve
from sklearn.model_selection import train_test_split
warnings.simplefilter("ignore")

In [11]:
# Load the consolidated daily weather observations (one row per station per day).
rain = pd.read_csv(filepath_or_buffer='data/rain_data_aus.csv')

In [5]:
# Preview the first five rows of the raw table.
rain.head(n=5)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,temp9am,temp3pm,raintoday,amountOfRain,raintomorrow,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
0,2008-12-01,Albury,13.4,22.9,0.6,,,71.0,22.0,1007.7,...,16.9,21.8,No,0.0,No,29.48,28.4,12,5.11536,0.089825
1,2008-12-02,Albury,7.4,25.1,0.0,,,44.0,25.0,1010.6,...,17.2,24.3,No,0.0,No,32.12,2.208569,10,21.4971,0.023477
2,2008-12-03,Albury,12.9,25.7,0.0,,,38.0,30.0,1007.6,...,21.0,23.2,No,0.0,No,32.84,38.0,17,20.782859,0.02758
3,2008-12-04,Albury,9.2,28.0,0.0,,,45.0,16.0,1017.6,...,18.1,26.5,No,1.0,No,35.6,21.2,8,12.028646,0.023962
4,2008-12-05,Albury,17.5,32.3,1.0,,,82.0,33.0,1010.8,...,17.8,29.7,No,0.2,No,40.76,41.6,9,11.883546,0.220164


In [12]:
# Dimensions of the raw table as (rows, columns).
rain.shape

(142193, 23)

In [15]:
# List the column names of the loaded table.
rain.columns

Index(['date', 'location', 'mintemp', 'maxtemp', 'rainfall', 'evaporation',
       'sunshine', 'humidity9am', 'humidity3pm', 'pressure9am', 'pressure3pm',
       'cloud9am', 'cloud3pm', 'temp9am', 'temp3pm', 'raintoday',
       'amountOfRain', 'raintomorrow', 'temp', 'humidity', 'precipitation3pm',
       'precipitation9am', 'modelo_vigente'],
      dtype='object')

In [20]:
# Per-column dtype and non-null count; shows which columns have missing values.
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   date              142193 non-null  object 
 1   location          142193 non-null  object 
 2   mintemp           141556 non-null  float64
 3   maxtemp           141871 non-null  float64
 4   rainfall          140787 non-null  float64
 5   evaporation       81350 non-null   float64
 6   sunshine          74377 non-null   float64
 7   humidity9am       140419 non-null  float64
 8   humidity3pm       138583 non-null  float64
 9   pressure9am       128179 non-null  float64
 10  pressure3pm       128212 non-null  float64
 11  cloud9am          88536 non-null   float64
 12  cloud3pm          85099 non-null   float64
 13  temp9am           141289 non-null  float64
 14  temp3pm           139467 non-null  float64
 15  raintoday         140787 non-null  object 
 16  amountOfRain      14

In [None]:
# Variable | Description
# ---------|------------
# Date   |  The date of observation
# Location   |  The common name of the location of the weather station
# MinTemp   |  The minimum temperature in degrees celsius
# MaxTemp   |  The maximum temperature in degrees celsius
# Rainfall   |  The amount of rainfall recorded for the day in mm
# Evaporation   |  The so-called Class A pan evaporation (mm) in the 24 hours to 9am
# Sunshine   |  The number of hours of bright sunshine in the day.
# WindGustDir   |  The direction of the strongest wind gust in the 24 hours to midnight
# WindGustSpeed   |  The speed (km/h) of the strongest wind gust in the 24 hours to midnight
# WindDir9am   |  Direction of the wind at 9am
# WindDir3pm   |  Direction of the wind at 3pm
# WindSpeed9am   |  Wind speed (km/hr) averaged over 10 minutes prior to 9am
# WindSpeed3pm   |  Wind speed (km/hr) averaged over 10 minutes prior to 3pm
# Humidity9am   |  Humidity (percent) at 9am
# Humidity3pm   |  Humidity (percent) at 3pm
# Pressure9am   |  Atmospheric pressure (hpa) reduced to mean sea level at 9am
# Pressure3pm   |  Atmospheric pressure (hpa) reduced to mean sea level at 3pm
# Cloud9am   |  Fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eighths. It records how many eighths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast.
# Cloud3pm | Fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cloud9am for a description of the values
# Temp9am |  Temperature (degrees C) at 9am
# Temp3pm |  Temperature (degrees C) at 3pm
# Precipitation9am |  The amount of rain in mm prior to 9am
# Precipitation3pm |  The amount of rain in mm prior to 3pm
# AmountOfRain |  The amount of rain in mm
# Temp |  Temperature (degrees C)
# Humidity |  Humidity (percent)
# RainToday |  Boolean: 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0
# RainTomorrow |  The target variable. Did it rain tomorrow?

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
rain.describe()

Unnamed: 0,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,amountOfRain,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
count,141556.0,141871.0,140787.0,81350.0,74377.0,140419.0,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0,142193.0,141871.0,138583.0,142193.0,142193.0,142193.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,2.360682,28.505419,61.991179,10.014164,10.000748,0.227804
std,6.403283,7.117618,8.465173,4.188537,3.781525,19.051293,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594,8.477969,10.237506,26.649111,3.169832,4.997908,0.272764
min,-8.5,-4.8,0.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0,-3.76,2.0,0.0,-17.739346,0.0
25%,7.6,17.9,0.0,2.6,4.9,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,0.0,22.52,44.0,8.0,6.650238,0.030055
50%,12.0,22.6,0.0,4.8,8.5,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,0.0,28.52,63.2,10.0,10.000009,0.100853
75%,16.8,28.2,0.8,7.4,10.6,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,0.8,35.48,80.0,12.0,13.389306,0.329507
max,33.9,48.1,371.0,145.0,14.5,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,371.0,59.72,122.0,26.0,32.47859,0.9994


In [25]:
# Number of distinct weather stations in the table (49 in this dataset).
rain['location'].nunique(dropna=False)

49

In [40]:
# Distinct observation dates, in order of first appearance.
# Author's note: 3436 distinct dates, '2007-11-01' to '2017-06-25', i.e. about
# nine and a half years of data (count taken from the array's size).
pd.unique(rain['date'])

array(['2008-12-01', '2008-12-02', '2008-12-03', ..., '2008-01-29',
       '2008-01-30', '2008-01-31'], dtype=object)

In [41]:
# Pairwise Pearson correlation between the numeric columns.
# The numeric columns are selected explicitly because recent pandas versions no
# longer drop string columns (date, location, ...) silently and raise instead.
rain.select_dtypes(include='number').corr()

Unnamed: 0,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,amountOfRain,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
mintemp,1.0,0.736267,0.104255,0.467261,0.072961,-0.234211,0.005999,-0.45126,-0.461623,0.077625,0.020489,0.901813,0.708865,0.124743,0.584512,0.004145,0.0013,-0.001554,0.134768
maxtemp,0.736267,1.0,-0.074839,0.588915,0.469967,-0.505432,-0.50927,-0.332293,-0.427279,-0.289865,-0.279053,0.88702,0.984562,-0.044208,0.794183,-0.463989,0.001191,-0.000918,-0.235871
rainfall,0.104255,-0.074839,1.0,-0.064549,-0.227525,0.223725,0.255312,-0.168085,-0.126728,0.198195,0.171993,0.011477,-0.079178,0.308557,-0.059987,0.232724,-0.00241,-0.002792,0.361186
evaporation,0.467261,0.588915,-0.064549,1.0,0.366607,-0.50589,-0.392785,-0.269907,-0.29316,-0.185032,-0.184287,0.545497,0.574275,-0.043498,0.465288,-0.355983,0.004543,-0.002419,-0.164926
sunshine,0.072961,0.469967,-0.227525,0.366607,1.0,-0.491603,-0.629122,0.040959,-0.020464,-0.67561,-0.704202,0.291139,0.49018,-0.294973,0.369367,-0.571486,-0.00383,0.003155,-0.639583
humidity9am,-0.234211,-0.505432,0.223725,-0.50589,-0.491603,1.0,0.667388,0.139519,0.186955,0.452182,0.358043,-0.472826,-0.499777,0.172417,-0.400695,0.606853,-0.001347,0.002721,0.393754
humidity3pm,0.005999,-0.50927,0.255312,-0.392785,-0.629122,0.667388,1.0,-0.027449,0.05184,0.517037,0.52327,-0.221467,-0.557989,0.313183,-0.404326,0.908871,-0.000707,0.000464,0.662753
pressure9am,-0.45126,-0.332293,-0.168085,-0.269907,0.040959,0.139519,-0.027449,1.0,0.961348,-0.130081,-0.148139,-0.422773,-0.287301,-0.163673,-0.261068,-0.025464,-0.00176,0.001572,-0.361402
pressure3pm,-0.461623,-0.427279,-0.126728,-0.29316,-0.020464,0.186955,0.05184,0.961348,1.0,-0.061152,-0.084963,-0.470325,-0.389863,-0.164184,-0.335831,0.046812,-0.001651,0.001973,-0.316223
cloud9am,0.077625,-0.289865,0.198195,-0.185032,-0.67561,0.452182,0.517037,-0.130081,-0.061152,1.0,0.604118,-0.137843,-0.30252,0.198095,-0.227987,0.469853,-0.001553,0.000611,0.475929


Separação das variáveis X e y

# Split the table into the feature matrix X and the target vector y.
# `rain` is the DataFrame loaded from data/rain_data_aus.csv.
X = rain.drop(columns=['raintomorrow'])
y = rain['raintomorrow']

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible. Stratifying on y keeps the rain / no-rain proportion the same in
# the train and test sets.
# NOTE(review): stratify requires y to have no missing values; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Transformar 'Yes' e 'No' em 1 e 0

In [None]:
# Data leakage!
# Check the `modelo_vigente` column.

In [6]:
# Look at the first five rows again before recoding the Yes/No columns.
rain.head(n=5)

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,...,temp9am,temp3pm,raintoday,amountOfRain,raintomorrow,temp,humidity,precipitation3pm,precipitation9am,modelo_vigente
0,2008-12-01,Albury,13.4,22.9,0.6,,,71.0,22.0,1007.7,...,16.9,21.8,No,0.0,No,29.48,28.4,12,5.11536,0.089825
1,2008-12-02,Albury,7.4,25.1,0.0,,,44.0,25.0,1010.6,...,17.2,24.3,No,0.0,No,32.12,2.208569,10,21.4971,0.023477
2,2008-12-03,Albury,12.9,25.7,0.0,,,38.0,30.0,1007.6,...,21.0,23.2,No,0.0,No,32.84,38.0,17,20.782859,0.02758
3,2008-12-04,Albury,9.2,28.0,0.0,,,45.0,16.0,1017.6,...,18.1,26.5,No,1.0,No,35.6,21.2,8,12.028646,0.023962
4,2008-12-05,Albury,17.5,32.3,1.0,,,82.0,33.0,1010.8,...,17.8,29.7,No,0.2,No,40.76,41.6,9,11.883546,0.220164


In [18]:
# Recode the `raintoday` flag in place: 'No' -> 0, 'Yes' -> 1.
# Rows holding anything else (e.g. missing values) are left untouched.
# NOTE(review): the column presumably stays object dtype after this; confirm and
# cast before modeling.
rain.loc[rain['raintoday'].eq('No'), 'raintoday'] = 0
rain.loc[rain['raintoday'].eq('Yes'), 'raintoday'] = 1

In [23]:
# Recode the target `raintomorrow` in place: 'No' -> 0, 'Yes' -> 1.
# Rows holding anything else are left untouched.
# NOTE(review): the column presumably stays object dtype after this; confirm and
# cast to an integer dtype before fitting a classifier.
rain.loc[rain['raintomorrow'].eq('No'), 'raintomorrow'] = 0
rain.loc[rain['raintomorrow'].eq('Yes'), 'raintomorrow'] = 1

In [26]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

In [31]:
# Remove the `modelo_vigente` column from the working table.
# errors='ignore' keeps the cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
rain = rain.drop(columns=['modelo_vigente'], errors='ignore')

In [None]:
# muitas coisas estranhas:
# ######nans on the raintoday column
# tem dias que raintoday = 0, mas amountOfRain != 0
# pior ainda, tem dias que raintoday = 1, mas amountOfRain = 0
# essa coluna raintoday é bizarra, acho que vou tirar ela de início..

In [37]:
# Keep only the rows with no missing value in any column; `rain` itself is not modified.
rain_all_nans_dropped = rain.dropna(axis=0, how='any')

In [36]:
# Shape of the working table, for comparison with the NaN-free copy.
rain.shape

(142193, 22)

In [38]:
# Shape after dropping every row that has at least one missing value.
rain_all_nans_dropped.shape

(61918, 22)

In [40]:
# Preview the first five complete rows.
rain_all_nans_dropped.head()

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,evaporation,sunshine,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,raintoday,amountOfRain,raintomorrow,temp,humidity,precipitation3pm,precipitation9am
5939,2009-01-01,Cobar,17.9,35.2,0.0,12.0,12.3,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0.0,0,44.24,17.6,5,15.165784
5940,2009-01-02,Cobar,18.4,28.9,0.0,14.8,13.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0.0,0,36.68,11.6,9,9.750834
5942,2009-01-04,Cobar,19.4,37.6,0.0,10.8,10.6,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0.0,0,47.12,28.4,11,5.557986
5943,2009-01-05,Cobar,21.9,38.4,0.0,11.4,12.2,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0.0,0,48.08,28.4,10,17.820001
5944,2009-01-06,Cobar,24.2,41.0,0.0,11.2,8.4,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0.0,0,51.2,20.0,9,13.217116
