In [1]:
# récupération du package scikit-plot
!pip install scikit-plot



In [2]:
# récupération des librairies
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time, datetime
import scikitplot as skplt

from scipy.spatial.distance import cdist
from scipy.stats import chi2_contingency

from sklearn import ensemble, linear_model, model_selection, neighbors, preprocessing, svm

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso, LassoCV, lasso_path, RidgeCV
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, mean_squared_error, recall_score, f1_score, precision_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, learning_curve, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler

In [8]:
# récupération du jeu de données sauvegardé
df = pd.read_csv('../data/weatherAUS_preprocessed_before_city_clust_with_temp.csv')
df.head()

Unnamed: 0,Date,Location,MaxTemp,Rainfall,Sunshine,WindGustSpeed,Humidity3pm,Pressure9am,Cloud3pm,Temp3pm,RainToday,RainTomorrow,Temp_Delta_MinMax,Humidity_Delta
0,2008-12-01,Albury,22.9,0.6,0.15,44.0,22.0,1007.7,8.0,21.8,0,0,9.5,-49.0
1,2008-12-02,Albury,25.1,0.0,7.63054,44.0,25.0,1010.6,4.49925,24.3,0,0,17.7,-19.0
2,2008-12-03,Albury,25.7,0.0,11.7,46.0,30.0,1007.6,2.0,23.2,0,0,12.8,-8.0
3,2008-12-04,Albury,28.0,0.0,7.63054,24.0,16.0,1017.6,4.49925,26.5,0,0,18.8,-29.0
4,2008-12-05,Albury,32.3,1.0,2.55,41.0,33.0,1010.8,8.0,29.7,0,0,14.8,-49.0


In [9]:
df.columns

Index(['Date', 'Location', 'MaxTemp', 'Rainfall', 'Sunshine', 'WindGustSpeed',
       'Humidity3pm', 'Pressure9am', 'Cloud3pm', 'Temp3pm', 'RainToday',
       'RainTomorrow', 'Temp_Delta_MinMax', 'Humidity_Delta'],
      dtype='object')

In [10]:
# Deal with date
index_date = pd.to_datetime(df['Date'])
df = df.set_index(index_date)

In [11]:
# Create dictionary grouping by Location prior to creation of new column for temp next day
d = dict(tuple(df.groupby('Location')))

In [12]:
#Create new column of temp for next day -  by taking following days temp
# Can change reach to future by changing freq='-1d'
place_dfs = []
for place in d:
  d[place]['MaxTempTomorrow'] = d[place].MaxTemp.shift(freq='-1d')
  place_dfs.append(d[place])

In [13]:
# Recombine all sepeate locations into one DF
df_pre_temp = pd.concat(place_dfs)

In [14]:
# expect some nans due to next day data not being available
df_pre_temp.isna().sum()

Date                    0
Location                0
MaxTemp                 0
Rainfall                0
Sunshine                0
WindGustSpeed           0
Humidity3pm             0
Pressure9am             0
Cloud3pm                0
Temp3pm                 0
RainToday               0
RainTomorrow            0
Temp_Delta_MinMax       0
Humidity_Delta          0
MaxTempTomorrow      1505
dtype: int64

In [15]:
# Delete lines with Nans
df_pre_temp.dropna(inplace = True)

In [16]:
df_pre_temp.isna().sum()

Date                 0
Location             0
MaxTemp              0
Rainfall             0
Sunshine             0
WindGustSpeed        0
Humidity3pm          0
Pressure9am          0
Cloud3pm             0
Temp3pm              0
RainToday            0
RainTomorrow         0
Temp_Delta_MinMax    0
Humidity_Delta       0
MaxTempTomorrow      0
dtype: int64

In [18]:
aus_town_gps = pd.read_csv("../data/aus_town_gps.csv",sep=",")
climatsaus = pd.read_csv("../data/climatsAUS_v2.csv",sep=";")
climats_type = {'Am':'chaud_humide',
                'Aw':'chaud_humide',
                'Cfa':'chaud_humide',
                'Cfb':'tempéré_froid', 
                'Cfc':'tempéré_froid', 
                'BSh':'sec',
                'BSk':'sec',
                'Bsk':'sec', 
                'Bwh':'sec',
                'Csa':'méditerranéen',
                'Csb':'méditerranéen'              
               }

climatsaus['Clim_type']=climatsaus['Climat_Koppen'].map(climats_type)

df_pre_temp = pd.merge(df_pre_temp, aus_town_gps, how='left', left_on="Location",right_on="Location")
df_pre_temp = pd.merge(df_pre_temp, climatsaus, how='left', left_on="Location",right_on="Location")

In [19]:
clim_indic = pd.get_dummies(df_pre_temp.Clim_type, prefix='clim')
df_pre_temp = df_pre_temp.join(clim_indic).drop('Clim_type', axis=1)

In [20]:
#Deal with date 
df_pre_temp['year'] = pd.to_datetime(df_pre_temp['Date']).dt.year
df_pre_temp['month'] = pd.to_datetime(df_pre_temp['Date']).dt.month
df_pre_temp['day'] = pd.to_datetime(df_pre_temp['Date']).dt.day

In [21]:
df_pre_temp.drop(columns=['year','day','Date'], inplace = True)

In [22]:
#Split months into 4 seasons
seasons_type = {1:'ete',
                2:'ete',
                3:'automne',
                4:'automne', 
                5:'automne', 
                6:'hiver',
                7:'hiver',
                8:'hiver', 
                9:'primtemps',
                10:'primtemps',
                11:'primtemps',
                12:'ete'              
               }

df_pre_temp['Season']=df_pre_temp['month'].map(seasons_type)

In [23]:
# get dummies for seasons
season_indic = pd.get_dummies(df_pre_temp.Season, prefix='Season')
df_pre_temp = df_pre_temp.join(season_indic).drop('Season', axis=1)

In [25]:
# split wind dir into 4 compass points rather than 16
#compass_points = {'N':'NW',
#                'NNW':'NW',
#                'WNW':'NW',
#                'W':'SW', 
#                'WSW':'SW', 
#                'SSW':'SW',
#                'S':'SE',
#                'SSE':'SE', 
#                'ESE':'SE',
#                'ENE':'NE',
#                'E':'NE',
#                'NNE':'NE'              
#               }

#df_pre_temp['WindDir3pm']=df_pre_temp['WindDir3pm'].map(compass_points)

In [20]:
# Get dummies for wind dir 4 compass points
#winddir_indic = pd.get_dummies(df_pre_temp.WindDir3pm, prefix='WindDir')
#df_pre_temp = df_pre_temp.join(winddir_indic).drop('WindDir3pm', axis=1)

In [26]:
#clean features
df_pre_temp.drop(columns=[ 'Latitude', 
                 'Longitude', 'Climat_Koppen', 'month', 'Location'], inplace = True)

In [27]:
#Create target and features
target = df_pre_temp.MaxTempTomorrow
data = df_pre_temp.drop('MaxTempTomorrow', axis = 1)

In [28]:
scaler = StandardScaler()

In [29]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

In [30]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Pick grad_boost for initial try - fastish with decent results
grad_boost = GradientBoostingRegressor()

grad_boost.fit(X_train_scaled, y_train)

y_pred = grad_boost.predict(X_test_scaled)

print('Train Data Score: {}'.format(grad_boost.score(X_train_scaled,y_train)))
print('Test Data Score: {}'.format(grad_boost.score(X_test_scaled,y_test)))



Train Data Score: 0.8281676287524122
Test Data Score: 0.8270172788963379


Ce notebook est cense d'etre un 'proof of concept': 
- est-ce que c'est possible: oui, 
- est-ce que c'est interessant a faire: le score present bien
- est-ce que c'est possible da'ller plus loin: oui avec un etude des autre modeles et aussi pour les forcasts de plus d'un jour.