In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Read the two input CSVs. In each frame the DATE column is converted to
# datetimes and then promoted to the index.
df1 = (
    pd.read_csv('../input/wind_dataset_imputed1.csv')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)

df2 = (
    pd.read_csv('../input/wind_dataset_imputed2_regresija.csv')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)

In [3]:
# Columns that get min-max scaled further down in the notebook.
continuous_features = [
    "WIND",
    "RAIN",
    "T.MAX",
    "T.MIN",
    "T.MIN.G",
]

# Columns that get one-hot encoded further down in the notebook.
discrete_features = [
    "IND",
    "IND.1",
    "IND.2",
]

In [4]:
# Fit a separate MinMaxScaler on each frame's continuous columns, then
# overwrite those columns with the scaled values. The fitted scalers are
# kept in scaler1 / scaler2 because a later cell saves them to disk.
scaler1 = MinMaxScaler().fit(df1[continuous_features])
df1[continuous_features] = scaler1.transform(df1[continuous_features])

scaler2 = MinMaxScaler().fit(df2[continuous_features])
df2[continuous_features] = scaler2.transform(df2[continuous_features])

In [5]:
# Save the fitted scalers for later use;
# joblib.load("<path to scaler>") restores a saved scaler.
import os

import joblib

# joblib.dump opens the target file for writing and fails with
# FileNotFoundError when the parent directory is missing, so create it first.
os.makedirs('../data', exist_ok=True)

joblib.dump(scaler1, '../data/scaler1.gz')

# Last expression of the cell: the returned list of written filenames is
# what the notebook displays.
joblib.dump(scaler2, '../data/scaler2.gz')

['./data/scaler2.gz']

In [6]:
# Preview the first five rows of df1 after scaling.
df1.head(n=5)

Unnamed: 0_level_0,WIND,IND,RAIN,IND.1,T.MAX,IND.2,T.MIN,T.MIN.G
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1961-01-01,0.450115,0,0.002985,0.0,0.356877,0.0,0.515254,0.443709
1961-01-02,0.378663,0,0.076119,0.0,0.271375,0.0,0.532203,0.513245
1961-01-03,0.370431,0,0.00597,0.0,0.208178,0.0,0.40678,0.460265
1961-01-04,0.284162,0,0.002985,0.0,0.211896,0.0,0.40339,0.370861
1961-01-05,0.392493,0,0.155224,0.0,0.271375,1.0,0.338983,0.228477


In [7]:
# Preview the first five rows of df2 after scaling.
df2.head(n=5)

Unnamed: 0_level_0,WIND,IND,RAIN,IND.1,T.MAX,IND.2,T.MIN,T.MIN.G
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1961-01-01,0.450115,0,0.002985,0.0,0.356877,0.0,0.515254,0.443709
1961-01-02,0.378663,0,0.076119,0.0,0.271375,0.0,0.532203,0.513245
1961-01-03,0.370431,0,0.00597,0.0,0.208178,0.0,0.40678,0.460265
1961-01-04,0.284162,0,0.002985,0.0,0.211896,0.0,0.40339,0.370861
1961-01-05,0.392493,0,0.155224,0.0,0.271375,1.0,0.338983,0.228477


In [8]:
# One-hot encode the discrete columns of both frames.
#
# pd.get_dummies(frame, columns=...) drops each listed column and appends its
# dummy columns after the remaining ones, in the order the columns are listed.
# Each dummy is named "<column>_<value>" (e.g. IND_0, IND.1_0.0), which matches
# the names the previous drop/join code produced by hand.
#
# Encoding on the frame directly also avoids joining the dummies back on the
# DATE index, which would multiply rows if the index contained duplicates.
#
# NOTE(review): each frame is encoded independently, so df1 and df2 only end
# up with the same dummy columns if they contain the same set of values.
df1 = pd.get_dummies(df1, columns=discrete_features, prefix_sep='_')
df2 = pd.get_dummies(df2, columns=discrete_features, prefix_sep='_')

In [9]:
# Preview the first five rows of df1 after one-hot encoding.
df1.head(n=5)

Unnamed: 0_level_0,WIND,RAIN,T.MAX,T.MIN,T.MIN.G,IND_0,IND_1,IND_4,IND.1_0.0,IND.1_1.0,IND.1_2.0,IND.1_4.0,IND.2_0.0,IND.2_1.0,IND.2_2.0,IND.2_3.0,IND.2_4.0
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1961-01-01,0.450115,0.002985,0.356877,0.515254,0.443709,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-02,0.378663,0.076119,0.271375,0.532203,0.513245,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-03,0.370431,0.00597,0.208178,0.40678,0.460265,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-04,0.284162,0.002985,0.211896,0.40339,0.370861,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-05,0.392493,0.155224,0.271375,0.338983,0.228477,1,0,0,1,0,0,0,0,1,0,0,0


In [10]:
# Preview the first five rows of df2 after one-hot encoding.
df2.head(n=5)

Unnamed: 0_level_0,WIND,RAIN,T.MAX,T.MIN,T.MIN.G,IND_0,IND_1,IND_4,IND.1_0.0,IND.1_1.0,IND.1_2.0,IND.1_4.0,IND.2_0.0,IND.2_1.0,IND.2_2.0,IND.2_3.0,IND.2_4.0
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1961-01-01,0.450115,0.002985,0.356877,0.515254,0.443709,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-02,0.378663,0.076119,0.271375,0.532203,0.513245,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-03,0.370431,0.00597,0.208178,0.40678,0.460265,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-04,0.284162,0.002985,0.211896,0.40339,0.370861,1,0,0,1,0,0,0,1,0,0,0,0
1961-01-05,0.392493,0.155224,0.271375,0.338983,0.228477,1,0,0,1,0,0,0,0,1,0,0,0


In [11]:
# Write both processed frames to CSV; the DATE index is written as the
# first column (to_csv default).
for frame, target in (
    (df1, '../input/wind_dataset_imputed_and_scaled1.csv'),
    (df2, '../input/wind_dataset_imputed_and_scaled2.csv'),
):
    frame.to_csv(target)