In [46]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",500)

In [47]:
#weather data
weather = pd.read_csv('./data/weather.csv')

In [48]:
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [49]:
#not useful for modeling reasons, just here as a legend.

dummy_meanings = {
'+FC' : 'TORNADO/WATERSPOUT',
 'FC' : 'FUNNEL CLOUD',
 'TS' : 'THUNDERSTORM',
 'GR' : 'HAIL',
 'RA' : 'RAIN',
 'DZ' : 'DRIZZLE',
 'SN' : 'SNOW',
 'SG' : 'SNOW GRAINS',
 'GS' : 'SMALL HAIL &/OR SNOW PELLETS',
 'PL' : 'ICE PELLETS',
 'IC' : 'ICE CRYSTALS',
 'FG+' : 'HEAVY FOG (FG & LE.25 MILES VISIBILITY)',
 'FG' : 'FOG',
 'BR' : 'MIST',
 'UP' : 'UNKNOWN PRECIPITATION',
 'HZ' : 'HAZE',
 'FU' : 'SMOKE',
 'VA' : 'VOLCANIC ASH',
 'DU' : 'WIDESPREAD DUST',
 'DS' : 'DUSTSTORM',
 'PO' : 'SAND/DUST WHIRLS',
 'SA' : 'SAND',
 'SS' : 'SANDSTORM',
 'PY' : 'SPRAY',
 'SQ' : 'SQUALL',
 'DR' : 'LOW DRIFTING',
 'SH' : 'SHOWER',
 'FZ' : 'FREEZING',
 'MI' : 'SHALLOW',
 'PR' : 'PARTIAL',
 'BC' : 'PATCHES',
 'BL' : 'BLOWING',
 'VC' : 'VICINITY'
}

In [50]:
conditions_list = ['+FC', 'FC',  'TS',  'GR', 'RA','DZ', 'SN','SG', 'GS', 'PL', 'IC',
                   'FG+','FG', 'BR', 'UP', 'HZ', 'FU', 'VA', 'DU', 'DS', 'PO', 'SA', 
                   'SS', 'PY', 'SQ', 'DR','SH', 'FZ', 'MI', 'PR', 'BC', 'BL', 'VC'] 

In [51]:
condition_dummys = {}
for condition in conditions_list:
    condition_dummys[f'{condition}_dummy'] = []
    for cell in weather.CodeSum:
        if condition in cell:
            condition_dummys[f'{condition}_dummy'].append(1)
        else:
            condition_dummys[f'{condition}_dummy'].append(0)

In [52]:
cond_dummys = pd.DataFrame.from_dict(condition_dummys)
cond_dummys.head()

Unnamed: 0,+FC_dummy,FC_dummy,TS_dummy,GR_dummy,RA_dummy,DZ_dummy,SN_dummy,SG_dummy,GS_dummy,PL_dummy,IC_dummy,FG+_dummy,FG_dummy,BR_dummy,UP_dummy,HZ_dummy,FU_dummy,VA_dummy,DU_dummy,DS_dummy,PO_dummy,SA_dummy,SS_dummy,PY_dummy,SQ_dummy,DR_dummy,SH_dummy,FZ_dummy,MI_dummy,PR_dummy,BC_dummy,BL_dummy,VC_dummy
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [53]:
weather = pd.concat([weather, cond_dummys], axis=1)

In [54]:
weather.drop(['CodeSum'], axis=1, inplace=True)

In [55]:
weather.head(4)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,+FC_dummy,FC_dummy,TS_dummy,GR_dummy,RA_dummy,DZ_dummy,SN_dummy,SG_dummy,GS_dummy,PL_dummy,IC_dummy,FG+_dummy,FG_dummy,BR_dummy,UP_dummy,HZ_dummy,FU_dummy,VA_dummy,DU_dummy,DS_dummy,PO_dummy,SA_dummy,SS_dummy,PY_dummy,SQ_dummy,DR_dummy,SH_dummy,FZ_dummy,MI_dummy,PR_dummy,BC_dummy,BL_dummy,VC_dummy
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,M,M,M,0.0,29.18,29.82,2.7,25,9.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,M,M,M,0.0,29.44,30.08,13.3,2,13.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [56]:
weather.Depth = 0* weather.shape[0]

In [57]:
weather.SnowFall = 0* weather.shape[0]

In [58]:
weather.drop(['Water1'], axis=1, inplace=True)

In [59]:
weather['Tavg']= (weather.Tmax+weather.Tmin)/2

In [60]:
weather.drop(['Depart'], axis=1, inplace=True)

In [61]:
weather[(weather.Heat == 'M')]

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,+FC_dummy,FC_dummy,TS_dummy,GR_dummy,RA_dummy,DZ_dummy,SN_dummy,SG_dummy,GS_dummy,PL_dummy,IC_dummy,FG+_dummy,FG_dummy,BR_dummy,UP_dummy,HZ_dummy,FU_dummy,VA_dummy,DU_dummy,DS_dummy,PO_dummy,SA_dummy,SS_dummy,PY_dummy,SQ_dummy,DR_dummy,SH_dummy,FZ_dummy,MI_dummy,PR_dummy,BC_dummy,BL_dummy,VC_dummy
7,2,2007-05-04,78,51,64.5,42,50,M,M,-,-,0,0,0.00,29.36,30.04,10.1,7,10.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
505,2,2008-07-08,86,46,66.0,68,71,M,M,-,-,0,0,0.28,29.16,29.80,7.4,24,8.3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
675,2,2008-10-01,62,46,54.0,41,47,M,M,-,-,0,0,0.00,29.3,29.96,10.9,33,11.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1637,2,2011-07-22,100,71,85.5,70,74,M,M,-,-,0,0,0.14,29.23,29.86,3.8,10,8.2,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2067,2,2012-08-22,84,72,78.0,51,61,M,M,-,-,0,0,0.00,29.39,M,4.7,19,M,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2211,2,2013-05-02,71,42,56.5,39,45,M,M,-,-,0,0,0.00,29.51,30.17,15.8,2,16.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2501,2,2013-09-24,91,52,71.5,48,54,M,M,-,-,0,0,0.00,29.33,30.00,5.8,9,7.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2511,2,2013-09-29,84,53,68.5,48,54,M,M,-,-,0,0,0.22,29.36,30.01,6.3,36,7.8,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2525,2,2013-10-06,76,48,62.0,44,50,M,M,-,-,0,0,0.06,29.1,29.76,10.1,25,10.6,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2579,2,2014-05-02,80,47,63.5,43,47,M,M,-,-,0,0,0.04,29.1,29.79,10.7,23,11.9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [62]:
weather.Cool.replace('M', 0, inplace=True)
weather.Heat.replace('M', 0, inplace=True)

In [63]:
weather.WetBulb.replace('M', 60, inplace=True)

In [64]:
weather.WetBulb = weather.WetBulb.astype(int)

In [65]:
weather.Heat = weather.Heat.astype(float)
weather.Cool = weather.Cool.astype(float)

In [66]:
weather.drop(['Sunrise', 'Sunset'], axis=1, inplace=True)

In [67]:
weather.PrecipTotal.replace('T', 0.005, inplace=True)
weather.PrecipTotal.replace(' T', 0.005, inplace=True)
weather.PrecipTotal.replace('  T', 0.005, inplace=True)
weather.PrecipTotal.replace('M', 0, inplace=True)

In [68]:
weather.PrecipTotal = weather.PrecipTotal.astype(float)

In [69]:
weather.StnPressure.replace('M', 30, inplace=True)

In [70]:
weather.StnPressure = weather.StnPressure.astype(float)

In [71]:
weather.SeaLevel.replace('M', 30, inplace=True)

In [72]:
weather.SeaLevel = weather.SeaLevel.astype(float)

In [73]:
weather.AvgSpeed.replace('M', 7, inplace=True)

In [74]:
weather.AvgSpeed = weather.AvgSpeed.astype(float)

In [75]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 50 columns):
Station        2944 non-null int64
Date           2944 non-null object
Tmax           2944 non-null int64
Tmin           2944 non-null int64
Tavg           2944 non-null float64
DewPoint       2944 non-null int64
WetBulb        2944 non-null int64
Heat           2944 non-null float64
Cool           2944 non-null float64
Depth          2944 non-null int64
SnowFall       2944 non-null int64
PrecipTotal    2944 non-null float64
StnPressure    2944 non-null float64
SeaLevel       2944 non-null float64
ResultSpeed    2944 non-null float64
ResultDir      2944 non-null int64
AvgSpeed       2944 non-null float64
+FC_dummy      2944 non-null int64
FC_dummy       2944 non-null int64
TS_dummy       2944 non-null int64
GR_dummy       2944 non-null int64
RA_dummy       2944 non-null int64
DZ_dummy       2944 non-null int64
SN_dummy       2944 non-null int64
SG_dummy       2944 non-null int64
G

split stations

In [76]:
weather.set_index('Date', inplace=True)

In [77]:
weather_1 = weather[(weather.Station == 1)]

In [78]:
weather_2 = weather[(weather.Station == 2)]

In [79]:
weather_1.drop(['Station'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [80]:
weather_1.head()

Unnamed: 0_level_0,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,+FC_dummy,FC_dummy,TS_dummy,GR_dummy,RA_dummy,DZ_dummy,SN_dummy,SG_dummy,GS_dummy,PL_dummy,IC_dummy,FG+_dummy,FG_dummy,BR_dummy,UP_dummy,HZ_dummy,FU_dummy,VA_dummy,DU_dummy,DS_dummy,PO_dummy,SA_dummy,SS_dummy,PY_dummy,SQ_dummy,DR_dummy,SH_dummy,FZ_dummy,MI_dummy,PR_dummy,BC_dummy,BL_dummy,VC_dummy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
2007-05-01,83,50,66.5,51,56,0.0,2.0,0,0,0.0,29.1,29.82,1.7,27,9.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-02,59,42,50.5,42,47,14.0,0.0,0,0,0.0,29.38,30.09,13.0,4,13.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-03,66,46,56.0,40,48,9.0,0.0,0,0,0.0,29.39,30.12,11.7,7,11.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-04,66,49,57.5,41,50,7.0,0.0,0,0,0.005,29.31,30.05,10.4,8,10.8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-05,66,53,59.5,38,49,5.0,0.0,0,0,0.005,29.4,30.1,11.7,7,12.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [81]:
weather_2.drop(['Station'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [82]:
weather_2.head()

Unnamed: 0_level_0,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,+FC_dummy,FC_dummy,TS_dummy,GR_dummy,RA_dummy,DZ_dummy,SN_dummy,SG_dummy,GS_dummy,PL_dummy,IC_dummy,FG+_dummy,FG_dummy,BR_dummy,UP_dummy,HZ_dummy,FU_dummy,VA_dummy,DU_dummy,DS_dummy,PO_dummy,SA_dummy,SS_dummy,PY_dummy,SQ_dummy,DR_dummy,SH_dummy,FZ_dummy,MI_dummy,PR_dummy,BC_dummy,BL_dummy,VC_dummy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
2007-05-01,84,52,68.0,51,57,0.0,3.0,0,0,0.0,29.18,29.82,2.7,25,9.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-02,60,43,51.5,42,47,13.0,0.0,0,0,0.0,29.44,30.08,13.3,2,13.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-03,67,48,57.5,40,50,7.0,0.0,0,0,0.0,29.46,30.12,12.9,6,13.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-04,78,51,64.5,42,50,0.0,0.0,0,0,0.0,29.36,30.04,10.1,7,10.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2007-05-05,66,54,60.0,39,50,5.0,0.0,0,0,0.005,29.46,30.09,11.2,7,11.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [83]:
weather_1.to_csv('weather_1_UNSCALED.csv')

In [84]:
weather_2.to_csv('weather_2_UNSCALED.csv')

_______________

### scaling

In [85]:
from sklearn.preprocessing import StandardScaler

In [86]:
scaler = StandardScaler()

In [87]:
weather_1_scaled = weather_1

In [88]:
weather_2_scaled = weather_2

In [89]:
weather_1_scaled[weather_1_scaled.columns] = scaler.fit_transform(weather_1[weather_1.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [90]:
weather_2_scaled[weather_2_scaled.columns] = scaler.fit_transform(weather_2[weather_2.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [91]:
weather_1_scaled.to_csv('weather_1_scaled.csv')

In [92]:
weather_2_scaled.to_csv('weather_2_scaled.csv')