In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_validate, cross_val_score, cross_val_predict
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.metrics.scorer import make_scorer 
import matplotlib.pyplot as plt 
%matplotlib inline 


In [55]:
train = pd.read_csv("data/train.csv")

In [56]:
train.head()

Unnamed: 0,X,Y,target_2015,elevation,precip 2014-11-16 - 2014-11-23,precip 2014-11-23 - 2014-11-30,precip 2014-11-30 - 2014-12-07,precip 2014-12-07 - 2014-12-14,precip 2014-12-14 - 2014-12-21,precip 2014-12-21 - 2014-12-28,...,precip 2019-03-24 - 2019-03-31,precip 2019-03-31 - 2019-04-07,precip 2019-04-07 - 2019-04-14,precip 2019-04-14 - 2019-04-21,precip 2019-04-21 - 2019-04-28,precip 2019-04-28 - 2019-05-05,precip 2019-05-05 - 2019-05-12,precip 2019-05-12 - 2019-05-19,LC_Type1_mode,Square_ID
0,34.26,-15.91,0.0,887.764222,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,0.896323,1.68,0.0,0.0,0.0,0.0,0.0,0.0,9,4e3c3896-14ce-11ea-bce5-f49634744a41
1,34.26,-15.9,0.0,743.403912,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,0.896323,1.68,0.0,0.0,0.0,0.0,0.0,0.0,9,4e3c3897-14ce-11ea-bce5-f49634744a41
2,34.26,-15.89,0.0,565.728343,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,0.896323,1.68,0.0,0.0,0.0,0.0,0.0,0.0,9,4e3c3898-14ce-11ea-bce5-f49634744a41
3,34.26,-15.88,0.0,443.392774,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,0.896323,1.68,0.0,0.0,0.0,0.0,0.0,0.0,10,4e3c3899-14ce-11ea-bce5-f49634744a41
4,34.26,-15.87,0.0,437.443428,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,0.896323,1.68,0.0,0.0,0.0,0.0,0.0,0.0,10,4e3c389a-14ce-11ea-bce5-f49634744a41


In [57]:
# train = train.set_index("Square_ID")

In [58]:
cols_2015 = [col for col in train.columns if "2014" in col or "2015" in col]

In [59]:
def grade_rmse(ytrue, ypred):
    return np.sqrt(mean_squared_error(ytrue, ypred))

In [60]:
my_scorer = make_scorer(grade_rmse, greater_is_better=False)

In [61]:
cols_2015

['target_2015',
 'precip 2014-11-16 - 2014-11-23',
 'precip 2014-11-23 - 2014-11-30',
 'precip 2014-11-30 - 2014-12-07',
 'precip 2014-12-07 - 2014-12-14',
 'precip 2014-12-14 - 2014-12-21',
 'precip 2014-12-21 - 2014-12-28',
 'precip 2014-12-28 - 2015-01-04',
 'precip 2015-01-04 - 2015-01-11',
 'precip 2015-01-11 - 2015-01-18',
 'precip 2015-01-18 - 2015-01-25',
 'precip 2015-01-25 - 2015-02-01',
 'precip 2015-02-01 - 2015-02-08',
 'precip 2015-02-08 - 2015-02-15',
 'precip 2015-02-15 - 2015-02-22',
 'precip 2015-02-22 - 2015-03-01',
 'precip 2015-03-01 - 2015-03-08',
 'precip 2015-03-08 - 2015-03-15']

In [62]:
x_main = train[["X", "Y", "elevation", "LC_Type1_mode", 'precip 2014-11-16 - 2014-11-23',
 'precip 2014-11-23 - 2014-11-30',
 'precip 2014-11-30 - 2014-12-07',
 'precip 2014-12-07 - 2014-12-14',
 'precip 2014-12-14 - 2014-12-21',
 'precip 2014-12-21 - 2014-12-28',
 'precip 2014-12-28 - 2015-01-04',
 'precip 2015-01-04 - 2015-01-11',
 'precip 2015-01-11 - 2015-01-18',
 'precip 2015-01-18 - 2015-01-25',
 'precip 2015-01-25 - 2015-02-01',
 'precip 2015-02-01 - 2015-02-08',
 'precip 2015-02-08 - 2015-02-15',
 'precip 2015-02-15 - 2015-02-22',
 'precip 2015-02-22 - 2015-03-01',
 'precip 2015-03-01 - 2015-03-08',
 'precip 2015-03-08 - 2015-03-15']]

In [63]:
y = train[["target_2015"]]

In [64]:
x_main

Unnamed: 0,X,Y,elevation,LC_Type1_mode,precip 2014-11-16 - 2014-11-23,precip 2014-11-23 - 2014-11-30,precip 2014-11-30 - 2014-12-07,precip 2014-12-07 - 2014-12-14,precip 2014-12-14 - 2014-12-21,precip 2014-12-21 - 2014-12-28,...,precip 2015-01-04 - 2015-01-11,precip 2015-01-11 - 2015-01-18,precip 2015-01-18 - 2015-01-25,precip 2015-01-25 - 2015-02-01,precip 2015-02-01 - 2015-02-08,precip 2015-02-08 - 2015-02-15,precip 2015-02-15 - 2015-02-22,precip 2015-02-22 - 2015-03-01,precip 2015-03-01 - 2015-03-08,precip 2015-03-08 - 2015-03-15
0,34.26,-15.91,887.764222,9,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
1,34.26,-15.90,743.403912,9,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
2,34.26,-15.89,565.728343,9,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
3,34.26,-15.88,443.392774,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
4,34.26,-15.87,437.443428,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
5,34.26,-15.86,405.631747,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
6,34.26,-15.85,389.261362,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
7,34.26,-15.84,383.123454,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
8,34.26,-15.83,382.920492,10,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000
9,34.27,-15.92,826.058090,9,0.000000,0.000000,0.000000,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.000000


In [65]:
x_main.head()

Unnamed: 0,X,Y,elevation,LC_Type1_mode,precip 2014-11-16 - 2014-11-23,precip 2014-11-23 - 2014-11-30,precip 2014-11-30 - 2014-12-07,precip 2014-12-07 - 2014-12-14,precip 2014-12-14 - 2014-12-21,precip 2014-12-21 - 2014-12-28,...,precip 2015-01-04 - 2015-01-11,precip 2015-01-11 - 2015-01-18,precip 2015-01-18 - 2015-01-25,precip 2015-01-25 - 2015-02-01,precip 2015-02-01 - 2015-02-08,precip 2015-02-08 - 2015-02-15,precip 2015-02-15 - 2015-02-22,precip 2015-02-22 - 2015-03-01,precip 2015-03-01 - 2015-03-08,precip 2015-03-08 - 2015-03-15
0,34.26,-15.91,887.764222,9,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.0
1,34.26,-15.9,743.403912,9,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.0
2,34.26,-15.89,565.728343,9,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.0
3,34.26,-15.88,443.392774,10,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.0
4,34.26,-15.87,437.443428,10,0.0,0.0,0.0,14.844025,14.552823,12.237766,...,30.127047,30.449468,1.521829,29.389995,32.878318,8.179804,0.963981,16.659097,3.304466,0.0


In [66]:
y.head()

Unnamed: 0,target_2015
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [67]:
lr = LinearRegression()

In [68]:
cross_validate(lr, x_main, y, cv=5, scoring=my_scorer)



{'fit_time': array([0.00668955, 0.00817275, 0.01178598, 0.01521611, 0.014745  ]),
 'score_time': array([0.00518179, 0.00102234, 0.0048151 , 0.0039959 , 0.00399709]),
 'test_score': array([-0.13060394, -0.21593088, -0.31629639, -0.14452043, -1.27533719]),
 'train_score': array([-0.22800067, -0.21053019, -0.18386116, -0.2239283 , -0.17824855])}

In [69]:
rfr = RandomForestRegressor(random_state=42)

In [70]:
cross_validate(rfr, x_main, y, cv=5, scoring=my_scorer)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


{'fit_time': array([1.03746533, 1.10673094, 1.18478823, 1.10702562, 1.25013709]),
 'score_time': array([0.00602078, 0.01204157, 0.00801945, 0.004987  , 0.00399232]),
 'test_score': array([-0.09256783, -0.18754249, -0.16840106, -0.14070642, -0.2944055 ]),
 'train_score': array([-0.04236253, -0.04074161, -0.03926416, -0.04266525, -0.03290352])}

In [71]:
rfecv = RFECV(estimator=rfr, cv=5, scoring=my_scorer)

In [19]:
rfecv = rfecv.fit(x_main, y)

  y = column_or_1d(y, warn=True)








In [20]:
rfecv.ranking_ 

array([ 1,  1,  1,  2, 16, 18, 17, 11, 12,  8, 13,  3,  6, 14, 10,  7,  5,
       15,  4,  9,  1])

In [21]:
for r, col in zip(rfecv.ranking_, x_main.columns):
    print((r, col))

(1, 'X')
(1, 'Y')
(1, 'elevation')
(2, 'LC_Type1_mode')
(16, 'precip 2014-11-16 - 2014-11-23')
(18, 'precip 2014-11-23 - 2014-11-30')
(17, 'precip 2014-11-30 - 2014-12-07')
(11, 'precip 2014-12-07 - 2014-12-14')
(12, 'precip 2014-12-14 - 2014-12-21')
(8, 'precip 2014-12-21 - 2014-12-28')
(13, 'precip 2014-12-28 - 2015-01-04')
(3, 'precip 2015-01-04 - 2015-01-11')
(6, 'precip 2015-01-11 - 2015-01-18')
(14, 'precip 2015-01-18 - 2015-01-25')
(10, 'precip 2015-01-25 - 2015-02-01')
(7, 'precip 2015-02-01 - 2015-02-08')
(5, 'precip 2015-02-08 - 2015-02-15')
(15, 'precip 2015-02-15 - 2015-02-22')
(4, 'precip 2015-02-22 - 2015-03-01')
(9, 'precip 2015-03-01 - 2015-03-08')
(1, 'precip 2015-03-08 - 2015-03-15')


In [22]:
ranking_one = x_main[["X", "Y", "elevation", 'precip 2015-03-08 - 2015-03-15', 'LC_Type1_mode']]

In [23]:
ranking_one.head()

Unnamed: 0_level_0,X,Y,elevation,precip 2015-03-08 - 2015-03-15,LC_Type1_mode
Square_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4e3c3896-14ce-11ea-bce5-f49634744a41,34.26,-15.91,887.764222,0.0,9
4e3c3897-14ce-11ea-bce5-f49634744a41,34.26,-15.9,743.403912,0.0,9
4e3c3898-14ce-11ea-bce5-f49634744a41,34.26,-15.89,565.728343,0.0,9
4e3c3899-14ce-11ea-bce5-f49634744a41,34.26,-15.88,443.392774,0.0,10
4e3c389a-14ce-11ea-bce5-f49634744a41,34.26,-15.87,437.443428,0.0,10


In [72]:
cols_2019 = [col for col in train.columns if "2019" in col]

In [73]:
cols_2019

['precip 2019-01-20 - 2019-01-27',
 'precip 2019-01-27 - 2019-02-03',
 'precip 2019-02-03 - 2019-02-10',
 'precip 2019-02-10 - 2019-02-17',
 'precip 2019-02-17 - 2019-02-24',
 'precip 2019-02-24 - 2019-03-03',
 'precip 2019-03-03 - 2019-03-10',
 'precip 2019-03-10 - 2019-03-17',
 'precip 2019-03-17 - 2019-03-24',
 'precip 2019-03-24 - 2019-03-31',
 'precip 2019-03-31 - 2019-04-07',
 'precip 2019-04-07 - 2019-04-14',
 'precip 2019-04-14 - 2019-04-21',
 'precip 2019-04-21 - 2019-04-28',
 'precip 2019-04-28 - 2019-05-05',
 'precip 2019-05-05 - 2019-05-12',
 'precip 2019-05-12 - 2019-05-19']

In [74]:
x_2019 = train[["X", "Y", "elevation", "LC_Type1_mode", 'precip 2019-01-20 - 2019-01-27',
 'precip 2019-01-27 - 2019-02-03',
 'precip 2019-02-03 - 2019-02-10',
 'precip 2019-02-10 - 2019-02-17',
 'precip 2019-02-17 - 2019-02-24',
 'precip 2019-02-24 - 2019-03-03',
 'precip 2019-03-03 - 2019-03-10',
 'precip 2019-03-10 - 2019-03-17',
 'precip 2019-03-17 - 2019-03-24',
 'precip 2019-03-24 - 2019-03-31',
 'precip 2019-03-31 - 2019-04-07',
 'precip 2019-04-07 - 2019-04-14',
 'precip 2019-04-14 - 2019-04-21',
 'precip 2019-04-21 - 2019-04-28',
 'precip 2019-04-28 - 2019-05-05',
 'precip 2019-05-05 - 2019-05-12',
 'precip 2019-05-12 - 2019-05-19']]

In [75]:
rfr.fit(x_main, y)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [76]:
len(x_main.columns), len(x_2019.columns)

(21, 21)

In [77]:
predictions = rfr.predict(x_2019)

In [78]:
square_id = train["Square_ID"]

In [93]:
pd_pred1 = pd.DataFrame(data={"target_2019": predictions}, index=square_id)

In [94]:
pd_pred1.to_csv("predictions/PD_pred1.csv")

In [81]:
len(x_main), len(x_2019)

(16466, 16466)

In [82]:
len(square_id)

16466

In [83]:
len(train)

16466

In [84]:
len(pd_pred1)

16466

In [85]:
pd_pred1[pd_pred1["Square_ID"]=="4e3c3896-14ce-11ea-bce5-f49634744a41"]

Unnamed: 0,Square_ID,target_2019
0,4e3c3896-14ce-11ea-bce5-f49634744a41,0.0


In [88]:
pd_pred1.head()

Unnamed: 0,Square_ID,target_2019
0,4e3c3896-14ce-11ea-bce5-f49634744a41,0.0
1,4e3c3897-14ce-11ea-bce5-f49634744a41,0.0
2,4e3c3898-14ce-11ea-bce5-f49634744a41,0.050076
3,4e3c3899-14ce-11ea-bce5-f49634744a41,0.063238
4,4e3c389a-14ce-11ea-bce5-f49634744a41,0.063238
