In [9]:
import pandas as pd

In [10]:
# Load the saved score tables for each model family (presumably per-fold
# cross-validation scores, judging by the CV_* file names -- TODO confirm).
# Each CSV carries an 'Unnamed: 0' column, which is dropped right after reading.
# `axis` is passed by keyword: the positional form `drop(label, 1)` is
# deprecated and raises TypeError on pandas >= 2.0.
arima_cv = pd.read_csv("./models/CV_ARIMA.csv").drop('Unnamed: 0', axis=1)
lr_cv = pd.read_csv("./models/CV_LR.csv").drop('Unnamed: 0', axis=1)
trees_cv = pd.read_csv("./models/CV_Trees.csv").drop('Unnamed: 0', axis=1)

# Put the score columns of all models side by side (column-wise concat).
# `axis` is keyword-only in pandas >= 2.0 here as well.
total = pd.concat([arima_cv, lr_cv, trees_cv], axis=1)

In [11]:
# Per-column mean and standard deviation of the scores in `total`,
# displayed with the highest average first.
score_summary = pd.DataFrame({'avg': total.mean(), 'std': total.std()})
score_summary.sort_values(by='avg', ascending=False)

Unnamed: 0,avg,std
LR_l1,0.371461,0.238345
GBM,0.209293,0.417741
LR_l2,0.175068,0.353504
RF,0.149244,0.408748
ARIMA,-0.148599,0.825209
MA,-0.391939,0.528335
LR,-1.098186,4.375926
AR,-1.163461,1.028266


Давайте загрузим тестовый датасет и посмотрим прогнозы на нем.

In [12]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler

In [13]:
# Load the prepared train/test frames; the first CSV column becomes the index
# and is parsed as dates.
train = pd.read_csv("train_prepared.csv", index_col=[0], parse_dates=[0])
test = pd.read_csv("test_prepared.csv", index_col=[0], parse_dates=[0])

# Split the features from the 'Passengers' target.
# `axis=1` is passed by keyword: the positional form `drop(label, 1)` is
# deprecated and raises TypeError on pandas >= 2.0.
X_train = train.drop('Passengers', axis=1).values
y_train = train['Passengers'].values

# Fit the scaler on the training features only; the same fitted scaler is
# reused for the test features below.
sc = StandardScaler()
scaler = sc.fit(X_train)  # fit() returns the estimator, so `scaler` aliases `sc`
X_train = sc.transform(X_train)

# test prepare
X_test = test.drop('Passengers', axis=1).values
y_test = test['Passengers'].values
X_test = sc.transform(X_test)

In [14]:
# Lasso with default hyper-parameters, trained on the scaled training
# features and scored with R^2 on the test set.
model = Lasso().fit(X_train, y_train)
preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

0.29113019205349966

In [15]:
# Display the coefficients of the fitted Lasso model (one per feature column
# of the scaled X_train).
model.coef_

array([ 12.4659154 ,   0.        ,   0.        ,  -0.29429733,
        -0.        ,  -0.        ,   1.97474316,   0.        ,
        77.76567198,  -5.43017285, -19.89666315,   0.        ,
         0.        ,  36.26531554])

In [17]:
# Pair each feature column name (everything except the 'Passengers' target)
# with the coefficient the fitted model assigned to it.
# `axis=1` is passed by keyword: positional `drop(label, 1)` raises TypeError
# on pandas >= 2.0.
features_imps = pd.DataFrame({
    'feature': list(train.drop('Passengers', axis=1).columns),
    'coef_': model.coef_,
})
# Show the features whose coefficient is exactly zero.
features_imps[features_imps['coef_'] == 0]

Unnamed: 0,feature,coef_
1,month,0.0
2,dayofmonth,0.0
4,dayofyear,-0.0
5,weekofyear,-0.0
7,hour,0.0
11,exp_0_01,0.0
12,exp_0_03,0.0


In [18]:
# Show the features that kept a non-zero coefficient.
features_imps.loc[features_imps['coef_'].ne(0)]

Unnamed: 0,feature,coef_
0,year,12.465915
3,quarter,-0.294297
6,dayofweek,1.974743
8,t1,77.765672
9,t2,-5.430173
10,t3,-19.896663
13,exp_0_08,36.265316


Попробуем другие модели.

In [8]:
# Ridge with default hyper-parameters, trained on the scaled training
# features and scored with R^2 on the test set.
model = Ridge().fit(X_train, y_train)
preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

0.07846360276984754

In [9]:
# Plain (unregularised) linear regression, trained on the scaled training
# features and scored with R^2 on the test set.
model = LinearRegression().fit(X_train, y_train)
preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

-182.18142792756595

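Пока лучшей моделью выбираем Lasso: у неё наибольший R² на тестовой выборке (0.29 против 0.08 у Ridge и −182 у обычной линейной регрессии).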

In [10]:
# Rebind `model` to a freshly fitted default Lasso (the previous cell left it
# pointing at LinearRegression). The fit call stays the last expression so the
# estimator repr is displayed.
model = Lasso()
model.fit(X=X_train, y=y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [63]:
from statsmodels.api import OLS

# L1-penalised least squares via statsmodels: L1_wt=1.0 with alpha=0.8.
# Note that `model` is rebound here to the statsmodels fit result, and that
# no constant column is added to the scaled design matrix.
model = OLS(endog=y_train, exog=X_train).fit_regularized(
    alpha=0.8,
    L1_wt=1.0,
)

In [64]:
# Keep the fitted parameter vector of the regularised statsmodels fit under
# its own name (it is used to build the feature table afterwards) and print it.
regularized_regression_parameters = model.params
print(regularized_regression_parameters)

[ 39.63691729   1.55490488   0.           0.           0.
   0.           2.41185223   0.          86.73228626  -5.83956811
 -18.76609499   0.           0.           0.        ]


In [66]:
# Pair each feature column name (everything except the 'Passengers' target)
# with its coefficient from the regularised statsmodels fit.
# `axis=1` is passed by keyword: positional `drop(label, 1)` raises TypeError
# on pandas >= 2.0.
features_imps = pd.DataFrame({
    'feature': list(train.drop('Passengers', axis=1).columns),
    'coef_': regularized_regression_parameters,
})
features_imps

Unnamed: 0,feature,coef_
0,year,39.636917
1,month,1.554905
2,dayofmonth,0.0
3,quarter,0.0
4,dayofyear,0.0
5,weekofyear,0.0
6,dayofweek,2.411852
7,hour,0.0
8,t1,86.732286
9,t2,-5.839568


In [68]:
# Names of the features whose coefficient is non-zero, as a NumPy array.
selected_features = features_imps.loc[features_imps['coef_'] != 0, 'feature'].values

In [69]:
# Rebuild the design matrices using only the selected features.
# `axis=1` is passed by keyword: positional `drop(label, 1)` raises TypeError
# on pandas >= 2.0.
X_train = train.drop('Passengers', axis=1)[selected_features].values
y_train = train['Passengers'].values

# A fresh scaler is fitted on the reduced training matrix only; the same
# fitted scaler is reused for the test features below.
sc = StandardScaler()
scaler = sc.fit(X_train)  # fit() returns the estimator, so `scaler` aliases `sc`
X_train = sc.transform(X_train)

# test prepare
X_test = test.drop('Passengers', axis=1)[selected_features].values
y_test = test['Passengers'].values
X_test = sc.transform(X_test)

In [71]:
# Default Lasso retrained on the reduced (selected-features) matrices and
# scored with R^2 on the test set.
model = Lasso().fit(X_train, y_train)
preds = model.predict(X_test)
r2_score(y_true=y_test, y_pred=preds)

0.15643402046030008