In [17]:
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import PolynomialFeatures
# Import Linear Regression and a regularized regression function
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
# Finally, import function to make a machine learning pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier


## Load Data

In [6]:
X = pd.read_csv('X_train.csv')
Y = pd.read_csv('Y_train.csv')
X_test = pd.read_csv('X_test.csv')

print("Training Data:{}".format(X.shape))
print("Test Data:{}".format(X_test.shape))

Y.head(2)


Training Data:(1212, 888)
Test Data:(776, 888)


Unnamed: 0,id,y
0,0.0,75.0
1,1.0,76.0


In [7]:
Y['y'].describe()

count    1212.000000
mean       69.763201
std         9.941656
min        42.000000
25%        64.000000
50%        70.000000
75%        76.000000
max        96.000000
Name: y, dtype: float64

In [8]:
X.describe()

Unnamed: 0,id,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x877,x878,x879,x880,x881,x882,x883,x884,x885,x886
count,1212.0,1122.0,1140.0,1132.0,1123.0,1133.0,1126.0,1141.0,1122.0,1122.0,...,1127.0,1127.0,1132.0,1127.0,1125.0,1127.0,1148.0,1126.0,1123.0,1111.0
mean,605.5,7300.504957,1003125.0,1051614.0,1049.844772,105.047681,203511.156265,1050.73588,341958.172245,104916.372111,...,382532200000.0,1003145.0,-502595.6,1001.891614,999590.5,3732.365716,100.659348,1617.956555,10505.966555,65052.578569
std,350.018571,1379.891266,100181.7,28180.85,28.475255,2.823009,29841.633207,28.623527,58820.438523,2755.013692,...,434795900000.0,95948.47,86088.74,100.410174,96804.91,725.532171,9.336065,401.791865,290.648021,0.029221
min,0.0,1030.502715,671634.5,1000037.0,1000.062471,100.033879,63202.600024,1000.134779,92365.078214,100016.602565,...,-508388200000.0,718663.5,-1110029.0,643.042857,689535.4,451.131089,65.692019,458.289896,10001.346875,65052.528022
25%,302.75,6496.988432,940969.9,1028118.0,1025.913567,102.724769,186609.583069,1026.464126,309182.73954,102687.100342,...,157860300000.0,938511.7,-547596.6,933.591537,934864.1,3297.203036,94.515998,1360.553119,10249.981685,65052.553207
50%,605.5,7381.752216,1003238.0,1052406.0,1050.174694,105.023063,201709.971057,1051.39919,337308.178918,104861.600927,...,275819700000.0,1001974.0,-496535.0,1001.295903,998976.1,3768.931107,100.672131,1604.528424,10505.538263,65052.579678
75%,908.25,8153.767104,1070372.0,1075329.0,1074.864998,107.391464,220981.402036,1075.166305,371797.754187,107160.482832,...,486729700000.0,1063238.0,-456005.7,1069.927335,1064618.0,4179.60223,106.809319,1861.784028,10763.810688,65052.603107
max,1211.0,13055.814408,1316548.0,1099990.0,1099.845375,110.048177,370398.522988,1099.997865,784817.830992,109991.914244,...,7405700000000.0,1308895.0,-140040.3,1323.073354,1276136.0,6781.164024,126.678078,3745.022165,10999.908941,65052.627907


## Preprocessing

### Replace missing values: Interpolation

In [9]:
print("Missing entries across all features: ",X.isnull().sum().sum())
X[X.x0.isnull()].head(5)

Missing entries across all features:  77121


Unnamed: 0,id,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x877,x878,x879,x880,x881,x882,x883,x884,x885,x886
8,8.0,,1136943.0,1047700.0,1095.07177,104.280469,199679.789379,1075.166305,308952.165532,,...,65464770000.0,,-454744.713147,845.220975,1124313.0,4242.573171,89.842734,983.020034,10064.518546,65052.596763
15,15.0,,1082081.0,1053115.0,1050.318884,107.479167,214460.503578,1012.636406,325226.367066,,...,,889731.5,,1050.097266,1126804.0,3840.596957,100.609133,1777.062732,10722.666402,65052.534048
17,17.0,,1139222.0,1089087.0,1043.829138,,,1091.372597,311692.655155,106852.971402,...,,839667.5,-457813.74854,914.165982,1215597.0,3470.879134,94.259174,1512.026531,10065.973104,
49,49.0,,1141092.0,1036322.0,1097.841819,100.59551,192700.231292,1012.064796,349769.421213,109055.678089,...,635153300000.0,1072613.0,-509825.273218,969.096337,830329.9,4486.555772,92.507889,2008.006951,10836.45297,65052.555294
60,60.0,,963564.4,1072941.0,1081.119503,101.504822,221232.6052,1013.414998,373612.773713,106301.596339,...,306228800000.0,795337.9,-548945.945151,919.380354,960910.3,3854.713301,107.829104,1638.064765,10877.091027,65052.530389


In [10]:
X.x0.head(5)

0    7077.537454
1    4406.423818
2    4460.878690
3    7152.388016
4    6531.930242
Name: x0, dtype: float64

In [11]:
X.x0 =  X.x0.interpolate(method='nearest')



In [12]:
# X_train.iloc[8:60]
# X_train[X_train.x0.isnull()].head(5)

In [13]:
X = X.interpolate(method = 'nearest').ffill().bfill()

X.isnull().sum().sum()

0

### Data normalization

In [14]:
X_normalized = preprocessing.normalize(X)
X_standardized = preprocessing.scale(X)




### Training / Validation Spilt

In [40]:
X_train, X_val, Y_train, Y_val = train_test_split(X_normalized, Y, test_size=0.2, random_state=42)
print("X_train: {}".format(X_train.shape[0]))
print("X_test: {}".format(X_test.shape[0]))



X_train: 969
X_test: 243


## Feature Reduction / Extraction

In [41]:
# feature extraction
model = ExtraTreesClassifier()
model.fit(X_train, Y_train)
feature_importance = model.feature_importances_

In [42]:
# feature_importance.find(feature_importance>0)

feature_max_importance = np.argwhere(feature_importance > 0)
feature_max_importance

array([[332],
       [401],
       [610],
       [878]])

I will select the following features

In [43]:
X_train_selected = X_train[:,feature_max_importance]
X_val_selected = X_val[:,feature_max_importance]

X_train_selected = X_train_selected.reshape(X_train_selected.shape[0],X_train_selected.shape[1])
X_train_selected.shape

X_val_selected = X_val_selected.reshape(X_val_selected.shape[0],X_val_selected.shape[1])
X_val_selected.shape

(243, 4)

## Algorithm

In [44]:
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train_selected, Y_train)

# Make predictions using the testing set
Y_pred = regr.predict(X_val_selected)


print('R2 score: %.2f' % r2_score(Y_val, Y_pred))


R2 score: 0.00


In [45]:
# Alpha (regularization strength) of LASSO regression
# lasso_eps = 0.0001
# lasso_nalpha=20
# lasso_iter=5000
# # Min and max degree of polynomials features to consider
# degree_min = 2
# degree_max = 8
# Test/train split
# Make a pipeline model with polynomial transformation and LASSO regression with cross-validation, run it for increasing degree of polynomial (complexity of the model)
# for degree in range(degree_min,degree_max+1):
#     model = make_pipeline(PolynomialFeatures(degree, interaction_only=False), LassoCV(eps=lasso_eps,n_alphas=lasso_nalpha,max_iter=lasso_iter,
# normalize=True,cv=5))
#     model.fit(X_train,Y_train)
#     Y_pred = np.array(model.predict(X_test))
    
#     print('R2 score: %.2f' % r2_score(Y_test, Y_pred))

    