In [1]:
import pandas as pd

from pandas import read_csv                           # For dataframes
from pandas import DataFrame                       # For dataframes
from numpy import ravel                                  # For matrices
import matplotlib.pyplot as plt                        # For plotting data
import seaborn as sns                                     # For plotting data
from sklearn.model_selection import train_test_split    # For train/test splits
from sklearn.neighbors import KNeighborsClassifier    # The k-nearest neighbor classifier
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.pipeline import Pipeline                                  # For setting up pipeline
# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV 

from sklearn.linear_model import LinearRegression

In [28]:
df = pd.read_csv("220527_seqana_mlops_challenge_dataset.csv")

In [29]:
df

Unnamed: 0,year,dem_nasa_dem30,dem_merit,dem_gmted,dem_srtm90_v4,dem_nasa_srtm30,dem_usgs_ned,terrain_topo_div_alos_constant,terrain_mtpi_alos_AVE,terrain_chili_alos_constant,...,sur_refl_b02_mean_sampling_year,sur_refl_b03_mean_sampling_year,sur_refl_b07_mean_sampling_year,evi_std_sampling_year,ndvi_std_sampling_year,sur_refl_b01_std_sampling_year,sur_refl_b02_std_sampling_year,sur_refl_b03_std_sampling_year,sur_refl_b07_std_sampling_year,soc_stock_t_ha
0,2012,894,899.792969,898,892,894,895.900574,0.306919,-4,183,...,3374.608696,1089.260870,1546.869565,1574.503874,2215.650847,1688.021295,1375.511632,1220.169482,654.961158,68.793925
1,2011,61,56.898190,58,58,57,58.000286,0.123088,-7,181,...,2709.608696,374.478261,1166.391304,1582.043952,1427.600306,307.310126,999.502956,167.002686,501.502718,87.265779
2,2011,92,92.471191,89,91,90,92.729630,0.046061,-1,181,...,3195.043478,414.347826,1221.347826,1927.965549,1960.699712,583.722087,1090.225101,322.372259,434.762277,96.650384
3,2011,1560,1572.300049,1622,1570,1549,1547.678345,0.781243,-17,160,...,4063.727273,1790.043478,928.956522,1010.631962,1903.293570,2179.134552,2019.112468,1884.017720,472.453507,68.400738
4,2011,1176,1175.498901,1179,1176,1176,1170.957031,0.210475,-2,181,...,2652.260870,1022.217391,864.565217,646.826519,1687.594992,922.467112,857.199362,1366.888502,303.002824,35.314235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5086,2011,269,271.338745,276,272,269,267.902832,0.144872,-12,185,...,3651.217391,1201.304348,1579.173913,2151.600116,2580.150946,2111.791728,1600.114086,2184.760013,546.854281,54.985890
5087,2011,1841,1842.393311,1835,1843,1843,1841.644409,0.159572,-2,198,...,3077.173913,1342.913043,2899.043478,317.942229,475.251392,1181.081629,1011.575533,1344.406253,913.269127,9.786586
5088,2011,370,368.073639,371,365,366,365.822449,0.039614,1,170,...,2365.347826,1112.521739,790.260870,1607.331479,2896.890366,1159.625795,861.845568,1513.928240,260.348930,216.555119
5089,2011,373,369.719818,354,372,374,358.393951,0.187150,4,178,...,2712.130435,947.913043,856.913043,2046.776606,2653.634086,805.265537,911.478385,1176.317363,290.434614,134.094696


In [30]:
# df.corr()

In [31]:
# The data matrix X
X = df.iloc[:,1:-1]
# The labels
y = (df.iloc[:,-1:])


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=1/3,random_state=0)
 
print(X_train.shape)
print(X_test.shape)

(3394, 304)
(1697, 304)


In [37]:
reg = LinearRegression().fit(X, y)
# reg.score(X, y)

print('Training set score: ' + str(reg.score(X_train,y_train)))
print('Test set score: ' + str(reg.score(X_test,y_test)))

Training set score: 0.47616990262695225
Test set score: 0.4530198636963759


In [23]:
pipe = Pipeline([
('scaler', StandardScaler()),
('selector', VarianceThreshold()),
('regressor', LinearRegression())
])
 
pipe.fit(X_train, y_train)
 
print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))
 

Training set score: 0.4954153767160132
Test set score: 0.342964860263656


In [20]:
parameters = {'scaler': [StandardScaler(), MinMaxScaler(),
    Normalizer(), MaxAbsScaler()]
}
 
grid = GridSearchCV(pipe, parameters, cv=2).fit(X_train, y_train)
 
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
 

Training set score: 0.49871506284553613
Test set score: 0.34900085273250747


In [21]:
best_params = grid.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_pipe = grid.best_estimator_
print(best_pipe)
 
result_df = DataFrame.from_dict(grid.cv_results_, orient='columns')
print(result_df.columns)

{'scaler': Normalizer()}
Pipeline(steps=[('scaler', Normalizer()), ('selector', VarianceThreshold()),
                ('regressor', LinearRegression())])
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_scaler', 'params', 'split0_test_score', 'split1_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')
