In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,ShuffleSplit

In [None]:
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import validation_curve

In [None]:
# Load the workbook into a DataFrame; every later cell derives from `df`.
df = pd.read_excel(io='book2.xlsx')

In [None]:
# Copy of `df` without the three district/object columns (presumably row
# identifiers -- confirm against the workbook). `df` itself is not modified.
df1 = df.drop(columns=['District ID', 'District', 'Object Id'])

In [None]:
# Preview the first five rows of the trimmed frame.
df1.head(n=5)

Unnamed: 0,Longitude,Latitude,Slope,Soil Type,LULC_1986,LULC_1987,LULC_1988,LULC_1989,LULC_1990,LULC_1991,...,GW_2010,GW_2011,GW_2012,GW_2013,GW_2014,GW_2015,GW_2016,GW_2017,GW_2018,GW_2019
0,74.75,31.25,0.784844,Haplic Calcisols,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,...,-0.04,-1.46,-0.6,-17.52,-3.119739,-9.14336,-9.823463,-2.394697,-2.774661,-5.369132
1,75.0,31.25,0.701995,Haplic Calcisols,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,...,-2.454617,-4.45711,-6.00039,-3.347226,-1.84,1.0,0.8,1.0,3.3,-1.9
2,74.75,31.5,0.0,Haplic Calcisols,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,...,-0.753646,-1.6,-0.02,-1.49,-1.78,-0.9,0.754127,0.6,0.2,-0.5
3,75.0,31.5,1.889588,Haplic Calcisols,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,...,3.95419,-3.838305,6.83308,19.31,2.646981,2.198592,4.854617,-0.397419,0.671259,-0.280918
4,75.25,31.5,0.351011,Haplic Calcisols,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,Agriculture,...,0.14,-2.13,-0.4,-1.47,0.3,-3.198811,-3.237203,0.246433,0.126418,-0.113382


In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Build the feature matrix and the target from `df1` rather than the raw `df`,
# so the 'District ID', 'District' and 'Object Id' columns that were dropped
# when `df1` was created are not fed to the models as features.
# NOTE(review): the outputs recorded further down in this notebook were
# produced with features taken from the raw `df`; re-run to refresh them.
X = df1.drop(columns=['GW_2019'])
y = df1['GW_2019']

In [None]:
# One-hot encode the non-numeric columns of X; numeric columns pass through.
X1 = pd.get_dummies(data=X)

In [None]:
# Display the (rows, columns) shape of the encoded feature matrix.
X1.shape

(75, 300)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random-forest baseline: fit on the training split, predict the held-out split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics: R^2, mean squared error, and the square root of the MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2 Score: {r2:.2f}')

MSE: 3.02, RMSE: 1.74, R2 Score: 0.07


In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the same split. This rebinds `model`, `y_pred`
# and the metric names used by the earlier random-forest cell.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2 Score: {r2:.2f}')

MSE: 16.80, RMSE: 4.10, R2 Score: -4.19


In [None]:
from sklearn.neural_network import MLPRegressor
# Multi-layer perceptron with three hidden layers (350, 250 and 50 units),
# ReLU activations and the Adam solver, capped at 1000 iterations.
mlp = MLPRegressor(
    hidden_layer_sizes=(350, 250, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
)
# Last expression of the cell: fits the network and displays the estimator.
mlp.fit(X_train, y_train)

In [None]:
# Ten random train/test partitions with 10% of the rows held out in each,
# seeded so the partitions are repeatable.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Cross-validate a random forest on the full encoded dataset (X1, y) using
# the shuffle-split scheme held in `cv`. The fitted estimator of every split
# is kept in the results via return_estimator=True.
rfg = RandomForestRegressor(n_estimators=100, random_state=42)
rfg_cv_results = cross_validate(
    rfg,
    X1,
    y,
    cv=cv,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    return_estimator=True,
)

# The scorer reports negated MSE (so that higher is better); flip the sign
# back to obtain plain mean squared errors per split.
train_error = -1 * rfg_cv_results['train_score']
test_error = -1 * rfg_cv_results['test_score']

print(f"MSE on the train set:\n"
      f"{train_error.mean():.3f} +/- {train_error.std():.3f}")
# The test-set label previously printed "MSEon" (missing space).
print(f"MSE on the test set:\n"
      f"{test_error.mean():.3f} +/- {test_error.std():.3f}")

MSE on the train set:
0.494 +/- 0.087
MSEon the test set:
4.057 +/- 3.732


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

In [None]:
# Degree-2 polynomial feature expansion followed by ordinary least squares.
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(X_train)
# Fit the transformer on the training split only; the test split is merely
# transformed with the already-fitted expansion (previously it was re-fitted
# on the test data via fit_transform).
x_test_poly = poly.transform(X_test)

model = LinearRegression()
model.fit(x_train_poly, y_train)

# Evaluate on the expanded held-out split.
y_pred = model.predict(x_test_poly)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2 Score: {r2:.2f}')

MSE: 13.34, RMSE: 3.65, R2 Score: -3.12


In [None]:
# Pipeline: degree-2 polynomial expansion -> standardisation -> linear
# regression, cross-validated on the training split with the scheme in `cv`.
poly_reg_pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("feature_scaling", StandardScaler()),
    ("lin_reg", LinearRegression()),
])
poly_reg_cv_results = cross_validate(
    poly_reg_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    return_estimator=True,
)

# Scores are negated MSE values; flip the sign to get mean squared errors.
poly_reg_train_error = -1 * poly_reg_cv_results['train_score']
poly_reg_test_error = -1 * poly_reg_cv_results['test_score']

# The scoring above is mean *squared* error, so the labels say so (they
# previously claimed "Mean absolute error").
print(f"Mean squared error of linear regression model on the train set:\n"
      f"{poly_reg_train_error.mean():.3f} +/- {poly_reg_train_error.std():.3f}")
print(f"Mean squared error of linear regression model on the test set:\n"
      f"{poly_reg_test_error.mean():.3f} +/- {poly_reg_test_error.std():.3f}")

Mean absolute error of linear regression model on the train set:
0.000 +/- 0.000
Mean absolute error of linear regression model on the test set:
6.008 +/- 5.115
