In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load data
file_path = r"E:\Pragna\Engineering\machine learning\SVM\1GA21EC096\silkboard.csv"
df = pd.read_csv(file_path, low_memory=False)

# Handle non-numeric values
# Consider handling categorical variables appropriately
df = df.apply(pd.to_numeric, errors='coerce')

# Impute missing values with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [5]:
# Impute missing values with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Select features and target variable
X = df_imputed.drop("PM2.5", axis=1)
y = df_imputed["PM2.5"]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

LINEAR REGRESSION

In [6]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_lr = linear_reg.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr= np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
print("mse ",mse_lr)
print("rmse ",rmse_lr)
print("lr mae:",mean_absolute_error(y_test,y_pred_lr))
print(r2_lr)

mse  74.77901729959622
rmse  8.647486183833786
lr mae: 4.460739167114458
0.7237031688958417


In [19]:
#Decision tree
dt = DecisionTreeRegressor(criterion='friedman_mse',max_depth=11)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt= np.sqrt(mean_squared_error(y_test, y_pred_dt))
r2_dt = r2_score(y_test, y_pred_dt)
print("mse ",mse_dt)
print("rmse ",rmse_dt)
print("r2 ",r2_dt)
print("dt mae",mean_absolute_error(y_test,y_pred_dt))

mse  64.86070316222305
rmse  8.053614291870641
r2  0.760349796051593
dt mae 3.11534331099947


In [20]:
#KNN
knn=KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn= np.sqrt(mean_squared_error(y_test, y_pred_knn))
r2_knn = r2_score(y_test, y_pred_knn)
print("mse ",mse_knn)
print("rmse ",rmse_knn)
print("r2 ",r2_knn)
print("knn mae",mean_absolute_error(y_test,y_pred_knn))

mse  38.55764826634393
rmse  6.209480515014435
r2  0.8575354903617181
knn mae 2.4720313395757696


In [22]:
#polynomial regression
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit a linear regression model on the polynomial features
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Predict on the test set
y_pred_poly = model.predict(X_test_poly)

# Evaluate the model
print("poly mse:", mean_squared_error(y_test, y_pred_poly))
print("poly r2:", r2_score(y_test, y_pred_poly))
print("poly rmse:",np.sqrt(mean_squared_error(y_test,y_pred_poly)))
print("poly mae:",mean_absolute_error(y_test,y_pred_poly))


poly mse: 51.38586570414173
poly r2: 0.8101372233775681
poly rmse: 7.16839352324785
poly mae: 3.4128942544050034


In [23]:
#SVR
from sklearn.svm import SVR
svr = SVR(kernel='rbf', C=1.0)
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr= np.sqrt(mean_squared_error(y_test, y_pred_svr))
r2_svr = r2_score(y_test, y_pred_svr)
print("mse ",mse_svr)
print("rmse ",rmse_svr)
print("r2 ",r2_svr)
print("knn svr",mean_absolute_error(y_test,y_pred_svr))

mse  67.37592544374517
rmse  8.208283952431541
r2  0.7510564411640464
knn svr 3.0720029291028226


In [24]:
gbr= GradientBoostingRegressor(n_estimators=150,
    learning_rate=0.1,
    max_depth=10,subsample=0.8
    )

gbr.fit(X_train,y_train)
y_pred_gbr=gbr.predict(X_test)
print("gbr mse",mean_squared_error(y_test,y_pred_gbr))
print("gbr r2",r2_score(y_test,y_pred_gbr))
print("gbr rmse:",np.sqrt(mean_squared_error(y_test,y_pred_gbr)))
print("gbr mae:",mean_absolute_error(y_test,y_pred_gbr))

gbr mse 29.131736988738965
gbr r2 0.8923627655830155
gbr rmse: 5.397382420093926
gbr mae: 2.1830229710474787


In [25]:
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=150,learning_rate=0.1,max_depth=10,min_child_weight=1,n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("xgb mse",mean_squared_error(y_test,y_pred_xgb))
print("xgb r2",r2_score(y_test,y_pred_xgb))
print("xgb rmse:",np.sqrt(mean_squared_error(y_test,y_pred_xgb)))
print("xgb mae:",mean_absolute_error(y_test,y_pred_xgb))


xgb mse 27.866652391363523
xgb r2 0.897037056285889
xgb rmse: 5.2788874198417535
xgb mae: 2.1457800967693785


In [26]:
from catboost import CatBoostRegressor
catboost = CatBoostRegressor(iterations= 200,
    learning_rate= 0.1,
    depth= 10,
    l2_leaf_reg= 1)
catboost.fit(X_train, y_train)
y_pred_cat = catboost.predict(X_test)
print("cat mse",mean_squared_error(y_test,y_pred_cat))
print("cat r2",r2_score(y_test,y_pred_cat))
print("cat rmse:",np.sqrt(mean_squared_error(y_test,y_pred_cat)))
print("cat mae:",mean_absolute_error(y_test,y_pred_cat))


0:	learn: 17.5425688	total: 268ms	remaining: 53.4s
1:	learn: 16.4392422	total: 334ms	remaining: 33s
2:	learn: 15.4369582	total: 384ms	remaining: 25.2s
3:	learn: 14.5233249	total: 441ms	remaining: 21.6s
4:	learn: 13.7148092	total: 497ms	remaining: 19.4s
5:	learn: 13.0142706	total: 558ms	remaining: 18.1s
6:	learn: 12.4191670	total: 626ms	remaining: 17.2s
7:	learn: 11.8227874	total: 684ms	remaining: 16.4s
8:	learn: 11.3466253	total: 753ms	remaining: 16s
9:	learn: 10.9042678	total: 817ms	remaining: 15.5s
10:	learn: 10.4994381	total: 873ms	remaining: 15s
11:	learn: 10.1066918	total: 931ms	remaining: 14.6s
12:	learn: 9.7006432	total: 990ms	remaining: 14.2s
13:	learn: 9.4210750	total: 1.05s	remaining: 14s
14:	learn: 9.1270142	total: 1.11s	remaining: 13.7s
15:	learn: 8.8206863	total: 1.17s	remaining: 13.5s
16:	learn: 8.5970461	total: 1.25s	remaining: 13.5s
17:	learn: 8.4081151	total: 1.42s	remaining: 14.4s
18:	learn: 8.2394964	total: 1.51s	remaining: 14.4s
19:	learn: 8.0760236	total: 1.58s	rem

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=1200,random_state=42,n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("rf mse",mean_squared_error(y_test,y_pred_rf))
print("rf r2",r2_score(y_test,y_pred_rf))
print("rf rmse:",np.sqrt(mean_squared_error(y_test,y_pred_rf)))
print("rf mae:",mean_absolute_error(y_test,y_pred_rf))




rf mse 32.13669949226682
rf r2 0.8812598967931625
rf rmse: 5.668924015390118
rf mae: 2.3091272153847933


In [11]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
max_features=['auto','log2','sqrt']
max_depth=[int(x) for x in np.linspace(100,1000,10)]
min_samples_split=[2,5,10,14]
min_samples_leaf=[1,2,4,6,8]
random_grid={'n_estimators':n_estimators,
             'max_features':max_features,'max_depth':max_depth,
             'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf,'criterion':['squared_error', 'friedman_mse']}
print(random_grid)
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,n_jobs=-1,random_state=100,verbose=2)
rf_random.fit(X_train,y_train)
print(rf_random.best_params_)
best_param_grid=rf_random.best_estimator_
y_predd=best_param_grid.predict(X_test)
print("rf(random) r2",r2_score(y_test,y_predd))
print("rf(random) rmse:",np.sqrt(mean_squared_error(y_test,y_predd)))
print("rf(random) mae:",mean_absolute_error(y_test,y_predd))
print("rf(random) mse",mean_squared_error(y_test,y_predd))



{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'log2', 'sqrt'], 'max_depth': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['squared_error', 'friedman_mse']}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid={'criterion':[rf_random.best_params_['criterion']],
            'max_depth':[rf_random.best_params_['max_depth']],
            'max_features':[rf_random.best_params_['max_features']],
             'min_samples_split':[rf_random.best_params_['min_samples_split'],
                                  rf_random.best_params_['min_samples_split']+2,
                                  rf_random.best_params_['min_samples_split']+4]
            ,'min_samples_leaf':[rf_random.best_params_['min_samples_leaf']-2,
                                 rf_random.best_params_['min_samples_leaf']-1,
                                 rf_random.best_params_['min_samples_leaf'],
                                 rf_random.best_params_['min_samples_leaf']+1,
                                 rf_random.best_params_['min_samples_leaf']+2],
            'n_estimators':[rf_random.best_params_['n_estimators']-200,
                            rf_random.best_params_['n_estimators'],
                            rf_random.best_params_['n_estimators']+100]}
print(param_grid)
rf1=RandomForestRegressor()
rf_random1=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
rf_random1.fit(X_train,y_train)
print(rf_random1.best_params_)
y_pred_grid=best_param_grid.predict(X_test)
print("rf(grid) r2",r2_score(y_test,y_pred_grid))
print("rf(grid) rmse:",np.sqrt(mean_squared_error(y_test,y_pred_grid)))
print("rf(grid) mae:",mean_absolute_error(y_test,y_pred_grid))
print("rf(grid) mse",mean_squared_error(y_test,y_pred_grid))

