<b>The dataset is very imbalanced.</b> For example, some classes (such as C3S4) have only one sample, which makes oversampling methods difficult to apply. <br>

I created a new class (called 'Other') by combining the classes that have a small sample size.
Finally, there are six classes to classify, named 'C2S1', 'C3S1', 'C3S2', 'C4S1', 'C4S2', 'Other'.


## Results

I used Optuna to tune the model's hyperparameters. The results were not very good: although the model achieved reasonable precision/recall scores for some classes, other classes scored poorly.<br>

- (Test Set) R2 score : 82.851
- (Test Set) MSE : 0.207831

| class        	| precision 	| recall 	| f1-score 	| support 	|
|--------------	|-----------	|--------	|----------	|---------	|
| 0            	| 1.00      	| 0.97   	| 0.99     	| 76      	|
| 1            	| 0.97      	| 0.99   	| 0.98     	| 204     	|
| 2            	| 0.25      	| 0.17   	| 0.20     	| 6       	|
| 3            	| 0.88      	| 0.88   	| 0.88     	| 26      	|
| 4            	| 0.73      	| 0.67   	| 0.70     	| 12      	|
| 5            	| 0.62      	| 0.62   	| 0.62     	| 8       	|
| accuracy     	|           	|        	| 0.94     	| 332     	|
| macro avg    	| 0.74      	| 0.72   	| 0.73     	| 332     	|
| weighted avg 	| 0.94      	| 0.94   	| 0.94     	| 332     	|

In [None]:
# Importing dependencies

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, classification_report
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier, Pool

# <span style="color:#e74c3c;"> Reading </span> Data


In [None]:
# Load the 2018, 2019 and 2020 post-monsoon ground-water CSV files, rename the
# 2019 columns, drop bookkeeping columns, tag each frame with its year and
# repair a few malformed cells in the 2020 file.

data1 = pd.read_csv('/kaggle/input/telangana-post-monsoon-ground-water-quality-data/ground_water_quality_2018_post.csv')
data2 = pd.read_csv('/kaggle/input/telangana-post-monsoon-ground-water-quality-data/ground_water_quality_2019_post.csv')
data3 = pd.read_csv('/kaggle/input/telangana-post-monsoon-ground-water-quality-data/ground_water_quality_2020_post.csv')


# The 2019 file uses different column headers (ion charges, 'EC');
# presumably these targets match the 2018/2020 headers -- verify against the CSVs.
data2.rename( columns ={ 'EC' : 'E.C', 'CO_-2 ' : 'CO3', 'HCO_ - ' :'HCO3', 'Cl -' : 'Cl',
                        'F -' : 'F', 'NO3- ': 'NO3 ' , 'SO4-2':'SO4' , 'Na+':'Na', 'K+':'K',
                        'Ca+2' : 'Ca', 'Mg+2':'Mg'}, inplace = True)


# dropping redundant columns
data1.drop(['sno','season'], axis = 1, inplace = True)
data2.drop(['sno','season'], axis = 1, inplace = True)
data3.drop(['sno','Unnamed: 8', 'season'], axis = 1, inplace = True)


# creating new columns
data1['year'] = 2018
data2['year'] = 2019
data3['year'] = 2020


# Fixing malformed entries in the 2020 file.
# Write through a single DataFrame.iloc[row, col] call: the chained form
# data3['pH'].iloc[261] = ... assigns into an intermediate Series and is not
# guaranteed to modify data3 (it never does under pandas Copy-on-Write).
ph_pos = data3.columns.get_loc('pH')
data3.iloc[261, ph_pos] = data3.iloc[261, ph_pos].replace('8..05', '8.05')
data3['pH'] = pd.to_numeric(data3['pH'])

class_pos = data3.columns.get_loc('Classification')
for row_pos in (178, 208):
    data3.iloc[row_pos, class_pos] = data3.iloc[row_pos, class_pos].replace('O.G', 'OG')


In [None]:
# creating and applying the new_class function

def new_class(X):
    """Map a rare classification label to 'Other'; return any other value unchanged.

    The labels folded into 'Other' are 'C3S4', 'C2S2', 'C4S4', 'C3S3',
    'C4S3', 'OG' and 'C1S1'.
    """
    rare_labels = ('C3S4', 'C2S2', 'C4S4', 'C3S3', 'C4S3', 'OG', 'C1S1')
    return 'Other' if X in rare_labels else X
    
# Collapse the rare labels in every yearly frame, then stack the three frames row-wise
for yearly_frame in (data1, data2, data3):
    yearly_frame['Classification'] = yearly_frame['Classification'].apply(new_class)

data_full = pd.concat([data1, data2, data3], axis=0)

In [None]:
# per-column null counts, restricted to columns that have at least one null

null_counts = data_full.isnull().sum()
null_counts[null_counts > 0]

In [None]:
# Fill the missing values of 'CO3' and 'gwl', each column imputed on its own.
# NOTE(review): each column is passed to KNNImputer alone, so there are no other
# features to measure neighbour distance with -- confirm this gives the intended values.
# NOTE(review): the imputer is fitted on the full dataset, before the train/test split.

imp_knn = KNNImputer(n_neighbors=3)

for column in ('CO3', 'gwl'):
    column_values = np.array(data_full[column]).reshape(-1, 1)
    data_full[column] = imp_knn.fit_transform(column_values)

In [None]:
# re-check: columns that still contain nulls after imputation
remaining_nulls = data_full.isnull().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# preview the first rows of the combined frame
data_full.head()

In [None]:
# Split the combined frame into the feature matrix X and the target y

y = data_full['Classification']

# drop() returns a new frame, so data_full itself keeps its 'Classification' column
X = data_full.drop('Classification', axis=1)

In [None]:
# Encode the class labels as integer codes; LB.classes_ shows the label for each code
LB = LabelEncoder()
encoded_target = LB.fit_transform(y)
y = encoded_target
LB.classes_

In [None]:
# positional indices of the object-dtype columns of X, used as categorical features

is_object_column = X.dtypes == 'object'
cat_feat_idx = np.where(is_object_column)[0]
cat_feat_idx

In [None]:
# Min-max scale the columns at positions 3..20 of X in place.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split.

MX = MinMaxScaler()
numeric_block = X.iloc[:, 3:21]
X.iloc[:, 3:21] = MX.fit_transform(numeric_block)

In [None]:
# 70/30 stratified split with a fixed seed, followed by the shapes of both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=2,
)

for subset in (X_train, X_test):
    print(subset.shape)

In [None]:
# 'balanced' class weights computed from the training labels, keyed by encoded class

unique_classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=y_train)
class_weights = {label: weight for label, weight in zip(unique_classes, weights)}
class_weights

# <span style="color:#e74c3c;"> CatBoost </span> Classifier


In [None]:
# Wrap the train and test splits in CatBoost Pools, marking the categorical columns

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feat_idx)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feat_idx)

In [None]:
# Hyperparameters come from an earlier tuning run (the original note reads "tuned with optima")

catboost_params = {
    'iterations': 14400,
    'learning_rate': 0.0029536992550707585,
    'min_data_in_leaf': 27,
    'class_weights': class_weights,
}
model = CatBoostClassifier(**catboost_params)

# print training progress every 1000 iterations
model.fit(train_pool, verbose=1000)

# <span style="color:#e74c3c;"> Results </span> 


In [None]:
# Predict on the test pool and report R2 and mean squared error.
# NOTE(review): both metrics are computed on the integer-encoded class labels;
# confirm that treating the class codes as numeric values is intended.

pred = model.predict(test_pool)

r2_sr = r2_score(y_test, pred)
mse = mean_squared_error(y_test, pred)

for template, value in (('R2 Score :{0:.5f}', r2_sr), ('Mean Squared Error :{0:.5f}', mse)):
    print(template.format(value))

In [None]:
# classification report
# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support as predicted-class counts
# instead of true-class counts.

clf_report = classification_report(y_test, pred)

print(clf_report)