In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset from the CSV file
df3 = pd.read_csv('df3.csv')

# Display the first few rows of the DataFrame to verify
df3.head()

Unnamed: 0,amount_tsh,funder,installer,basin,region,population,public_meeting,permit,construction_year,extraction_type,management,management_group,payment,water_quality,quantity,source,waterpoint_type,year_recorded,number_of_years,status_group
0,6000,other,other,Lake Nyasa,Iringa,109,True,False,1999,gravity,vwc,user-group,pay annually,soft,enough,spring,communal standpipe,2011,12,functional
1,0,other,other,Lake Victoria,Mara,280,True,True,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,communal standpipe,2013,3,functional
2,25,other,other,Pangani,Manyara,250,True,True,2009,gravity,vwc,user-group,pay per bucket,soft,enough,dam,communal standpipe multiple,2013,4,functional
3,0,other,other,Ruvuma / Southern Coast,Mtwara,58,True,True,1986,submersible,vwc,user-group,never pay,soft,dry,machine dbh,communal standpipe multiple,2013,27,non functional
4,0,other,other,Lake Victoria,Kagera,0,True,True,1986,gravity,other,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,2011,25,functional


In [3]:
obj_columns = df3.select_dtypes(include=['object'])
obj_columns.columns

Index(['funder', 'installer', 'basin', 'region', 'extraction_type',
       'management', 'management_group', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type', 'status_group'],
      dtype='object')

In [4]:
train, test = train_test_split(df3, test_size=0.25, random_state=42)

In [5]:
dummy_cols = ['funder', 'installer', 'basin', 'region', 'extraction_type',
       'management', 'management_group', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type']
train = pd.get_dummies(train, columns = dummy_cols)

train = train.sample(frac=1).reset_index(drop=True)

In [6]:
train.shape

(44550, 122)

In [7]:
test = pd.get_dummies(test, columns = dummy_cols)

In [8]:
train

Unnamed: 0,amount_tsh,population,public_meeting,permit,construction_year,year_recorded,number_of_years,status_group,funder_danida,funder_gov,...,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,0,300,True,True,1990,2013,23,functional,False,False,...,True,False,False,False,False,False,False,True,False,False
1,300,0,False,True,2007,2013,6,non functional,False,False,...,False,False,False,False,True,False,False,False,False,False
2,0,0,True,True,1986,2011,25,functional,False,False,...,False,True,False,False,False,False,False,False,True,False
3,2000,0,True,False,1986,2011,25,functional,False,False,...,False,True,False,False,True,False,False,False,False,False
4,0,0,True,False,1986,2012,26,functional,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44545,5000,1,True,False,1998,2013,15,functional,True,False,...,False,False,False,False,False,False,False,True,False,False
44546,0,0,True,True,1986,2011,25,non functional,False,True,...,False,True,False,False,True,False,False,False,False,False
44547,0,1,True,True,2005,2011,6,non functional,False,False,...,False,False,False,False,True,False,False,False,False,False
44548,50,100,True,False,2000,2013,13,functional,False,False,...,False,False,False,False,True,False,False,False,False,False


In [9]:
# Define the target variable
target = train.status_group

# Define the features
features = train.drop('status_group', axis=1)
X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8)

In [10]:
def model(X_train, X_val, y_train, y_val, test):
    if __name__ == '__main__':          #NOT NECESSARY

        pipe_svc = Pipeline([('scl', StandardScaler()), 
                             ('clf', LinearSVC())])
#('scl', StandardScaler()): The first step in the pipeline applies standardization using StandardScaler(), which scales the features so they have a mean
#of 0 and a standard deviation of 1.  
#('clf', LinearSVC()): The second step in the pipeline fits a Linear Support Vector Classifier (LinearSVC). This is the actual model that will be trained
#and used to make predictions.
        param_grid = {'clf__C':[0.001, 0.01, 0.1, 1.0],
                      'clf__class_weight':[None, 'balanced']}
#param_grid: This dictionary defines a grid of hyperparameters that will be tested during the model optimization process.
#clf__C: This refers to the C parameter of the SVM, which controls the regularization strength. Lower values indicate stronger regularization.
#clf__class_weight: This controls the class weighting. When set to 'balanced', it adjusts the weight inversely proportional to class frequencies, 
#helping handle imbalanced data.
        estimator = GridSearchCV(estimator=pipe_svc,
                                 param_grid=param_grid,
                                 n_jobs=-1)
#estimator=pipe_svc: This specifies that the pipeline (containing the StandardScaler and LinearSVC) is the model to be optimized.
#param_grid=param_grid: Specifies the hyperparameters to test.
#n_jobs=-1: Uses all available CPU cores to parallelize the grid search process for faster execution.
        estimator.fit(X_train, y_train)
#fits the pipeline to the training data (X_train and y_train). 
# NB GridSearchCV tries different combinations of hyperparameters and selects the best one based on cross-validation performance.
        best_params = estimator.best_params_
#best_params: Retrieves the best combination of hyperparameters found by GridSearchCV during cross-validation.                                 
        validation_accuracy = estimator.score(X_val, y_val)
        print('Validation accuracy: ', validation_accuracy)
        print(best_params)
#validation_accuracy: Computes the accuracy of the trained model on the validation dataset (X_val and y_val) using the best-found hyperparameters.

In [11]:
model(X_train, X_val, y_train, y_val, test)

Validation accuracy:  0.7326599326599327
{'clf__C': 0.001, 'clf__class_weight': None}


**NOTES**
* Pipeline: Sequentially applies data preprocessing (scaling) and model training (Linear SVC).
* GridSearchCV: This is a tool for hyperparameter tuning that evaluates the model using cross-validation on a grid of hyperparameters.
* Hyperparameter Tuning: Uses GridSearchCV to find the best C value and class weight for the Support Vector Classifier.
* Cross-Validation: The model is optimized using cross-validation, ensuring better generalization on unseen data.
* Accuracy Calculation: The accuracy of the best model is evaluated on the validation set to assess its performance.