In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
train_data = pd.read_csv("C:\\Users\\ishan\\groundwater\\train_data.csv")
test_data = pd.read_csv("C:\\Users\\ishan\\groundwater\\test_data.csv")

# Display basic information about the datasets
print(train_data.info())
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44068 entries, 0 to 44067
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     44068 non-null  int64  
 1   Amount_of_water        44068 non-null  float64
 2   Gps_height             44068 non-null  int64  
 3   Waterpoint_type        44068 non-null  object 
 4   Basin_name             44068 non-null  object 
 5   Village                44068 non-null  object 
 6   Regionname             44068 non-null  object 
 7   Region_code            44068 non-null  int64  
 8   Wardname               44068 non-null  object 
 9   District_code          44068 non-null  int64  
 10  Population             44068 non-null  int64  
 11  Public_meeting         41543 non-null  object 
 12  Organization_funding   41468 non-null  object 
 13  Organization_surveyed  44068 non-null  object 
 14  Scheme_management      41137 non-null  object 
 15  Sc

In [3]:
import pandas as pd

# Assuming `train_data` and `test_data` are your DataFrames

# Specify the important features (including 'Id' for both train and test)
important_features = [
    'Id', 'Amount_of_water', 'Gps_height', 'Extraction_type', 
    'Water_quality', 'Quality_group', 'Quantity', 
    'Source', 'Source_type', 'Source_class', 'Status'
]

# Select only the important features for training data
train_data_important = train_data[important_features]

# Separate features and target variable for training data
X_train_important = train_data_important.drop(columns=['Status'])
y_train_important = train_data_important['Status']

# For the test set, remove 'Status' since it's not present in the test data
test_important_features = [
    'Id', 'Amount_of_water', 'Gps_height', 'Extraction_type', 
    'Water_quality', 'Quality_group', 'Quantity', 
    'Source', 'Source_type', 'Source_class'
]
X_test_important = test_data[test_important_features]

# Print the shapes to verify
print(f"Shape of reduced training data: {X_train_important.shape}")
print(f"Shape of training target: {y_train_important.shape}")
print(f"Shape of reduced testing data: {X_test_important.shape}")

Shape of reduced training data: (44068, 10)
Shape of training target: (44068,)
Shape of reduced testing data: (11015, 10)


In [4]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Handling Missing Values
# Let's use SimpleImputer to fill missing values with the mean (for numerical features)
# and most frequent value (for categorical features)
numerical_features = X_train_important.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train_important.select_dtypes(include=['object']).columns

numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 2: Encoding Categorical Variables
# We'll use OneHotEncoder to encode categorical variables
# We need to define a pipeline that includes preprocessing and encoding
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Fit and transform training data
X_train_processed = pipeline.fit_transform(X_train_important)
X_test_processed = pipeline.transform(X_test_important)

# Step 3: Splitting the Data
# Split the processed training data into training and validation sets
X_train_final, X_valid, y_train_final, y_valid = train_test_split(X_train_processed, y_train_important, test_size=0.2, random_state=42)

# Now, your preprocessed data is ready for training your machine learning model
# You can use X_train_final, y_train_final for training and X_valid, y_valid for validation
# You can use X_test_processed for making predictions on the test data


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define and fit the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_final, y_train_final)

# Predict on the validation set
y_pred_logistic = logistic_model.predict(X_valid)

# Evaluate Logistic Regression
accuracy_logistic = accuracy_score(y_valid, y_pred_logistic)
precision_logistic = precision_score(y_valid, y_pred_logistic, average='weighted')
recall_logistic = recall_score(y_valid, y_pred_logistic, average='weighted')
f1_logistic = f1_score(y_valid, y_pred_logistic, average='weighted')

print("Logistic Regression:")
print(f"Accuracy: {accuracy_logistic:.2f}")
print(f"Precision: {precision_logistic:.2f}")
print(f"Recall: {recall_logistic:.2f}")
print(f"F1 Score: {f1_logistic:.2f}")


Logistic Regression:
Accuracy: 0.77
Precision: 0.77
Recall: 0.77
F1 Score: 0.76
