# ML Model for Groundwater detection based on resistivity values
**Author: Camilo Mejía**

Import Dependencies

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import folium
from flask import Flask, request, jsonify
from sklearn.pipeline import Pipeline

Load and preprocess the dataset

In [None]:
# Read the tab-separated survey file into a DataFrame.
# Change `sep` if the file uses a different field separator.
data = pd.read_csv("my_data.dat", sep='\t')
# ... Data preprocessing steps here

# Step 2: QA/QC processing to remove outlier values

This notebook assumes the dataset contains 2D resistivity measurements (x, y, rho), an indicator of groundwater presence, and the groundwater depth.
Example columns: x_coord, y_coord, rho, groundwater_presence, groundwater_depth

Calculate the z-scores of the resistivity values

In [None]:
# Standardise resistivity: subtract the column mean, then divide by the
# column standard deviation (pandas' default sample std).
z_scores = data['rho'].sub(data['rho'].mean()).div(data['rho'].std())

Define a threshold for z-scores beyond which data points are considered outliers


In [None]:
# Readings whose |z-score| exceeds this value are treated as outliers.
# Adjust this threshold as needed.
z_score_threshold = 3

Create a boolean mask to identify outlier data points

In [None]:
# True for every row whose resistivity z-score magnitude is above the threshold.
outliers_mask = z_scores.abs() > z_score_threshold

Remove outlier data points from the dataset

In [None]:
# Keep only the rows that were not flagged as outliers.
data_no_outliers = data.loc[~outliers_mask]

# Step 3: Data splitting

In [None]:
# Targets: groundwater presence label and groundwater depth.
y_presence = data_no_outliers["groundwater_presence"]
y_depth = data_no_outliers["groundwater_depth"]

# Features: every remaining column once both targets are removed.
X = data_no_outliers.drop(columns=["groundwater_presence", "groundwater_depth"])

# One 80/20 split applied to the features and both targets together so the
# rows stay aligned; the fixed seed makes the split reproducible.
(
    X_train,
    X_test,
    y_presence_train,
    y_presence_test,
    y_depth_train,
    y_depth_test,
) = train_test_split(
    X,
    y_presence,
    y_depth,
    test_size=0.2,
    random_state=42,
)

# Step 4: Model selection

In [None]:
# Baseline random-forest classifier with default hyperparameters.
model = RandomForestClassifier(
    random_state=42,  # fixed seed so repeated runs build the same forest
)

# Step 5: Model training

In [None]:
# Fit the baseline classifier on the training features and presence labels.
model.fit(X=X_train, y=y_presence_train)

# Step 6: Model evaluation

In [None]:
# Score the baseline model on the held-out test split.
y_presence_pred = model.predict(X_test)

accuracy_presence = accuracy_score(
    y_true=y_presence_test,
    y_pred=y_presence_pred,
)
classification_rep_presence = classification_report(
    y_true=y_presence_test,
    y_pred=y_presence_pred,
)

print("Accuracy for Groundwater Presence:", accuracy_presence)
print("Classification Report for Groundwater Presence:\n", classification_rep_presence)

# Step 7: Hyperparameter tuning (optional)

In [None]:
# Hyperparameter grid for the random forest.
# The estimator given to GridSearchCV is a Pipeline, so every key must be
# written as "<step name>__<parameter>". Bare names such as 'n_estimators'
# are not parameters of the pipeline itself and make the search fail with
# an "Invalid parameter" error.
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
}

# Scaling followed by the classifier; the step name 'clf' is what the
# param_grid prefixes refer to and what later cells use to reach the forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('clf', RandomForestClassifier(random_state=42))
])

# Exhaustive search over the grid (3 * 3 * 3 * 3 = 81 candidates) with
# 5-fold cross-validation, using all available CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_presence_train)

# Pipeline refitted on the full training split with the best parameters found.
best_model_presence = grid_search.best_estimator_


# Step 8: Model interpretation

In [None]:
# The tuned estimator is a Pipeline; the fitted forest is its 'clf' step.
feature_names_presence = X_train.columns
importances_presence = best_model_presence.named_steps['clf'].feature_importances_

# Map each feature name to its importance score.
feature_importances_presence = {
    name: score
    for name, score in zip(feature_names_presence, importances_presence)
}

# (feature, importance) pairs ordered from most to least important.
sorted_feature_importances_presence = sorted(
    feature_importances_presence.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Feature Importances for Groundwater Presence:")
for feature, importance in sorted_feature_importances_presence:
    print(f"{feature}: {importance}")

# Step 9: Prediction and visualization

In [None]:
# Plot the tuned model's presence prediction for every test point on a map
# and save it as a standalone HTML file.
#
# NOTE(review): folium expects [latitude, longitude] in decimal degrees.
# "x_coord" is used as latitude and "y_coord" as longitude here, as in the
# original cell -- adjust the column names (or reproject) to match the data.

# Centre the map on the mean position of the plotted points; the original
# cell referenced latitude_center / longitude_center without defining them.
latitude_center = X_test["x_coord"].mean()
longitude_center = X_test["y_coord"].mean()
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=10)

# Predict once for the whole test set instead of once per row: the result is
# the same, and passing the DataFrame keeps the feature names the pipeline
# was fitted with.
predictions_presence = best_model_presence.predict(X_test)

for (index, row), prediction_presence in zip(X_test.iterrows(), predictions_presence):
    lat, lon = row["x_coord"], row["y_coord"]  # Adjust column names accordingly
    popup_text = f"Groundwater Presence: {'Yes' if prediction_presence == 1 else 'No'}"
    popup = folium.Popup(popup_text, parse_html=True)
    marker = folium.Marker([lat, lon], popup=popup)
    marker.add_to(m)

m.save("groundwater_presence_predictions_map.html")


# Step 10: Model deployment (optional)

In [None]:
app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """Predict groundwater presence for one observation sent as JSON.

    The request body may be either:

    * a JSON object keyed by the training feature names (the columns of
      ``X_train``), or
    * a JSON array holding the feature values in training column order.

    Returns:
        A JSON response ``{"groundwater_presence_prediction": "Yes" | "No"}``
        on success, or ``{"error": ...}`` with HTTP status 400 when the body
        is missing, malformed, incomplete, or non-numeric.
    """
    feature_names = list(X_train.columns)
    # silent=True yields None for a missing or invalid JSON body so it can be
    # reported with the same error shape as the other validation failures.
    payload = request.get_json(silent=True)

    if isinstance(payload, dict):
        missing = [name for name in feature_names if name not in payload]
        if missing:
            message = 'Missing feature(s): ' + ', '.join(missing)
            return jsonify({'error': message}), 400
        values = [payload[name] for name in feature_names]
    elif isinstance(payload, list) and len(payload) == len(feature_names):
        values = payload
    else:
        message = (
            'Request body must be a JSON object with the keys '
            + ', '.join(feature_names)
            + ' or a JSON array of that many values.'
        )
        return jsonify({'error': message}), 400

    # A dict cannot be passed to the model directly; build a one-row frame
    # whose columns match the names and order used during training.
    try:
        features = pd.DataFrame([values], columns=feature_names).astype(float)
    except (TypeError, ValueError):
        return jsonify({'error': 'Feature values must be numeric.'}), 400
    if features.isna().to_numpy().any():
        return jsonify({'error': 'Feature values must not be null.'}), 400

    prediction_presence = best_model_presence.predict(features)[0]
    response = {'groundwater_presence_prediction': 'Yes' if prediction_presence == 1 else 'No'}
    return jsonify(response)


if __name__ == '__main__':
    app.run()