# ML Model for Groundwater Detection Based on Resistivity Values
**Author: Camilo Mejía**

Import Dependencies

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import folium
from flask import Flask, request, jsonify
from sklearn.pipeline import Pipeline

Load and preprocess the dataset

In [None]:
# Read the tab-separated measurements file into a DataFrame.
# Change `sep` if the file uses a different field separator.
data = pd.read_csv("my_data.dat", sep="\t")
# ... Data preprocessing steps here

# Step 2: Feature selection/extraction

This step assumes the dataset has one column of resistivity values per depth level,
for example: depth_1, depth_2, ..., depth_n

Summary statistics computed across the depth columns of each row can then be used as features,
for example: mean, median, min, max, standard deviation, etc.

In [None]:
# Row-wise (axis=1) summary features computed across the depth columns.
depth_columns = ['depth_1', 'depth_2', 'depth_n']
depth_readings = data[depth_columns]
data['mean_resistivity'] = depth_readings.mean(axis=1)
data['median_resistivity'] = depth_readings.median(axis=1)
# ... Add more feature extraction techniques based on domain knowledge


Drop the original depth columns now that the summary features have been computed

In [None]:
# Remove the raw depth columns from `data` in place.
data.drop(columns=['depth_1', 'depth_2', 'depth_n'], inplace=True)

# Step 3: Data splitting

In [None]:
# Separate the label column from the remaining feature columns.
y = data["groundwater_presence"]  # Target variable
X = data.drop(columns="groundwater_presence")  # Features

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Model selection

In [None]:
# Baseline random forest with the library's default hyperparameters;
# the fixed random_state seeds the estimator's internal randomness.
model = RandomForestClassifier(random_state=42)

# Step 5: Model training

In [None]:
# Fit the baseline classifier on the training split only.
model.fit(X_train, y_train)

# Step 6: Model evaluation

In [None]:
# Score the baseline model on the held-out test split.
y_pred = model.predict(X_test)

# Overall fraction of correctly classified test samples.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class text report.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", classification_rep)

# Step 7: Hyperparameter tuning (optional, but Steps 8-10 use the resulting `best_model`)

In [None]:
# Two-step pipeline: standardise the features, then classify with a random forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('clf', RandomForestClassifier(random_state=42))
])

# Parameters of a pipeline step must be addressed as "<step name>__<parameter>".
# Without the "clf__" prefix GridSearchCV rejects the grid with an
# "Invalid parameter" error because the Pipeline itself has no such parameters.
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, using all available CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Pipeline configured with the best parameter combination found by the search.
best_model = grid_search.best_estimator_


# Step 8: Model interpretation

In [None]:
# Pair every training column with the importance reported by the
# random-forest step of the tuned pipeline.
feature_names = X_train.columns
importances = best_model.named_steps['clf'].feature_importances_
feature_importances = {
    name: score for name, score in zip(feature_names, importances)
}

# Rank the features from most to least important.
sorted_feature_importances = sorted(
    feature_importances.items(),
    key=lambda item: item[1],
    reverse=True,
)

print("Feature Importances:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")


# Step 9: Prediction and visualization

In [None]:
# Centre the map on the mean coordinates of the test points; these two names
# were previously used without ever being defined.
latitude_center = float(X_test["latitude"].mean())  # Adjust column names accordingly
longitude_center = float(X_test["longitude"].mean())
m = folium.Map(location=[latitude_center, longitude_center], zoom_start=10)

# Predict the whole test split in one call. Passing the DataFrame keeps the
# feature names the model was fitted with and avoids one predict() call per row.
predictions = best_model.predict(X_test)

# Add one marker per test sample, labelled with its predicted class.
for lat, lon, prediction in zip(X_test["latitude"], X_test["longitude"], predictions):
    popup_text = f"Groundwater: {'Present' if prediction == 1 else 'Absent'}"
    popup = folium.Popup(popup_text, parse_html=True)
    marker = folium.Marker([float(lat), float(lon)], popup=popup)
    marker.add_to(m)

m.save("groundwater_predictions_map.html")

# Step 10: Model deployment (optional)

In [None]:
app = Flask(__name__)

# Feature columns, in the order the model was trained with. Every request
# must supply a value for each of them.
FEATURE_COLUMNS = list(X_train.columns)


@app.route('/predict', methods=['POST'])
def predict():
    """Return the tuned model's prediction for one JSON-encoded sample.

    The request body must be a JSON object mapping every training feature
    name to its value. On success the response is ``{"prediction": <label>}``.
    A malformed body, a missing feature, or values the model cannot process
    produce a 400 response of the form ``{"error": <message>}``.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object of feature values.'}), 400

    missing = [column for column in FEATURE_COLUMNS if column not in payload]
    if missing:
        return jsonify({'error': f'Missing features: {missing}'}), 400

    # A one-row DataFrame in training column order: the model cannot consume
    # a raw dict, and this also ignores any extra keys in the payload.
    features = pd.DataFrame([payload], columns=FEATURE_COLUMNS)
    try:
        prediction = best_model.predict(features)[0]
    except (ValueError, TypeError) as exc:
        return jsonify({'error': f'Invalid feature values: {exc}'}), 400

    # predict() yields numpy scalars, which jsonify cannot serialise;
    # convert to the equivalent built-in Python value first.
    if hasattr(prediction, 'item'):
        prediction = prediction.item()

    response = {'prediction': prediction}
    return jsonify(response)


if __name__ == '__main__':
    app.run()