In [1]:
# -*- coding: utf-8 -*-
"""
PESCADATA Fish Acoustics: Some Machine Learning Modelling
Created on Tue Sep 17 08:44:33 2024

@author: jmanitz
"""

'\nPESCADATA Fish Acoustics: Some Machine Learning Modelling\nCreated on Tue Sep 17 08:44:33 2024\n\n@author: jmanitz\n'

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [21]:
# Load the acoustic logbook export; source: https://ihma.org.pe/bitacoras-acusticas/
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on the author's machine as written.
path = "/Users/navne/OneDrive/Desktop/Jule/Documents/climate_projects/ocean_capstone/acoustics_data_2020.csv"
dt = pd.read_csv(
    path,
    delimiter=";",           # the export is semicolon-separated
    skipinitialspace=True,   # ignore blanks that follow each delimiter
    low_memory=False,        # read in one pass rather than in chunks
)

# Build the response: every Region_class label listed here is folded into a
# single "Other" group; labels that are not listed are kept unchanged.
dt["Species_grp"] = dt["Region_class"].replace(
    dict.fromkeys(
        (
            "BAG", "POT", "EU", "Unclassified", "OTR", "BON",
            "MIC", "PG", "JC", "CAM", "CAB",
        ),
        "Other",
    )
)
dt["Species_grp"].value_counts()

Species_grp
ANC      13763
MUN       5184
VIN       4567
Other     2162
JUR       1716
Name: count, dtype: int64

In [23]:
# Select relevant features: the grouped species label plus the numeric columns
# summarised below. The raw "Region_class" label is left out on purpose because
# "Species_grp" is derived from it.
# NOTE(review): the summary shows Sv_mean/Sv_min/Sv_max minima of exactly -999.0,
# which looks like a missing-value sentinel rather than a measurement - confirm.
features = [
    "Species_grp",
    "Height_mean",
    "Depth_mean",
    "Sv_mean",
    "Sv_min",
    "Sv_max",
    "Sv_noise",
    "Lon_M",
    "Lat_M",
    "NASC",
    "ABC",
]
dt[features].describe()

Unnamed: 0,Height_mean,Depth_mean,Sv_mean,Sv_min,Sv_max,Sv_noise,Lon_M,Lat_M,NASC,ABC
count,27392.0,27392.0,27392.0,27392.0,27392.0,27392.0,27392.0,27392.0,27392.0,27392.0
mean,1.611526,34.87797,-160.970967,-167.704261,-151.590879,-980.418479,-77.126439,-13.145123,4095.618,9.502279e-05
std,2.417622,55.066196,297.059548,294.553784,300.393569,10.650949,1.064746,1.965452,104235.1,0.002418366
min,0.002306,0.725327,-999.0,-999.0,-999.0,-1006.124041,-81.505565,-18.251702,0.0,0.0
25%,0.494609,7.498035,-65.007585,-64.988363,-55.219006,-986.730548,-77.286147,-14.563022,9.705766,2.25e-07
50%,0.787972,12.852724,-56.805446,-64.817348,-47.611876,-980.091839,-77.093634,-13.47516,78.1975,1.815e-06
75%,1.644,27.789329,-50.498772,-63.641564,-40.971255,-973.618896,-76.520483,-12.079879,370.7061,8.6e-06
max,43.515127,315.094719,-7.530407,-42.212651,-0.253201,-941.01428,-72.096534,-8.282912,16627870.0,0.3857846


In [25]:
# Step 1: Basic preprocessing
def prep_data(data):
    """Return a copy of ``data`` with its categorical columns one-hot encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the selected feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame in which each categorical column is replaced by indicator
        columns. The input frame is not modified and no imputation of missing
        values is performed here.
    """
    # drop_first=True omits the indicator for the first level of every encoded
    # column, so that level is represented by all remaining indicators being off.
    return pd.get_dummies(data, drop_first=True)

# NOTE: this rebinds `dt` to the encoded frame, so the cell is not re-runnable on
# its own. After it runs, "Species_grp" has been replaced by indicator columns,
# and a second `dt[features]` lookup raises KeyError until the data-loading cell
# is executed again.
dt = prep_data(dt[features])

In [33]:
# Step 2: Model Training
def train_model(data, target='NASC', model_path='nasc_model.pkl',
                test_size=0.2, n_estimators=100, random_state=42, exclude=()):
    """Fit a random-forest regressor for ``target`` and persist it with joblib.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded frame that contains the ``target`` column. Every other column
        (minus ``exclude``) is used as a predictor.
    target : str, default 'NASC'
        Name of the response column.
    model_path : str, default 'nasc_model.pkl'
        File the fitted model is written to.
    test_size : float, default 0.2
        Share of rows held out for the reported test error.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the train/test split and the forest.
    exclude : iterable of str, default ()
        Extra columns to drop from the predictors.

    Returns
    -------
    RandomForestRegressor
        The fitted model (the same object that is written to ``model_path``).

    Raises
    ------
    KeyError
        If ``target`` (or a column named in ``exclude``) is not in ``data``.
    """
    if target not in data.columns:
        raise KeyError(f"target column {target!r} not found in data")

    # NOTE(review): in the describe() summary of the selected features the ratio
    # NASC / ABC is ~4.31e7 at every quantile, so ABC looks like a rescaled copy
    # of the target. If that is confirmed, pass exclude=["ABC"]; otherwise the
    # model is effectively handed the answer.
    X = data.drop(columns=[target, *list(exclude)])  # Features
    y = data[target]                                 # Target variable

    # Hold out a test split; a fixed seed keeps the reported error reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train model
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows. scikit-learn metrics take (y_true, y_pred).
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

    # Save model
    joblib.dump(model, model_path)
    return model

# Fit on the encoded frame. Side effect: train_model also writes the fitted
# model to 'nasc_model.pkl' (a relative path, so the current working directory).
m1 = train_model(dt)

Mean Squared Error: 39359731504.529


In [None]:
# Step 3: Model Inference (Predict Function)
def predict_churn(input_data, model_path='nasc_model.pkl'):
    """Predict NASC for one observation with the persisted model.

    The function name is kept for existing callers; the model it loads is the
    NASC regressor, not a churn classifier.

    Parameters
    ----------
    input_data : dict, pandas.Series or pandas.DataFrame
        Feature values for the observation, keyed by the column names the model
        was trained on. If a DataFrame is given, its first row is the one whose
        prediction is returned.
    model_path : str, default 'nasc_model.pkl'
        joblib file holding the fitted model.

    Returns
    -------
    dict
        ``{'nasc_prediction': <float>}``.

    Raises
    ------
    ValueError
        If the supplied feature names do not match the names recorded by the
        model at fit time.
    """
    # joblib files are pickles: only load model files you created yourself.
    model = joblib.load(model_path)

    # A dict/Series becomes a one-row frame; a DataFrame is used as given
    # (wrapping it in a list would not produce a usable frame).
    if isinstance(input_data, pd.DataFrame):
        input_df = input_data
    else:
        input_df = pd.DataFrame([input_data])

    # Estimators fitted on a DataFrame with string column names record them in
    # feature_names_in_. Use that to fail with a readable message and to put
    # the columns in training order, since dict key order is arbitrary.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None:
        expected = list(expected)
        missing = [col for col in expected if col not in input_df.columns]
        unexpected = [col for col in input_df.columns if col not in expected]
        if missing or unexpected:
            raise ValueError(
                "input features do not match the trained model "
                f"(missing: {missing}, unexpected: {unexpected})"
            )
        input_df = input_df[expected]

    # Predict. NASC is continuous, so report a float instead of truncating
    # the value to an int.
    prediction = model.predict(input_df)
    return {'nasc_prediction': float(prediction[0])}

In [None]:
# Step 4: Model Evaluation (Monitoring and Maintenance)
def evaluate_model(data, target='NASC', model_path='nasc_model.pkl'):
    """Report the persisted model's mean squared error on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded frame that contains the ``target`` column and the predictor
        columns the model was trained on.
    target : str, default 'NASC'
        Name of the response column.
    model_path : str, default 'nasc_model.pkl'
        joblib file holding the fitted model.

    Returns
    -------
    float
        Mean squared error of the model's predictions on ``data``.

    Raises
    ------
    KeyError
        If ``target`` or a feature recorded by the model is not in ``data``.

    Notes
    -----
    If ``data`` includes the rows the model was trained on, the reported error
    is optimistic compared with a held-out estimate.
    """
    # Load features and target
    X = data.drop(columns=[target])
    y = data[target]

    # joblib files are pickles: only load model files you created yourself.
    model = joblib.load(model_path)

    # If the model recorded its training columns, select exactly those, in
    # training order, so extra columns in `data` do not break prediction.
    trained_on = getattr(model, 'feature_names_in_', None)
    if trained_on is not None:
        X = X[list(trained_on)]

    y_pred = model.predict(X)

    # The model is a regressor, so report a regression metric; a classification
    # accuracy score is not defined for a continuous target.
    mse = mean_squared_error(y, y_pred)
    print("Overall Evaluation MSE:", mse)
    return mse

In [None]:


# Running the Pipeline
if __name__ == '__main__':
    # Preprocess data. `dt` may already have been encoded by an earlier cell, in
    # which case "Species_grp" no longer exists and `dt[features]` would raise
    # KeyError; only encode when the raw column is still present.
    data = prep_data(dt[features]) if "Species_grp" in dt.columns else dt

    # Train model
    model = train_model(data)

    # Simulate a prediction using a real row, so the feature names and their
    # order match what the model was trained on.
    sample_data = data.drop(columns='NASC').iloc[0].to_dict()
    prediction_result = predict_churn(sample_data)
    print("Sample Prediction:", prediction_result)

    # Evaluate model on full dataset
    evaluate_model(data)
