In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Automatic Identification of Internal Waves Competition
+ This notebook belongs to the competition Automatic Identification of Internal Waves.
+ This notebook was created to test the competition data.
+ The performance results provide a baseline for the competition.

In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skimage.io import imread
from skimage.transform import resize



### Set random seed for reproducibility

In [None]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=42)

### Define paths

In [None]:
# Image folders for the training and test sets (trailing slash included).
train_dir, test_dir = (
    '/kaggle/input/automatic-identification-of-internal-waves/images_train-20240709T094004Z-001/images_train/',
    '/kaggle/input/automatic-identification-of-internal-waves/images_test-20240709T093512Z-001/images_test/',
)

# CSV tables: training rows, test rows, and the solution used to score the test set.
train_csv, test_csv, solution_csv = (
    '/kaggle/input/automatic-identification-of-internal-waves/train.csv',
    '/kaggle/input/automatic-identification-of-internal-waves/test.csv',
    '/kaggle/input/automatic-identification-of-internal-waves/solution.csv',
)


### Load CSV files

In [None]:
# Read the three tables in order and turn each 'id' into the matching image
# file name by casting it to str and appending the '.png' extension.
train_df, test_df, solution_df = (
    pd.read_csv(csv_path).assign(id=lambda frame: frame['id'].astype(str) + '.png')
    for csv_path in (train_csv, test_csv, solution_csv)
)

### Define function to load images
Images are resized for faster processing on this baseline/demo.

In [None]:
def load_images(directory, df, target_shape=(50, 50, 4)):
    """Read every image named in ``df['id']`` and return one flattened row per image.

    Parameters
    ----------
    directory : str
        Folder that contains the image files.
    df : pandas.DataFrame
        Table whose ``'id'`` column holds the image file names, joined onto
        ``directory`` to build each path. Output rows follow the order of
        this column.
    target_shape : tuple of int, optional
        Shape each image is resized to before it is flattened. Defaults to
        ``(50, 50, 4)``, a small size chosen for faster processing.
        NOTE(review): the trailing 4 presumably matches RGBA PNG input --
        confirm against the dataset.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per image and ``prod(target_shape)`` columns.
        An empty ``df`` yields a ``(0, prod(target_shape))`` array.
    """
    images = [
        resize(imread(os.path.join(directory, img_name)), target_shape).flatten()
        for img_name in df['id']
    ]
    if not images:
        # np.array([]) would be 1-D with shape (0,); keep the feature axis so
        # callers always receive a 2-D feature matrix.
        return np.empty((0, int(np.prod(target_shape))))
    return np.array(images)

### Load and preprocess the images

In [None]:
# Build one flattened feature row per image, training set first, then test set.
X_train, X_test = (
    load_images(image_dir, frame)
    for image_dir, frame in ((train_dir, train_df), (test_dir, test_df))
)

# Labels for the training images, taken from the 'ground_truth' column.
y_train = train_df['ground_truth'].values

### Split training data into train and validation sets

In [None]:
# Hold out 20% of the labelled images for validation, with a fixed random_state.
# NOTE: X_train / y_train are rebound to the remaining training portion, so
# re-running this cell without first re-running the image-loading cell splits
# the already-reduced set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

### Normalize the data

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_val, X_test)
)

### Create and train the model

In [None]:
# Multi-layer perceptron baseline with hidden_layer_sizes=(100, 50) and a
# fixed random_state.
model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=5000,
    random_state=42,
)
# Train on the scaled training split (kept as the last expression of the cell).
model.fit(X_train_scaled, y_train)

### Use the model to make predictions on the validation set

In [None]:
# Predict on the held-out validation split and report the accuracy against
# its labels.
val_predictions = model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

### Make predictions on the test set

In [None]:
# Predict a label for every scaled test feature row.
test_predictions = model.predict(X=X_test_scaled)

### Compare predictions with the solution provided (test set)

In [None]:
# Store the predictions on the test table as a new 'predicted' column.
test_df['predicted'] = test_predictions

# Join predictions with the solution table on the image file name.
merged_df = pd.merge(
    left=test_df,
    right=solution_df,
    on='id',
)

### Calculate performance metrics

In [None]:

# Score the test predictions against the solution labels with each metric,
# evaluated in the order accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(merged_df['ground_truth'], merged_df['predicted'])
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

### Print performance metrics

In [None]:
# Emit the four test-set metrics, one per line, rounded to four decimals.
print(
    f"Test Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)