In [2]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skimage.io import imread
from skimage.transform import resize

In [3]:
# Extract the training and test image archives into the shared data folder
extract_dir = '/Users/jpinelo/Dropbox/JP_Lab/AIRCentre/2-Projects/54-Internal_waves/Data/'
archive_paths = (
    '/Users/jpinelo/Dropbox/JP_Lab/AIRCentre/2-Projects/54-Internal_waves/Data/images_train.zip',
    '/Users/jpinelo/Dropbox/JP_Lab/AIRCentre/2-Projects/54-Internal_waves/Data/images_test.zip',
)
for archive_path in archive_paths:
    # The context manager closes each archive as soon as it has been extracted
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)

In [5]:
# Seed NumPy's global random number generator so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)

In [6]:
# Define paths
# Every input lives under a single data root. Set the INTERNAL_WAVES_DATA_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original hard-coded folder is used, so the resulting paths are unchanged.
DATA_DIR = os.environ.get(
    'INTERNAL_WAVES_DATA_DIR',
    '/Users/jpinelo/Dropbox/JP_Lab/AIRCentre/2-Projects/54-Internal_waves/Data',
)

# The trailing '' makes os.path.join end the two directory paths with a separator
train_dir = os.path.join(DATA_DIR, 'images_train', '')
test_dir = os.path.join(DATA_DIR, 'images_test', '')
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
solution_csv = os.path.join(DATA_DIR, 'solution.csv')

In [46]:
# Load CSV files; each id gets a '.png' suffix so it can be used as an image file name
def _read_with_png_ids(csv_path):
    """Read `csv_path` and return the table with '.png' appended to the string form of `id`."""
    table = pd.read_csv(csv_path)
    table['id'] = table['id'].astype(str) + '.png'
    return table


train_df = _read_with_png_ids(train_csv)
test_df = _read_with_png_ids(test_csv)
solution_df = _read_with_png_ids(solution_csv)

In [47]:
def load_images(directory, df, output_shape=(50, 50, 4)):
    """Load the images listed in ``df['id']`` as flattened feature vectors.

    Parameters
    ----------
    directory : str
        Folder that contains the image files.
    df : pandas.DataFrame
        Table whose ``id`` column holds the image file names, joined onto
        ``directory`` to build each path.
    output_shape : tuple of int, optional
        Shape every image is resized to before it is flattened. Defaults to
        ``(50, 50, 4)``, the value that used to be hard-coded.
        NOTE(review): the trailing 4 presumably means RGBA input - confirm
        against the actual image files.

    Returns
    -------
    numpy.ndarray
        One row per entry of ``df['id']``, in the same order, each row being
        the flattened resized image. An empty ``df`` yields an empty 1-D array.
    """
    images = []
    for img_name in df['id']:
        img = imread(os.path.join(directory, img_name))
        # Resize to a small, fixed shape so every row has the same length
        # and processing stays fast
        images.append(resize(img, output_shape).flatten())
    return np.array(images)

In [48]:
# Pull the training labels, then build the flattened-image feature matrices
y_train = train_df['ground_truth'].values
X_train = load_images(directory=train_dir, df=train_df)
X_test = load_images(directory=test_dir, df=test_df)

In [49]:
# Hold out 20% of the labelled data as a validation set.
# Note: X_train / y_train are rebound to the remaining 80%, so re-running this
# cell without re-running the loading cell splits the already-reduced set again.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [50]:
# Standardize the features; the scaler is fitted on the training split only and
# the same fitted scaler is then applied to the validation and test features
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [51]:
# Configure a multi-layer perceptron with hidden layers of 100 and 50 units,
# then fit it on the scaled training split
mlp_settings = dict(hidden_layer_sizes=(100, 50), max_iter=5000, random_state=42)
model = MLPClassifier(**mlp_settings)
model.fit(X_train_scaled, y_train)

In [52]:
# Score the fitted model on the held-out validation split
val_predictions = model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.5609


In [53]:
# Make predictions on the test set.
# The model is given the scaled test features (X_test_scaled), i.e. the output
# of the same scaler that was fitted on the training split.
test_predictions = model.predict(X_test_scaled)

In [54]:
# Attach the predictions to the test table, then join solution_df onto it by
# image id so predictions and reference labels sit side by side
test_df['predicted'] = test_predictions
merged_df = test_df.merge(solution_df, on='id')

In [57]:
# Remove the '.png' suffix that was appended when the CSV files were loaded,
# restoring the original ids before the table is written out.
# The dot is escaped, the pattern is anchored to the end, and regex=True is
# spelled out: the default of `regex` differs between pandas versions, and as a
# regex the bare '.png' would also match any character followed by 'png'.
test_df['id'] = test_df['id'].str.replace(r'\.png$', '', regex=True)

In [58]:
# Write the test table (including the 'predicted' column added to it earlier)
# to the baseline submission file, leaving out the DataFrame index
test_df.to_csv('submission-baseline.csv', index=False)

In [None]:
# Calculate performance metrics from the merged table: reference labels
# ('ground_truth') against the model output ('predicted')
y_true = merged_df['ground_truth']
y_pred = merged_df['predicted']
accuracy, precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Print performance metrics, one per line, each rounded to four decimals
report_lines = [
    f"Test Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
]
print("\n".join(report_lines))