In [None]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch
import torch.nn as nn
import torch.optim as optim

from PIL import Image
from PIL import UnidentifiedImageError, Image
from pathlib import Path


from collections import defaultdict
import seaborn as snsu

In [None]:
# Step 3: Mount Google Drive at /content/drive.
# The dataset paths configured later in this notebook all live under that mount point.
# `google.colab` is supplied by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ==== CONFIG ==== #

# Single root for the competition data on the mounted Drive. Every location
# below is derived from it, so moving the dataset means editing one line.
DATA_ROOT = Path('/content/drive/MyDrive/soil-classification-part-2/soil_competition-2025')

# Image folders (pathlib.Path objects).
TRAIN_DIR = DATA_ROOT / 'train'
TEST_DIR = DATA_ROOT / 'test'

# CSV locations are kept as plain strings so any code that treats them as
# `str` keeps working unchanged.
train_csv = str(DATA_ROOT / 'train_labels.csv')
test_csv = str(DATA_ROOT / 'test_ids.csv')

# Training labels table; requires the Drive mount to be in place.
train_df = pd.read_csv(train_csv)

In [None]:
# Tally the file extensions found directly inside TEST_DIR.
test_files = os.listdir(TEST_DIR)

# Regular files only; the extension is lower-cased and stored without its dot.
extensions = [
    os.path.splitext(entry)[1].lower().replace(".", "")
    for entry in test_files
    if os.path.isfile(os.path.join(TEST_DIR, entry))
]

# Report how many files carry each extension.
ext_counts = Counter(extensions)
print(" Unique image file types in test set:")
for ext, count in ext_counts.items():
    print(f" {ext}: {count} file(s)")


In [None]:
# Tally the file extensions found directly inside TRAIN_DIR.
train_files = os.listdir(TRAIN_DIR)

# Regular files only; the extension is lower-cased and stored without its dot.
extensions = [
    os.path.splitext(entry)[1].lower().replace(".", "")
    for entry in train_files
    if os.path.isfile(os.path.join(TRAIN_DIR, entry))
]

# Report how many files carry each extension.
ext_counts = Counter(extensions)
print(" Unique image file types in training set:")
for ext, count in ext_counts.items():
    print(f"{ext}: {count} file(s)")


In [None]:
# Extracting file names of special types (png, webp and gif) for analysis
import os

def list_specific_file_types(directory, extensions_to_find):
    """Return the names of regular files in ``directory`` that have a wanted extension.

    Args:
        directory: Folder to scan (str or path-like). Only its direct entries
            are examined; sub-directories are neither returned nor descended into.
        extensions_to_find: A single extension or an iterable of extensions.
            Matching is case-insensitive and a leading dot is optional, so
            ``"png"``, ``".png"`` and ``".PNG"`` are equivalent.

    Returns:
        list[str]: Matching file names (not full paths), in ``os.listdir`` order.
    """
    # A bare string would otherwise be searched as a sequence of characters:
    # "" (files without extension) and fragments like "pn" would match "png".
    if isinstance(extensions_to_find, str):
        extensions_to_find = [extensions_to_find]

    # Normalise once so callers may pass ".PNG" as well as "png".
    wanted = {ext.lower().lstrip('.') for ext in extensions_to_find}

    return [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
        and os.path.splitext(name)[1].lower().lstrip('.') in wanted
    ]

# Extensions singled out for a closer look.
target_extensions = {'png', 'webp', 'gif'}

# Same lookup for both dataset folders, train first and then test.
train_specific_files, test_specific_files = (
    list_specific_file_types(folder, target_extensions)
    for folder in (TRAIN_DIR, TEST_DIR)
)

# Print one bullet per matching file, grouped by dataset.
print(" Train images with png, webp, or gif extensions:")
for fname in train_specific_files:
    print("•", fname)

print("\n Test images with png, webp, or gif extensions:")
for fname in test_specific_files:
    print("•", fname)


In [None]:
# 3. Image resolution stats
def get_image_dims(path_list):
    """Collect the pixel dimensions of every image in ``path_list`` that can be opened.

    Args:
        path_list: Iterable of paths handed to ``Image.open`` one at a time.

    Returns:
        pandas.DataFrame: Columns ``width`` and ``height``, one row per entry
        that opened successfully. Entries that fail to open are skipped, so
        the frame may have fewer rows than ``path_list`` has items.
    """
    dims = []
    for file in path_list:
        try:
            with Image.open(file) as img:
                dims.append(img.size)
        except Exception:
            # Best-effort scan: anything that is not a readable image is skipped.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate, so a long scan can still be interrupted.
            continue
    return pd.DataFrame(dims, columns=["width", "height"])

# Every entry directly inside each dataset folder, as Path objects.
train_files = [*TRAIN_DIR.glob("*")]
test_files = [*TEST_DIR.glob("*")]

# Width/height table per dataset, train first and then test.
train_dims_df, test_dims_df = map(get_image_dims, (train_files, test_files))

# Summary statistics for both sets; this tuple is the cell's displayed value.
train_dims_df.describe(), test_dims_df.describe()


In [None]:
# Function to visualize a dataset (train/test)

def analyze_image_folder(folder_path, dataset_name="Dataset"):
    """Print and return size statistics for the images directly inside a folder.

    Args:
        folder_path: Folder to scan (str or path-like). Only regular files that
            sit directly in the folder are examined.
        dataset_name: Label used in the printed report.

    Returns:
        ``None`` when no file could be opened as an image. Otherwise a tuple
        ``(widths, heights, aspect_ratios, file_sizes)``:
        ``widths`` and ``heights`` are tuples of pixel sizes, ``aspect_ratios``
        is a NumPy array of width / height, and ``file_sizes`` is a list of
        sizes in bytes. All four are aligned and follow sorted path order.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed (e.g. it does not exist).
    """
    dimensions = []
    file_sizes = []
    corrupt_files = []

    folder_path = Path(folder_path)

    # Sorted so the returned sequences have a reproducible order across runs.
    for img_path in sorted(folder_path.iterdir()):
        # Sub-directories are not images; opening one raises an OSError subclass.
        if not img_path.is_file():
            continue
        try:
            with Image.open(img_path) as img:
                size = img.size
            n_bytes = os.path.getsize(img_path)
        except OSError:
            # Pillow's UnidentifiedImageError derives from OSError, so this covers
            # undecodable files as well as permission or read failures.
            corrupt_files.append(img_path.name)
            continue
        # Append only after both reads succeeded so the two lists stay aligned.
        dimensions.append(size)
        file_sizes.append(n_bytes)

    # Skip analysis if no images were read
    if not dimensions:
        print(f"❌ No valid images found in {dataset_name}")
        return None

    # Split the (width, height) pairs into parallel sequences.
    widths, heights = zip(*dimensions)
    aspect_ratios = np.array(widths) / np.array(heights)

    print(f"\n📂 {dataset_name} Analysis")
    print(f"⚠️ Corrupt/Unreadable Files: {len(corrupt_files)}")
    if corrupt_files:
        print("Corrupt file names:", corrupt_files)

    print(f"• Aspect Ratio: min={aspect_ratios.min():.2f}, max={aspect_ratios.max():.2f}")
    print(f"• File Size (KB): min={np.min(file_sizes)/1024:.1f}, max={np.max(file_sizes)/1024:.1f}, mean={np.mean(file_sizes)/1024:.1f}")

    return widths, heights, aspect_ratios, file_sizes
