# **import from Kaggle**

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded download URL>" entries consumed
# by the download loop below.
# NOTE(review): the URL is a pre-signed link carrying X-Goog-Date=20240414T161239Z
# and X-Goog-Expires=259200, so it looks time-limited; regenerate it from Kaggle
# if the download fails.
DATA_SOURCE_MAPPING = 'playground-series-s4e4:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F72489%2F8096274%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240414%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240414T161239Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3a466f9f2bcea7b05fdcd142093ea8a60561b12f77110e73e701a3f6e513d2449087ce1dfc5d340dc96466ae8b9c54e465b8dcb6316e95736078759cea48d60da5bb74df01b66604b755eaf40e962dbaab06fbb2f9e93b6b533eb4f2b2103eddea99ab46ce1a16d95215543ef5290c5fe23c75d0520f782ecbf258924fff86b69d174008252a173ebe2b0adeb9551b703dc8e828b788c68de8cde7ba5eb0497bbda8985b44e78816fa518b4f94b96b1cccddc881126026b63746ae565c7eff91e378baa21b96f716fb4ca6c87ac140a5895ef6368980b1176244a5f19431015c29711f034c6a196ea60fd5939b951a5fbbab494d9f808c230dc12551fffac334'

# Directory the downloaded data sources are unpacked into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory used for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

# Best-effort unmount of anything mounted at the input path; stderr is discarded.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then (re)create the input and working dirs.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working relative to the current
# working directory. A link that already exists is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: everything after it is the (still encoded) URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional. Without it the size is unknown, so the
            # progress bar is replaced by a plain byte counter instead of
            # crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind before the archive readers
            # touch the file (tarfile reopens it by name).
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be called `tarfile`: that would rebind the
                # module name and break every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading playground-series-s4e4, 2529839 bytes compressed
Downloaded and uncompressed: playground-series-s4e4
Data source import complete.


# **import packages**

In [None]:
!pip install pytorch-lightning qqq
!pip install segmentation-models-pytorch qqq
!pip install wandb qqq
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset, random_split
import torch
import torch.nn as nn
import pytorch_lightning as pl
import segmentation_models_pytorch as smp
from pytorch_lightning.loggers import WandbLogger
import matplotlib.pyplot as plt

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.2-py3-none-any.whl (801 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.9/801.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting qqq
  Downloading qqq-0.0.1-py3-none-any.whl (1.3 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)


# **data preprocessing**

In [None]:
# Load the training dataset
train_data_path = "/kaggle/input/playground-series-s4e4/train.csv"
train_data = pd.read_csv(train_data_path)

# Display the first 5 rows of the training dataset
print(train_data.head())
print('#################################################################')

# Column dtypes and non-null counts. DataFrame.info() writes its report itself
# and returns None, so it is called directly; wrapping it in print() appended a
# stray "None" line to the output.
train_data.info()
print('#################################################################')

# Summary statistics of the numeric columns
print(train_data.describe())
print('#################################################################')


In [None]:
# Scaler used for the normalization step below
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')

# Display basic information about the data.
# info() prints its own report and returns None, so it is not wrapped in print().
print("Initial Data Information:")
data.info()
print('#################################################################')

# 1. Check and handle records with height 0
zero_height = data[data['Height'] == 0]
print("Number of records with height 0:", zero_height.shape[0])
print('#################################################################')

# Drop the zero-height rows. The explicit copy makes `data` an independent frame,
# so the column assignments below do not write into a slice of the original
# (which raises SettingWithCopyWarning and may silently not stick).
# Alternative: keep the rows and replace the zeros with the median height.
data = data[data['Height'] > 0].copy()

# 2. Label-encode the categorical column as floats.
# Values outside {'M', 'F', 'I'} would become NaN.
data['Sex'] = data['Sex'].map({'M': 0.0, 'F': 1.0, 'I': 2.0}).astype(float)

# 3. Data normalization
# Define columns to be normalized
features_to_scale = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']

# NOTE(review): the scaler is fitted on the full table before the train/validation
# split done later in the notebook, so validation statistics leak into training.
scaler = StandardScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

# Display a sample of the normalized data
print("Sample of normalized data:")
print(data[features_to_scale].head())

# Save the cleaned data to the specified folder (the duplicated confirmation
# message was removed; one line is enough).
data.to_csv('/kaggle/working/cleaned_data.csv', index=False)
print("Data cleaning completed and saved to '/kaggle/working/cleaned_data.csv'.")
print('#################################################################')

# Display basic information about the cleaned data
print("Cleaned Data Information:")
data.info()
print('#################################################################')

# Display the first 5 rows of the cleaned dataset
print("First 5 rows of cleaned dataset:")
print(data.head())
print('#################################################################')


In [None]:
# Display basic information about the cleaned and normalized data.
# info() prints its own report and returns None, so wrapping it in print() only
# added a stray "None" line.
data.info()
print('#################################################################')

# Display the first 5 rows of the cleaned dataset
print("First 5 rows of cleaned dataset:")
print(data.head())
print('#################################################################')


In [None]:
class AbaloneDataset(Dataset):
    """
    Dataset that turns each row of the Abalone table into a 1x32x32 tensor.

    The column layout is positional: the first column is skipped as the id, the
    last column is the target (Rings), and the columns in between are the
    features.
    """

    def __init__(self, dataframe):
        """
        Initialize the dataset with the provided DataFrame.

        Args:
            dataframe (pandas.DataFrame): The input DataFrame containing the Abalone data.
        """
        self.dataframe = dataframe

    def __len__(self):
        """
        Return the length of the dataset (number of samples).

        Returns:
            int: The number of rows in the DataFrame.
        """
        return len(self.dataframe)

    def expand_and_fill(self, data):
        """
        Spread the first 8 features over a 32x32 single-channel image.

        Feature ``i`` fills rows ``4*i`` to ``4*i + 3`` with its value.

        Args:
            data (torch.Tensor): 1-D tensor holding at least 8 feature values.

        Returns:
            torch.Tensor: Tensor of shape [1, 32, 32] (a leading channel
            dimension is added).
        """
        output = torch.zeros((32, 32))

        # Repeat each feature 4 times to get one value per row, then broadcast
        # that column across all 32 columns of the image.
        output[:] = data[:8].repeat_interleave(4).unsqueeze(1)

        return output.unsqueeze(0)

    def __getitem__(self, idx):
        """
        Get a single data sample and its corresponding target value.

        Args:
            idx (int): The index of the sample in the dataset.

        Returns:
            tuple: ``(data, target)`` where ``data`` is a float32 tensor of shape
            [1, 32, 32] and ``target`` is a float32 scalar tensor.
        """
        row = self.dataframe.iloc[idx]

        # Use explicit positional access. Plain `row[-1]` on a label-indexed
        # Series relies on a deprecated positional fallback in pandas.
        features = torch.tensor(row.iloc[1:-1].to_numpy(dtype=np.float32))

        # Expand and fill the features into the image
        data = self.expand_and_fill(features)

        # Get the target value (last column, Rings)
        target = torch.tensor(row.iloc[-1], dtype=torch.float32)

        return data, target


# Load the cleaned data written by the preprocessing cell
dataframe = pd.read_csv('/kaggle/working/cleaned_data.csv')

# Create the dataset
dataset = AbaloneDataset(dataframe)

# Determine the size of the training and validation sets
val_size = int(0.2 * len(dataset))  # Validation set size is 20% of the dataset
train_size = len(dataset) - val_size  # Training set size is the remaining portion

# Split with a fixed seed. A checkpoint is reloaded and scored on `val_loader`
# later in the notebook, so the validation rows must be the same after a kernel
# restart; an unseeded split would mix training rows into that evaluation.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)

# Check the output of the data loaders. The loop variables deliberately avoid the
# name `data`, which is the cleaned DataFrame from the preprocessing cell and
# must not be overwritten by a tensor batch.
for batch_inputs, batch_targets in train_loader:
    print("Train batch - Data shape:", batch_inputs.shape, "; Target shape:", batch_targets.shape)
    break

for batch_inputs, batch_targets in val_loader:
    print("Validation batch - Data shape:", batch_inputs.shape, "; Target shape:", batch_targets.shape)
    break


In [None]:
# Get the first batch of data from the train_loader
inputs, targets = next(iter(train_loader))

# Print the shapes and some data points to confirm correct data loading
print("Input batch shape:", inputs.shape)
print("Target batch shape:", targets.shape)
print("First few inputs:", inputs[0])
print("First few targets:", targets[0])

# Select a sample input to visualize
data_to_show = inputs[0]

# Only one image is drawn, so a single axes is created; the previous 1x8 grid
# left seven empty panels in the figure.
fig, ax = plt.subplots(figsize=(2.5, 2.5))

# Plot the single channel of the sample
ax.imshow(data_to_show[0], cmap='gray')
ax.set_title('Image')

plt.tight_layout()
plt.show()


In [None]:
def rmsle(y_pred, y_true):
    """
    Calculate the Root Mean Squared Logarithmic Error (RMSLE).

    Both inputs are clamped to be non-negative before ``log1p`` is applied.
    ``log1p`` is -inf at -1 and NaN below it, so a single negative prediction
    from an unconstrained regression head would otherwise turn the whole batch
    loss into NaN.

    Args:
        y_pred (torch.Tensor): The predicted outputs from the model.
        y_true (torch.Tensor): The true target values, same shape as y_pred.

    Returns:
        torch.Tensor: A scalar tensor representing the RMSLE for the current batch.
    """
    log_pred = torch.log1p(torch.clamp(y_pred, min=0))
    log_true = torch.log1p(torch.clamp(y_true, min=0))

    # Squared difference of the log-transformed values
    squared_log_error = (log_pred - log_true) ** 2

    # Mean over all elements, then square root. The result is returned directly
    # rather than bound to a local called `rmsle`, which shadowed this function.
    return torch.sqrt(torch.mean(squared_log_error))

# Sanity-check the metric on a small hand-made example
y_pred = torch.tensor([3.0, 5.0, 2.5], dtype=torch.float32)
y_true = torch.tensor([2.0, 4.0, 3.0], dtype=torch.float32)

# Calculate the RMSLE and print it as a Python float
loss = rmsle(y_pred, y_true)
print("RMSLE Loss:", loss.item())


In [None]:
class AbaloneModel(pl.LightningModule):
    """
    U-Net based regression model for Abalone age prediction.

    A segmentation_models_pytorch U-Net produces a single-channel map, and a
    small head pools that map down to one scalar per sample. The attribute names
    ``model`` and ``regressor`` become part of the parameter keys stored in
    checkpoints, so renaming them would break loading of saved checkpoints.
    """

    def __init__(self):
        super(AbaloneModel, self).__init__()

        # Encoder-Decoder architecture using U-Net
        self.model = smp.Unet(
            encoder_name="mit_b1",  # Encoder backbone identifier passed to smp (not a ResNet)
            # encoder_weights="imagenet",  # Left commented out, so the library default applies
            #                                NOTE(review): confirm that default for the installed smp version
            in_channels=3,
            classes=1,  # Single output channel
            decoder_attention_type='scse',  # Attention variant requested for the decoder blocks
        )

        # Regressor head to output age prediction
        self.regressor = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),  # Pool each channel map to 1x1
            nn.Flatten(),  # [batch, 1, 1, 1] -> [batch, 1]
            nn.Linear(1, 1),  # Learnable scale and bias on the pooled value
        )

    def forward(self, x):
        """
        Run the U-Net and the regression head.

        Args:
            x (torch.Tensor): Input batch; assumed to be [batch, 1, H, W] as
                produced by the dataset in this notebook — TODO confirm for
                other callers.

        Returns:
            torch.Tensor: Output of the regression head (one value per sample).
        """
        # Tile the channel dimension three times to match in_channels=3
        x = x.repeat(1, 3, 1, 1)

        # Pass through U-Net encoder-decoder
        x = self.model(x)

        # Apply regressor head for age prediction
        x = self.regressor(x)

        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the RMSLE training loss for one batch."""
        x, y = batch
        y_hat = self(x)  # Model output should be [batch_size, 1]
        y_hat = y_hat.squeeze(-1)  # Remove last dim, shape becomes [batch_size]

        loss = rmsle(y_hat, y)

        # Log training loss per step only
        self.log("train_loss", loss, on_step=True, on_epoch=False, logger=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the RMSLE validation loss for one batch."""
        x, y = batch
        y_hat = self(x)  # Model predictions on validation set
        y_hat = y_hat.squeeze(-1)  # Ensure predictions are [batch_size]

        # Calculate loss
        loss = rmsle(y_hat, y)

        # Log validation loss both per step and aggregated per epoch
        self.log("val_loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

        # Nothing is returned; the metric is only logged.

    def configure_optimizers(self):
        """Return plain SGD over all parameters with a fixed learning rate of 1e-3."""
        return torch.optim.SGD(self.parameters(), lr=1e-3)


In [None]:
# Weights & Biases logging is currently disabled; uncomment to re-enable it and
# pass the logger to the Trainer.
# wandb_logger = WandbLogger(project="Abalone", name="Round4")

model = AbaloneModel()
trainer = pl.Trainer(max_epochs=40, logger=None)

# Train on train_loader, validating on val_loader
trainer.fit(model, train_loader, val_loader)

In [None]:
# Close the active Weights & Biases run, if there is one. `wandb` is imported
# here because no earlier cell imports the module itself (only WandbLogger), so
# the bare call raised NameError.
import wandb

wandb.finish()

# **WTF IT WORKS?**😝😝😝😝😝😝😝😝😝

In [None]:
# Restore the model from a checkpoint stored on Google Drive.
# NOTE(review): this path only resolves once Drive is mounted at /content/drive,
# and the mount cell sits at the very end of this notebook — run it first.
model = AbaloneModel.load_from_checkpoint("/content/drive/MyDrive/epoch=13-step=15862.ckpt")

In [None]:
# Ensure model is in evaluation mode
model.eval()
model.freeze()  # In PyTorch Lightning, freeze to ensure model parameters don't change

# Per-batch result frames; they are concatenated once after the loop, because
# growing a DataFrame with pd.concat inside the loop copies it every iteration.
batch_frames = []

# Disable gradient computation for inference. torch.no_grad() has to wrap the
# loop as a context manager: a decorator in front of a `for` statement is a
# SyntaxError.
with torch.no_grad():
    for inputs, targets in val_loader:
        # Put the inputs on whatever device the model lives on (a checkpoint
        # may have been restored onto the GPU).
        inputs = inputs.to(model.device)

        predictions = model(inputs)  # Get model predictions
        predictions = predictions.squeeze(-1)  # [batch_size, 1] -> [batch_size]

        # Convert data to CPU and NumPy
        predictions = predictions.cpu().numpy()
        targets = targets.cpu().numpy()

        batch_frames.append(pd.DataFrame({
            "Predicted Age": predictions,
            "Actual Age": targets
        }))

# Same result as before: an empty frame when the loader yields nothing
results = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

# Display or analyze results
print(results.head())

# Save results to CSV (optional)
results.to_csv("prediction_results.csv", index=False)


In [None]:
# Read the predictions CSV. The inference cells save it to the relative path
# "prediction_results.csv", so the same relative path is read here instead of
# assuming the working directory is /content.
df = pd.read_csv("prediction_results.csv")

# Difference between predicted and actual age
age_diff = df["Predicted Age"] - df["Actual Age"]

# A prediction counts as accurate when the absolute difference is below 1
accurate_predictions = (age_diff.abs() < 1).astype(int)

# Accuracy is the fraction of accurate predictions
accuracy = accurate_predictions.mean()

# Print the accuracy (the label is the Chinese word for "accuracy")
print("准确率:", accuracy)

In [None]:
import torch
import pandas as pd

# Make sure the model is in evaluation mode
model.eval()
model.freeze()  # In PyTorch Lightning, freeze so the model parameters cannot change

# Per-batch result frames; they are concatenated once after the loop, because
# growing a DataFrame with pd.concat inside the loop copies it every iteration.
batch_frames = []

# Running counters for the overall accuracy
accurate_predictions_count = 0
total_predictions_count = 0

# Disable gradient computation for inference
with torch.no_grad():
    for inputs, targets in val_loader:
        # Put the inputs on whatever device the model lives on (a checkpoint
        # may have been restored onto the GPU).
        inputs = inputs.to(model.device)

        predictions = model(inputs)  # Get the model predictions
        predictions = predictions.squeeze(-1)  # [batch_size, 1] -> [batch_size]

        # Move to CPU and convert to NumPy
        predictions = predictions.cpu().numpy()
        targets = targets.cpu().numpy()

        # A prediction is accurate when it is within 1 year of the actual age
        accurate_predictions = np.abs(predictions - targets) <= 1
        accurate_predictions_count += np.sum(accurate_predictions)
        total_predictions_count += len(predictions)

        batch_frames.append(pd.DataFrame({
            "Predicted Age": predictions,
            "Actual Age": targets,
            "Accurate": accurate_predictions
        }))

# Same result as before: an empty frame when the loader yields nothing
results = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

# Overall accuracy in percent; report 0 instead of dividing by zero when the
# loader produced no samples.
if total_predictions_count:
    overall_accuracy = accurate_predictions_count / total_predictions_count * 100
else:
    overall_accuracy = 0.0
print(f"Overall Accuracy: {overall_accuracy:.2f}%")

# Display or analyze the results
print(results.head())

# Optionally save the results to CSV
results.to_csv("prediction_results.csv", index=False)


In [None]:
# Copy the training checkpoint from the local run directory into Google Drive.
# NOTE(review): needs Drive mounted at /content/drive, and the run folder name
# `46gjhnho` looks specific to one training run — adjust it for new runs.

!cp /content/Abalone/46gjhnho/checkpoints/epoch=13-step=15862.ckpt /content/drive/MyDrive


In [None]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
# NOTE(review): earlier cells load from and copy to /content/drive, so this cell
# has to run before them; consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')