In [3]:
import os
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Configuration
CONFIG = {'SEED': 42, 'SR': 32000}

# Load data
df = pd.read_csv('./train.csv')
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG['SEED'])
test = pd.read_csv('./test.csv')
train.to_csv('train_a.csv', index=False)
val.to_csv('val_a.csv', index=False)

def save_spectrogram_image(y, sr, out_path, n_mels=128, hop_length=256):
    y = torch.tensor(y).cuda()
    S = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_mels=n_mels, hop_length=hop_length).cuda()
    S_dB = torchaudio.transforms.AmplitudeToDB()(S(y))
    plt.figure(figsize=(10, 4))
    plt.imshow(S_dB.cpu().numpy(), aspect='auto', origin='lower')
    plt.axis('off')
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0, format='png')
    plt.close()

def convert_audio_to_spectrogram_batch(df, output_folder, batch_size=100):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    num_batches = len(df) // batch_size + 1
    for batch_num in range(num_batches):
        batch_df = df.iloc[batch_num * batch_size : (batch_num + 1) * batch_size]
        for _, row in tqdm(batch_df.iterrows(), total=len(batch_df)):
            y, sr = torchaudio.load(row['path'])
            y = y.numpy().squeeze()
            file_name = os.path.basename(row['path']).replace('.ogg', '.png')
            out_path = os.path.join(output_folder, file_name)
            save_spectrogram_image(y, sr, out_path)

# Convert audio to spectrograms
#convert_audio_to_spectrogram_batch(train, './train_spectrograms/')
#convert_audio_to_spectrogram_batch(val, './val_spectrograms/')
#convert_audio_to_spectrogram_batch(test, './test_spectrograms/')


In [4]:
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image

class SpectrogramDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_map = {'real': 1, 'fake': 0}  # Add a mapping for labels

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.data_frame.iloc[idx, 0].replace('.ogg', '.png'))+".png"
        image = Image.open(img_name).convert('RGB')
        label = self.data_frame.iloc[idx, 2]
        
        if self.transform:
            image = self.transform(image)
        
        label = self.label_map[label]  # Map the string label to an integer
        label = torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        
        return image, label
class TestSpectrogramDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = self.data_frame.iloc[idx, 1].replace('./test','./test_spectrograms').replace('.ogg', '.png')
        image = Image.open(img_name).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
            
        return image

# Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Datasets
train_dataset = SpectrogramDataset(csv_file='./train_a.csv', root_dir='./train_spectrograms/', transform=transform)
val_dataset = SpectrogramDataset(csv_file='./val_a.csv', root_dir='./val_spectrograms/', transform=transform)
test_dataset = TestSpectrogramDataset(csv_file='./test.csv', transform=transform)



In [5]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torchvision.models import vision_transformer, ViT_B_16_Weights, ViT_B_32_Weights, ViT_L_16_Weights, ViT_L_32_Weights, ViT_H_14_Weights
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

def train_model(model_name, weights, num_classes=2, num_epochs=10, batch_size=32, learning_rate=0.001):
    # Initialize the model
    model = getattr(vision_transformer, model_name)(weights=weights)
    num_ftrs = model.heads.head.in_features
    model.heads.head = nn.Linear(num_ftrs, num_classes)  # Adjust output layer for binary classification
    
    # Move model to GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Assuming train_dataset, val_dataset, and test_dataset are already defined
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training Loop with Logging
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
        
        # Validation Loop with Logging
        model.eval()
        val_running_loss = 0.0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} Validation"):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item() * inputs.size(0)
        
        val_loss = val_running_loss / len(val_loader.dataset)
        print(f'Validation Loss: {val_loss:.4f}')

    return model, train_loader, val_loader, test_loader


In [4]:
trained_model, train_loader, val_loader, test_loader = train_model('vit_b_32', ViT_B_32_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/vit_b_32-d86f8d99.pth" to C:\Users\p/.cache\torch\hub\checkpoints\vit_b_32-d86f8d99.pth
100%|███████████████████████████████████████████████████████████████████████████████| 337M/337M [00:58<00:00, 6.01MB/s]
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Epoch 1/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [07:02<00:00,  3.28it/s]


Epoch 1/10, Loss: 0.4101


Epoch 1/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:40<00:00,  3.46it/s]


Validation Loss: 0.1463


Epoch 2/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:30<00:00,  5.12it/s]


Epoch 2/10, Loss: 0.1388


Epoch 2/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:04<00:00,  5.41it/s]


Validation Loss: 0.1272


Epoch 3/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:28<00:00,  5.17it/s]


Epoch 3/10, Loss: 0.0848


Epoch 3/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:02<00:00,  5.59it/s]


Validation Loss: 0.0436


Epoch 4/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:18<00:00,  5.37it/s]


Epoch 4/10, Loss: 0.0627


Epoch 4/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:02<00:00,  5.51it/s]


Validation Loss: 0.0387


Epoch 5/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:17<00:00,  5.39it/s]


Epoch 5/10, Loss: 0.0565


Epoch 5/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:01<00:00,  5.65it/s]


Validation Loss: 0.0368


Epoch 6/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:20<00:00,  5.32it/s]


Epoch 6/10, Loss: 0.0383


Epoch 6/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:02<00:00,  5.57it/s]


Validation Loss: 0.0276


Epoch 7/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:16<00:00,  5.40it/s]


Epoch 7/10, Loss: 0.0307


Epoch 7/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:02<00:00,  5.55it/s]


Validation Loss: 0.0305


Epoch 8/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:23<00:00,  5.27it/s]


Epoch 8/10, Loss: 0.0316


Epoch 8/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:03<00:00,  5.48it/s]


Validation Loss: 0.0476


Epoch 9/10 Training: 100%|█████████████████████████████████████████████████████████| 1386/1386 [04:22<00:00,  5.28it/s]


Epoch 9/10, Loss: 0.0242


Epoch 9/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [01:01<00:00,  5.63it/s]


Validation Loss: 0.0196


Epoch 10/10 Training: 100%|████████████████████████████████████████████████████████| 1386/1386 [04:22<00:00,  5.28it/s]


Epoch 10/10, Loss: 0.0211


Epoch 10/10 Validation: 100%|████████████████████████████████████████████████████████| 347/347 [01:00<00:00,  5.69it/s]

Validation Loss: 0.0423





In [4]:
trained_model, train_loader, val_loader, test_loader = train_model('vit_l_16', ViT_L_16_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/vit_l_16-852ce7e3.pth" to C:\Users\p/.cache\torch\hub\checkpoints\vit_l_16-852ce7e3.pth
100%|█████████████████████████████████████████████████████████████████████████████| 1.13G/1.13G [02:00<00:00, 10.1MB/s]
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Epoch 1/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:18<00:00,  2.78s/it]


Epoch 1/10, Loss: 0.7137


Epoch 1/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:45<00:00,  2.38s/it]


Validation Loss: 0.6931


Epoch 2/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:22<00:00,  2.79s/it]


Epoch 2/10, Loss: 0.6936


Epoch 2/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:47<00:00,  2.39s/it]


Validation Loss: 0.6931


Epoch 3/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:12<00:00,  2.78s/it]


Epoch 3/10, Loss: 0.6936


Epoch 3/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:46<00:00,  2.38s/it]


Validation Loss: 0.6927


Epoch 4/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:16<00:00,  2.78s/it]


Epoch 4/10, Loss: 0.5366


Epoch 4/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:47<00:00,  2.39s/it]


Validation Loss: 0.2776


Epoch 5/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:15<00:00,  2.78s/it]


Epoch 5/10, Loss: 0.2646


Epoch 5/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:47<00:00,  2.38s/it]


Validation Loss: 0.2308


Epoch 6/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:16<00:00,  2.78s/it]


Epoch 6/10, Loss: 0.2373


Epoch 6/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:47<00:00,  2.38s/it]


Validation Loss: 0.2349


Epoch 7/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:17<00:00,  2.78s/it]


Epoch 7/10, Loss: 0.2180


Epoch 7/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:48<00:00,  2.39s/it]


Validation Loss: 0.2552


Epoch 8/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:17<00:00,  2.78s/it]


Epoch 8/10, Loss: 0.2193


Epoch 8/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:48<00:00,  2.39s/it]


Validation Loss: 0.1936


Epoch 9/10 Training: 100%|███████████████████████████████████████████████████████| 1386/1386 [1:04:19<00:00,  2.78s/it]


Epoch 9/10, Loss: 0.2073


Epoch 9/10 Validation: 100%|█████████████████████████████████████████████████████████| 347/347 [13:47<00:00,  2.39s/it]


Validation Loss: 0.2124


Epoch 10/10 Training: 100%|██████████████████████████████████████████████████████| 1386/1386 [1:04:17<00:00,  2.78s/it]


Epoch 10/10, Loss: 0.1949


Epoch 10/10 Validation: 100%|████████████████████████████████████████████████████████| 347/347 [13:48<00:00,  2.39s/it]

Validation Loss: 0.1761





In [4]:
trained_model, train_loader, val_loader, test_loader = train_model('vit_l_32', ViT_L_32_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/vit_l_32-c7638314.pth" to C:\Users\p/.cache\torch\hub\checkpoints\vit_l_32-c7638314.pth
100%|██████████| 1.14G/1.14G [01:45<00:00, 11.7MB/s]
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Epoch 1/10 Training: 100%|██████████| 1386/1386 [08:04<00:00,  2.86it/s]


Epoch 1/10, Loss: 0.4963


Epoch 1/10 Validation: 100%|██████████| 347/347 [01:33<00:00,  3.73it/s]


Validation Loss: 0.2427


Epoch 2/10 Training: 100%|██████████| 1386/1386 [06:33<00:00,  3.53it/s]


Epoch 2/10, Loss: 0.1760


Epoch 2/10 Validation: 100%|██████████| 347/347 [01:09<00:00,  5.02it/s]


Validation Loss: 0.1117


Epoch 3/10 Training: 100%|██████████| 1386/1386 [06:35<00:00,  3.50it/s]


Epoch 3/10, Loss: 0.0856


Epoch 3/10 Validation: 100%|██████████| 347/347 [01:08<00:00,  5.06it/s]


Validation Loss: 0.1013


Epoch 4/10 Training: 100%|██████████| 1386/1386 [06:33<00:00,  3.52it/s]


Epoch 4/10, Loss: 0.0601


Epoch 4/10 Validation: 100%|██████████| 347/347 [01:09<00:00,  4.97it/s]


Validation Loss: 0.0471


Epoch 5/10 Training: 100%|██████████| 1386/1386 [06:34<00:00,  3.52it/s]


Epoch 5/10, Loss: 0.0455


Epoch 5/10 Validation: 100%|██████████| 347/347 [01:10<00:00,  4.95it/s]


Validation Loss: 0.0430


Epoch 6/10 Training: 100%|██████████| 1386/1386 [06:36<00:00,  3.49it/s]


Epoch 6/10, Loss: 0.0380


Epoch 6/10 Validation: 100%|██████████| 347/347 [01:08<00:00,  5.03it/s]


Validation Loss: 0.0401


Epoch 7/10 Training: 100%|██████████| 1386/1386 [06:35<00:00,  3.51it/s]


Epoch 7/10, Loss: 0.0313


Epoch 7/10 Validation: 100%|██████████| 347/347 [01:09<00:00,  4.98it/s]


Validation Loss: 0.0398


Epoch 8/10 Training: 100%|██████████| 1386/1386 [06:36<00:00,  3.50it/s]


Epoch 8/10, Loss: 0.0282


Epoch 8/10 Validation: 100%|██████████| 347/347 [01:08<00:00,  5.04it/s]


Validation Loss: 0.0331


Epoch 9/10 Training: 100%|██████████| 1386/1386 [06:37<00:00,  3.48it/s]


Epoch 9/10, Loss: 0.0247


Epoch 9/10 Validation: 100%|██████████| 347/347 [01:09<00:00,  4.99it/s]


Validation Loss: 0.0377


Epoch 10/10 Training: 100%|██████████| 1386/1386 [06:37<00:00,  3.49it/s]


Epoch 10/10, Loss: 0.0229


Epoch 10/10 Validation: 100%|██████████| 347/347 [01:09<00:00,  4.99it/s]

Validation Loss: 0.0324





In [4]:
trained_model, train_loader, val_loader, test_loader = train_model('vit_h_14', ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
Epoch 1/10 Training:   0%|          | 0/1386 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 122.00 MiB. GPU 

In [5]:
# Save the model
torch.save(trained_model.state_dict(), 'vit_l_32_model.pth')
print('Model saved successfully.')

Model saved successfully.


In [6]:
class Test2SpectrogramDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = self.data_frame.iloc[idx, 1].replace('./train','./val_spectrograms').replace('.ogg', '.png')
        image = Image.open(img_name).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
            
        return image
test2_dataset = Test2SpectrogramDataset(csv_file='./val_a.csv', transform=transform)
test2_loader = DataLoader(test2_dataset, batch_size=32, shuffle=False, num_workers=0)

In [7]:
import torch
from torchvision.models import vision_transformer

# Define the model architecture
model_name = 'vit_l_32'
weights = ViT_L_16_Weights.IMAGENET1K_V1
model = vision_transformer.vit_l_16(weights=weights)

# Replace the classification head (assuming binary classification)
num_ftrs = model.heads.head.in_features
model.heads.head = torch.nn.Linear(num_ftrs, 2)  # Adjust as necessary for your task

# Load the saved model weights
model.load_state_dict(torch.load('vit_l_16_model.pth'))

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set the model to evaluation mode
model.eval()

print('Model loaded successfully.')


Model loaded successfully.


In [2]:
train_dataset = SpectrogramDataset(csv_file='./train.csv', root_dir='./train_spectrograms/', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

NameError: name 'SpectrogramDataset' is not defined

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Inference Function with Logging
def inference(model, data_loader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in tqdm(data_loader, desc="Inference"):
            inputs = inputs.to(device)
            outputs = model(inputs)
            probabilities = torch.softmax(outputs, dim=1)
            predictions.extend(probabilities.cpu().numpy())
    return predictions

# Inference and Submission
test = pd.read_csv('./test.csv')  # Load your test CSV file
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)
preds = inference(model, test_loader, device)
submit = pd.read_csv('./sample_submission.csv')
submit.iloc[:, 1:] = preds
submit.head()

Inference: 100%|██████████| 1563/1563 [09:18<00:00,  2.80it/s]
  submit.iloc[:, 1:] = preds
  submit.iloc[:, 1:] = preds


Unnamed: 0,id,fake,real
0,TEST_00000,0.868462,0.131538
1,TEST_00001,0.033277,0.966723
2,TEST_00002,0.015815,0.984185
3,TEST_00003,0.004869,0.995131
4,TEST_00004,0.008452,0.991548


In [24]:
submit.to_csv("test.csv",index=False)