# Import libraries

In [3]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
from torch.utils.data import DataLoader, TensorDataset
import random
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Loading data and preprocessing

In [4]:
def load_json_data(path):
    texts, labels = [], []
    with open(path, 'r') as f:
        for line in f:
            sample = json.loads(line)
            token_str = ' '.join(map(str, sample['text']))
            texts.append(token_str)
            labels.append(sample['label'])
    return texts, labels

def load_test_data(path):
    test_texts, test_ids = [], []
    with open(path, 'r') as f:
        for line in f:
            sample = json.loads(line)
            text = ' '.join(map(str, sample['text']))
            test_texts.append(text)
            test_ids.append(sample['id'])
    return test_texts, test_ids

# Load datasets 
X1, y1 = load_json_data('../domain1_train_data.json')
X2, y2 = load_json_data('../domain2_train_data.json')

print(f"Domain 1: {len(X1)} samples, Class distribution: {np.bincount(y1)}")
print(f"Domain 2: {len(X2)} samples, Class distribution: {np.bincount(y2)}")

Domain 1: 1000 samples, Class distribution: [500 500]
Domain 2: 5000 samples, Class distribution: [ 250 4750]


# Data augmentation strategy

In [5]:
def simple_text_augmentation(texts, labels, augment_ratio=0.3):
    """Simple text data augmentation: random token shuffling"""
    augmented_texts = []
    augmented_labels = []
    
    for text, label in zip(texts, labels):
        # Original data
        augmented_texts.append(text)
        augmented_labels.append(label)
        
        # Data augmentation with random probability
        if random.random() < augment_ratio:
            tokens = text.split()
            if len(tokens) > 3:  # Only augment sufficiently long texts
                # Strategy: randomly shuffle partial tokens (preserve most structure)
                shuffle_ratio = 0.2  # Only shuffle 20% of tokens
                n_shuffle = max(1, int(len(tokens) * shuffle_ratio))
                indices = random.sample(range(len(tokens)), min(n_shuffle, len(tokens)))
                
                tokens_copy = tokens.copy()
                shuffled_tokens = [tokens_copy[i] for i in indices]
                random.shuffle(shuffled_tokens)
                
                for i, idx in enumerate(indices):
                    tokens_copy[idx] = shuffled_tokens[i]
                
                augmented_text = ' '.join(tokens_copy)
                augmented_texts.append(augmented_text)
                augmented_labels.append(label)
    
    return augmented_texts, augmented_labels

# Split domains into train/validation
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X1, y1, test_size=0.2, random_state=42, stratify=y1
)
X2_train, X2_val, y2_train, y2_val = train_test_split(
    X2, y2, test_size=0.2, random_state=42, stratify=y2
)

# Apply data augmentation
X1_train_aug, y1_train_aug = simple_text_augmentation(X1_train, y1_train, augment_ratio=0.4)
X2_train_aug, y2_train_aug = simple_text_augmentation(X2_train, y2_train, augment_ratio=0.1)

# Conservative feature engineering

In [6]:
X_train_combined = X1_train_aug + X2_train_aug

# TF-IDF vectorization with conservative parameters
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000,  
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.95,
    sublinear_tf=True
)

tfidf_vectorizer.fit(X_train_combined)

# Transform all data
X1_train_vec = tfidf_vectorizer.transform(X1_train_aug).toarray()
X2_train_vec = tfidf_vectorizer.transform(X2_train_aug).toarray()
X1_val_vec = tfidf_vectorizer.transform(X1_val).toarray()
X2_val_vec = tfidf_vectorizer.transform(X2_val).toarray()

print(f"Feature dimensions: {X1_train_vec.shape[1]}")

# Merge training data
X_train_all = np.vstack([X1_train_vec, X2_train_vec])
y_train_all = np.hstack([y1_train_aug, y2_train_aug])

# Feature selection
selector = SelectKBest(chi2, k=min(2000, X_train_all.shape[1]))
X_train_selected = selector.fit_transform(X_train_all, y_train_all)
X1_val_selected = selector.transform(X1_val_vec)
X2_val_selected = selector.transform(X2_val_vec)

# Convert to numpy arrays
X1_val_np = X1_val_selected.astype(np.float32)
X2_val_np = X2_val_selected.astype(np.float32)
y1_val_np = np.array(y1_val, dtype=np.int64)
y2_val_np = np.array(y2_val, dtype=np.int64)

Feature dimensions: 3000


# Optimal DANN model structure

In [7]:
class GradientReversalLayer(torch.autograd.Function):
    """Gradient Reversal Layer for adversarial training"""
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x
    
    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.alpha * grad_output, None

class OptimalDANN(nn.Module):
    """Optimal Domain Adversarial Neural Network with conservative architecture"""
    
    def __init__(self, input_dim, hidden_dim=160, dropout_rate=0.1):
        super(OptimalDANN, self).__init__()
        
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )
        
        self.label_classifier = nn.Sequential(
            nn.Linear(hidden_dim, 2)
        )
        
        self.domain_classifier = nn.Sequential(
            nn.Linear(hidden_dim, 2)
        )
    
    def forward(self, x, alpha=0.03):
        features = self.feature_extractor(x)
        label_output = self.label_classifier(features)
        
        reversed_features = GradientReversalLayer.apply(features, alpha)
        domain_output = self.domain_classifier(reversed_features)
        
        return label_output, domain_output

# Initialize model
input_dim = X_train_selected.shape[1]
model = OptimalDANN(
    input_dim=input_dim,
    hidden_dim=160,
    dropout_rate=0.1
)

print(f"Input dimension: {input_dim}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

Input dimension: 2000
Model parameters: 320804


# Model evaluation function

In [8]:
def evaluate_model(model, X1_val, y1_val, X2_val, y2_val):
    """Evaluate model on validation sets"""
    model.eval()
    with torch.no_grad():
        # Domain 1 validation
        X1_tensor = torch.FloatTensor(X1_val)
        label_out1, _ = model(X1_tensor, alpha=0.0)
        pred1 = torch.argmax(label_out1, dim=1)
        acc1 = accuracy_score(y1_val, pred1.numpy())
        
        # Domain 2 validation
        X2_tensor = torch.FloatTensor(X2_val)
        label_out2, _ = model(X2_tensor, alpha=0.0)
        pred2 = torch.argmax(label_out2, dim=1)
        acc2 = accuracy_score(y2_val, pred2.numpy())
        
        # Weighted accuracy
        weighted_acc = 0.60 * acc1 + 0.4 * acc2
        
    model.train()
    return weighted_acc, acc1, acc2

# Training Function

In [9]:
def train_augmented_dann(model, X_train, y_train, X1_val, y1_val, X2_val, y2_val):
    """Training function with optimal hyperparameters"""
    
    # Prepare domain labels (adjusted for augmented data)
    n_domain1_aug = len([x for x in y_train if x in y1_train_aug])
    d_train = np.hstack([
        np.zeros(n_domain1_aug),
        np.ones(len(X_train) - n_domain1_aug)
    ])
    
    X_tensor = torch.FloatTensor(X_train)
    y_tensor = torch.LongTensor(y_train)
    d_tensor = torch.LongTensor(d_train)
    
    dataset = TensorDataset(X_tensor, y_tensor, d_tensor)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    
    optimizer = optim.Adam(model.parameters(), lr=0.0012, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()
    
    model.train()
    best_val_acc = 0.0
    best_d1_acc = 0.0
    best_d2_acc = 0.0
    patience_counter = 0
    
    for epoch in range(20):
        total_loss = 0
        total_label_loss = 0
        total_domain_loss = 0
        
        for batch_X, batch_y, batch_d in dataloader:
            optimizer.zero_grad()
            
            label_output, domain_output = model(batch_X, alpha=0.03)
            
            label_loss = criterion(label_output, batch_y)
            domain_loss = criterion(domain_output, batch_d)
            
            total_batch_loss = label_loss + 0.05 * domain_loss
            
            total_batch_loss.backward()
            optimizer.step()
            
            total_loss += total_batch_loss.item()
            total_label_loss += label_loss.item()
            total_domain_loss += domain_loss.item()
        
        if (epoch + 1) % 5 == 0:
            val_acc, d1_acc, d2_acc = evaluate_model(
                model, X1_val, y1_val, X2_val, y2_val
            )
            
            print(f"Epoch [{epoch+1:2d}/20], Loss: {total_loss/len(dataloader):.4f}, "
                  f"Label Loss: {total_label_loss/len(dataloader):.4f}, "
                  f"Domain Loss: {total_domain_loss/len(dataloader):.4f}, "
                  f"Val Acc: {val_acc:.4f}")
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_d1_acc = d1_acc
                best_d2_acc = d2_acc
                patience_counter = 0
                torch.save(model.state_dict(), 'best_augmented_dann.pth')
            else:
                patience_counter += 1
                
            if patience_counter >= 3:
                print("Early stopping triggered")
                break
    
    # Load best model
    model.load_state_dict(torch.load('best_augmented_dann.pth'))
    return model, best_val_acc, best_d1_acc, best_d2_acc

# Model training

In [10]:
print("Starting augmented DANN training...")

trained_model, best_accuracy, best_d1_acc, best_d2_acc = train_augmented_dann(
    model,
    X_train_selected, y_train_all,
    X1_val_np, y1_val_np,
    X2_val_np, y2_val_np
)

Starting augmented DANN training...
Epoch [ 5/20], Loss: 0.0254, Label Loss: 0.0252, Domain Loss: 0.0037, Val Acc: 0.9316
Epoch [10/20], Loss: 0.0054, Label Loss: 0.0053, Domain Loss: 0.0015, Val Acc: 0.9282
Epoch [15/20], Loss: 0.0025, Label Loss: 0.0024, Domain Loss: 0.0011, Val Acc: 0.9342
Epoch [20/20], Loss: 0.0015, Label Loss: 0.0014, Domain Loss: 0.0010, Val Acc: 0.9372


# Final evaluation

In [11]:
print("\n=== DNN_18 Data Augmentation Results ===")
print(f"Validation accuracy: {best_accuracy:.4f}")
print(f"Domain 1 accuracy: {best_d1_acc:.4f} ({best_d1_acc:.1%})")
print(f"Domain 2 accuracy: {best_d2_acc:.4f} ({best_d2_acc:.1%})")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Final validation confirmation
final_val_acc, final_d1_acc, final_d2_acc = evaluate_model(
    trained_model, X1_val_np, y1_val_np, X2_val_np, y2_val_np
)


=== DNN_18 Data Augmentation Results ===
Validation accuracy: 0.9372
Domain 1 accuracy: 0.9100 (91.0%)
Domain 2 accuracy: 0.9780 (97.8%)
Model parameters: 320,804


# Test predictions and submission

In [12]:
# Load and process test data 
X_test_raw, test_ids = load_test_data('../test_data.json')
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw).toarray()
X_test_selected = selector.transform(X_test_tfidf)
X_test_tensor = torch.FloatTensor(X_test_selected)

print(f"Test data shape: {X_test_tensor.shape}")

# Generate predictions
trained_model.eval()
with torch.no_grad():
    label_output, _ = trained_model(X_test_tensor, alpha=0.0)
    predictions = torch.argmax(label_output, dim=1).tolist()

# Create submission file
submission_df = pd.DataFrame({
    'id': test_ids,
    'class': predictions
})

submission_df.to_csv('dnn_18_data_augmentation_predictions.csv', index=False)

print(f"Predictions saved to 'dnn_18_data_augmentation_predictions.csv'")
print(f"Prediction distribution: {np.bincount(predictions)}")

# Display first few predictions
print("\nFirst 10 predictions:")
print(submission_df.head(10))

Test data shape: torch.Size([4000, 2000])
Predictions saved to 'dnn_18_data_augmentation_predictions.csv'
Prediction distribution: [1884 2116]

First 10 predictions:
   id  class
0   0      1
1   1      0
2   2      0
3   3      1
4   4      0
5   5      1
6   6      0
7   7      1
8   8      1
9   9      1
