**Train**

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
import re
from html import unescape
import torch.nn.functional as F

class LabelSmoothingLoss(torch.nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``classes - 1`` classes.
    """

    def __init__(self, classes, smoothing=0.1, dim=-1):
        super().__init__()
        self.cls = classes
        self.dim = dim
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        """Return the smoothed cross-entropy averaged over all samples.

        ``pred`` holds raw scores that are log-softmaxed along ``self.dim``;
        ``target`` holds class indices, which are scattered along dimension 1
        of the target distribution.
        """
        log_probs = pred.log_softmax(dim=self.dim)
        # The soft targets are constants, so build them outside autograd.
        with torch.no_grad():
            off_class_mass = self.smoothing / (self.cls - 1)
            soft_targets = torch.full_like(log_probs, off_class_mass)
            soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample_loss = (-soft_targets * log_probs).sum(dim=self.dim)
        return per_sample_loss.mean()



# Preprocess text function
def preprocess_text(text):
    """Clean one raw text string and return the cleaned string.

    Steps, in order: repair numeric character references that lost their
    leading '&' (``#39;`` -> ``&#39;``), decode HTML entities, then strip
    URLs, e-mail addresses, @handles, leftover entity fragments and
    parenthesised text, and finally collapse whitespace.
    """

    def _restore_ampersand(found):
        # The match is either "#...;" or one leading character followed by
        # "#...;"; insert the '&' directly in front of the '#'.
        fragment = found.group()
        if fragment.startswith('#'):
            return f'&{fragment}'
        return f'{fragment[0]}&{fragment[1:]}'

    # 1. Find #[xX]?\w+; and put '&' in front of it
    text = re.sub(r'(^|\D)#\w+;', _restore_ampersand, text)
    # 2. Convert HTML character references to unicode
    text = unescape(text)

    # 3-12. Ordered (pattern, replacement) rules; the order matters because
    # later rules operate on the output of earlier ones.
    rules = (
        (r'http\S+|https\S+', ''),   # 3. URLs
        (r'\S+@\S+', ''),            # 4. e-mail addresses
        (r'@\w+', ''),               # 5. twitter ids
        (r'&lt;/b&gt;', ''),         # 6. literal "&lt;/b&gt;"
        (r'&quot;|quot;', ''),       # 7. &quot; and bare quot;
        (r'&amp;|amp;', '&'),        # 8. &amp; and bare amp;
        (r'&lt;|lt;', '<'),          # 9. &lt; and bare lt;
        (r'&gt;|gt;', '>'),          # 10. &gt; and bare gt;
        (r'\(.*?\)', ''),            # 11. text inside parentheses
        (r'\s+', ' '),               # 12. runs of whitespace
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()


def tokenize_texts(texts, tokenizer, max_len=512):
    """Call ``tokenizer`` on ``texts`` with fixed-length padding and truncation.

    Every sequence is padded or truncated to ``max_len`` and the result is
    requested as PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

class NewsDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` of sample ``idx``."""
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        encoded = tokenize_texts(raw_text, self.tokenizer, self.max_len)
        # Row 0 of each encoded field is taken, as only one text is tokenized.
        sample = {field: encoded[field][0] for field in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label)
        return sample

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Step 1: Set the k_fold
# Index of the stratified split used for this run (0-4 with n_splits=5).
# The same value names the checkpoint and inference files written later.
k_fold = 1

# Step 2: Stratified K-fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
train_val_splits = list(skf.split(train_df, train_df['label']))
train_indices, val_indices = train_val_splits[k_fold]

# Step 3: Preprocess text
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

# Save the preprocessed DataFrame as a new CSV file
train_df.to_csv('preprocessed_train_data.csv', index=False)

# Step 4: Use the selected split
train_data = train_df.iloc[train_indices]
val_data = train_df.iloc[val_indices]

# Load the augmented data
sum_aug_df = pd.read_csv('sum_aug.csv')
backtrans_aug_df = pd.read_csv('backtrans_aug.csv')

# Keep only augmented rows whose source sample is in this fold's TRAINING
# split. Filtering by every id in train_df (and concatenating the whole
# train_df below) would put the validation rows and their paraphrases into the
# training set, so the validation score could no longer be trusted.
# .copy() gives independent frames, so the column assignment below does not
# write into a slice of the augmented DataFrames.
fold_train_ids = train_data['id']
merged_sum_aug_df = sum_aug_df[sum_aug_df['id'].isin(fold_train_ids)].copy()
merged_backtrans_aug_df = backtrans_aug_df[backtrans_aug_df['id'].isin(fold_train_ids)].copy()

# Preprocess the text in the merged_sum_aug_df and merged_backtrans_aug_df DataFrames
merged_sum_aug_df['text'] = merged_sum_aug_df['text'].apply(preprocess_text)
merged_backtrans_aug_df['text'] = merged_backtrans_aug_df['text'].apply(preprocess_text)

# Combine the fold's training split with its augmented data
extended_train_df = pd.concat(
    [train_data, merged_sum_aug_df, merged_backtrans_aug_df],
    ignore_index=True,
)

# Assign the extended_train_df to the train_data
train_data = extended_train_df





# Step 5: Tokenize
pretrained_name = 'sileod/deberta-v3-base-tasksource-nli'
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

train_dataset = NewsDataset(
    train_data['text'].to_numpy(),
    train_data['label'].to_numpy(),
    tokenizer,
    max_len=512,
)
val_dataset = NewsDataset(
    val_data['text'].to_numpy(),
    val_data['label'].to_numpy(),
    tokenizer,
    max_len=512,
)

# Only the training batches are shuffled; validation keeps the row order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


# Step 6: Model, optimizer, and loss function
num_classes = 8
smoothing = 0.01

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ignore_mismatched_sizes=True lets loading proceed when checkpoint weights
# do not match the shape implied by num_labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)
model.to(device)

# NOTE(review): AdamW is imported from transformers at the top of the file;
# confirm the installed transformers release still exports it.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Loss function
criterion = LabelSmoothingLoss(classes=num_classes, smoothing=smoothing).to(device)

# Step 7: Training and validation
def train(model, data_loader, optimizer, device, loss_fn=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.
        loss_fn: Callable ``(logits, labels) -> scalar loss``. When omitted,
            the module-level ``criterion`` is used.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    if loss_fn is None:
        loss_fn = criterion

    model.train()
    losses = []
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients first so nothing left over from earlier backward
        # passes leaks into this step.
        optimizer.zero_grad()

        # Labels are not passed to the model: the loss is computed by loss_fn,
        # so any loss the model would compute from them would be discarded.
        logits = model(input_ids, attention_mask=attention_mask).logits

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    return np.mean(losses)




def eval(model, data_loader, device):
    """Collect labels and predicted classes for every sample in ``data_loader``.

    Switches ``model`` to evaluation mode, runs it without gradient tracking
    and returns ``(y_true, y_pred)``: the batch labels and the argmax of the
    logits along dimension 1, both as flat lists in loader order.

    NOTE: the name shadows the builtin ``eval``; it is kept because the rest
    of the file calls it by this name.
    """
    model.eval()
    targets, predicted = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(token_ids, attention_mask=mask).logits
            best_class = logits.argmax(dim=1)

            targets.extend(gold.cpu().numpy())
            predicted.extend(best_class.cpu().numpy())
    return targets, predicted





# Start below the attainable macro-F1 range (scores lie in [0, 1]) so the
# first epoch always writes a checkpoint. Starting at 0 with a strict '>'
# would never save a model whose validation macro-F1 stays at 0, and the
# later torch.load of model_{k_fold}.pt would then fail.
best_macro_f1 = -1.0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss}")

    # Score both splits after every epoch to watch for over-fitting.
    y_true_train, y_pred_train = eval(model, train_loader, device)
    y_true_val, y_pred_val = eval(model, val_loader, device)

    train_macro_f1 = f1_score(y_true_train, y_pred_train, average='macro')
    val_macro_f1 = f1_score(y_true_val, y_pred_val, average='macro')

    print(f"Train Macro F1: {train_macro_f1}")
    print(f"Val Macro F1: {val_macro_f1}")
    print(classification_report(y_true_val, y_pred_val))

    # Keep only the weights of the epoch with the best validation macro-F1.
    if val_macro_f1 > best_macro_f1:
        best_macro_f1 = val_macro_f1
        torch.save(model.state_dict(), f'model_{k_fold}.pt')




# Step 8: Inference

# Load the best model for the current k_fold. map_location remaps the stored
# tensors onto the device used by this session, so a checkpoint written on a
# GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(f'model_{k_fold}.pt', map_location=device))

# Create a test dataset with dummy labels (zeros) since we only need the text data for inference
test_dataset = NewsDataset(test_df['text'].to_numpy(), np.zeros(len(test_df)), tokenizer, max_len=512)

# Create a DataLoader for the test dataset; shuffle=False keeps predictions
# in the same order as the rows of test_df
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Perform inference on the test dataset and get the predicted labels
# (the first return value holds the dummy labels and is discarded)
_, test_predictions = eval(model, test_loader, device)

# Assign the predicted labels to the 'label' column in the test DataFrame
test_df['label'] = test_predictions

# Save the 'id' and 'label' columns of the test DataFrame to a CSV file
test_df[['id', 'label']].to_csv(f'inference_{k_fold}.csv', index=False)


**Ensemble**

In [None]:
import pandas as pd
from scipy.stats import mode

# Read the selected inference CSV files into separate DataFrames
inference_files = [f'inference_{i}.csv' for i in [1, 3]]
inference_dfs = [pd.read_csv(file) for file in inference_files]

# Voting is done row by row, so every file must list the same ids in the same
# order; otherwise labels of different samples would be mixed.
reference_ids = inference_dfs[0]['id']
for file, df in zip(inference_files[1:], inference_dfs[1:]):
    if not df['id'].equals(reference_ids):
        raise ValueError(f"'id' column of {file} does not match {inference_files[0]}")

# Extract the 'label' column from each DataFrame
label_columns = [df['label'] for df in inference_dfs]

# Perform the hard voting for each row using the mode() function.
# The labels are laid out as an explicit (n_rows, n_models) array and the
# result is flattened, because the shape of `.mode` depends on the SciPy
# version: releases that keep the reduced axis return (n_rows, 1), newer ones
# return (n_rows,). Indexing `.mode[0]` on a 1-D result would pick one scalar
# and assign it to every row.
# NOTE(review): with only two voters every disagreement is a tie; mode()
# returns a single one of the tied values (the smallest, per older SciPy
# docs), so confirm this tie-break is intended.
label_matrix = pd.concat(label_columns, axis=1, ignore_index=True).to_numpy()
voted_labels = mode(label_matrix, axis=1).mode.reshape(-1)

# Combine the 'id' column from the first DataFrame with the hard-voted labels into a new DataFrame
ensemble_submission = pd.DataFrame({'id': reference_ids, 'label': voted_labels})

# Save the new DataFrame as the final ensemble submission file
ensemble_submission.to_csv('ensemble_submission.csv', index=False)
print("Ensemble submission file saved as 'ensemble_submission.csv'")


: 

**Data Augmentation (summary)**

In [None]:
import pandas as pd
from scipy.stats import mode

# NOTE(review): this cell sits under the "Data Augmentation (summary)" heading
# but contains the ensemble voting code; the summary-augmentation code that
# produces sum_aug.csv is not present here.

# Read the selected inference CSV files into separate DataFrames
inference_files = [f'inference_{i}.csv' for i in [1, 3]]
inference_dfs = [pd.read_csv(file) for file in inference_files]

# Voting is done row by row, so every file must list the same ids in the same
# order; otherwise labels of different samples would be mixed.
reference_ids = inference_dfs[0]['id']
for file, df in zip(inference_files[1:], inference_dfs[1:]):
    if not df['id'].equals(reference_ids):
        raise ValueError(f"'id' column of {file} does not match {inference_files[0]}")

# Extract the 'label' column from each DataFrame
label_columns = [df['label'] for df in inference_dfs]

# Perform the hard voting for each row using the mode() function.
# The labels are laid out as an explicit (n_rows, n_models) array and the
# result is flattened, because the shape of `.mode` depends on the SciPy
# version: releases that keep the reduced axis return (n_rows, 1), newer ones
# return (n_rows,). Indexing `.mode[0]` on a 1-D result would pick one scalar
# and assign it to every row.
label_matrix = pd.concat(label_columns, axis=1, ignore_index=True).to_numpy()
voted_labels = mode(label_matrix, axis=1).mode.reshape(-1)

# Combine the 'id' column from the first DataFrame with the hard-voted labels into a new DataFrame
ensemble_submission = pd.DataFrame({'id': reference_ids, 'label': voted_labels})

# Save the new DataFrame as the final ensemble submission file
ensemble_submission.to_csv('ensemble_submission.csv', index=False)
print("Ensemble submission file saved as 'ensemble_submission.csv'")


**Data Augmentation (Back Translation)**

In [None]:
import os
import time
import pandas as pd
from translatepy import Translate

# One shared translator instance, reused by every back_translate() call.
translator = Translate()


def back_translate(text: str, target_lang: str = 'ja') -> str:
    """Translate ``text`` into ``target_lang`` and then back into English.

    Returns the ``.result`` of the second translation, i.e. the round-tripped
    English text.
    """
    # Forward pass: source text -> pivot language.
    pivot_text = translator.translate(text, target_lang).result
    # Backward pass: pivot language -> English ('en').
    return translator.translate(pivot_text, 'en').result
        


# Load the preprocessed training data (preprocessed_train_data.csv)
data = pd.read_csv('preprocessed_train_data.csv')

# Select rows with label 5 or 6. .copy() makes an independent frame, so
# adding the 'back_translated' column below does not write into a slice of
# `data`.
selected_data = data[data['label'].isin([5, 6])].copy()

# Skip rows whose text is missing (an empty string written to CSV is read
# back as NaN); there is nothing to translate for them.
selected_data = selected_data.dropna(subset=['text'])

# Perform back translation on the 'text' column one by one
back_translated_texts = []
for text in selected_data['text']:
    back_translated_texts.append(back_translate(text, target_lang='ja'))
    time.sleep(0.1)  # Sleep for 0.1 seconds between translations

selected_data['back_translated'] = back_translated_texts

# Only keep the back-translated data, exposed under the usual 'text' column
augmented_data = selected_data[['id', 'back_translated', 'label']].rename(columns={'back_translated': 'text'})

# Check if backtrans_aug.csv exists
if os.path.exists('backtrans_aug.csv'):
    # Read the existing file and append the new data
    existing_data = pd.read_csv('backtrans_aug.csv')
    combined_data = pd.concat([existing_data, augmented_data], ignore_index=True)
else:
    # If the file does not exist, the new data is the whole dataset
    combined_data = augmented_data

# Save the combined dataset to the CSV file
combined_data.to_csv('backtrans_aug.csv', index=False)


**(additional code) Inference and Ensemble for my model**

If you want to run inference on your own data with my fine-tuned models, run this cell.

Prepare your data (test.csv) and the model checkpoints (model_1.pt and model_3.pt).

Fine-tuned model download: https://www.dropbox.com/sh/80bwc6vlc9gvrse/AAB1t9SXqObr4B82rbJkWHPaa?dl=0

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

import re
from html import unescape



# Preprocess text function
def preprocess_text(text):
    """Clean one raw text string and return the cleaned string.

    Steps, in order: repair numeric character references that lost their
    leading '&' (``#39;`` -> ``&#39;``), decode HTML entities, then strip
    URLs, e-mail addresses, @handles, leftover entity fragments and
    parenthesised text, and finally collapse whitespace.
    """

    def _restore_ampersand(found):
        # The match is either "#...;" or one leading character followed by
        # "#...;"; insert the '&' directly in front of the '#'.
        fragment = found.group()
        if fragment.startswith('#'):
            return f'&{fragment}'
        return f'{fragment[0]}&{fragment[1:]}'

    # 1. Find #[xX]?\w+; and put '&' in front of it
    text = re.sub(r'(^|\D)#\w+;', _restore_ampersand, text)
    # 2. Convert HTML character references to unicode
    text = unescape(text)

    # 3-12. Ordered (pattern, replacement) rules; the order matters because
    # later rules operate on the output of earlier ones.
    rules = (
        (r'http\S+|https\S+', ''),   # 3. URLs
        (r'\S+@\S+', ''),            # 4. e-mail addresses
        (r'@\w+', ''),               # 5. twitter ids
        (r'&lt;/b&gt;', ''),         # 6. literal "&lt;/b&gt;"
        (r'&quot;|quot;', ''),       # 7. &quot; and bare quot;
        (r'&amp;|amp;', '&'),        # 8. &amp; and bare amp;
        (r'&lt;|lt;', '<'),          # 9. &lt; and bare lt;
        (r'&gt;|gt;', '>'),          # 10. &gt; and bare gt;
        (r'\(.*?\)', ''),            # 11. text inside parentheses
        (r'\s+', ' '),               # 12. runs of whitespace
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()


def tokenize_texts(texts, tokenizer, max_len=512):
    """Call ``tokenizer`` on ``texts`` with fixed-length padding and truncation.

    Every sequence is padded or truncated to ``max_len`` and the result is
    requested as PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

class NewsDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` of sample ``idx``."""
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        encoded = tokenize_texts(raw_text, self.tokenizer, self.max_len)
        # Row 0 of each encoded field is taken, as only one text is tokenized.
        sample = {field: encoded[field][0] for field in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label)
        return sample



def eval(model, data_loader, device):
    """Collect labels and predicted classes for every sample in ``data_loader``.

    Switches ``model`` to evaluation mode, runs it without gradient tracking
    and returns ``(y_true, y_pred)``: the batch labels and the argmax of the
    logits along dimension 1, both as flat lists in loader order.

    NOTE: the name shadows the builtin ``eval``; it is kept because the rest
    of the file calls it by this name.
    """
    model.eval()
    targets, predicted = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(token_ids, attention_mask=mask).logits
            best_class = logits.argmax(dim=1)

            targets.extend(gold.cpu().numpy())
            predicted.extend(best_class.cpu().numpy())
    return targets, predicted



# Load test data
test_df = pd.read_csv('test.csv')

# Preprocess text
test_df['text'] = test_df['text'].apply(preprocess_text)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('sileod/deberta-v3-base-tasksource-nli')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained('sileod/deberta-v3-base-tasksource-nli', num_labels=8, ignore_mismatched_sizes=True)
model.to(device)

# Create a test dataset with dummy labels (zeros) since we only need the text data for inference
test_dataset = NewsDataset(test_df['text'].to_numpy(), np.zeros(len(test_df)), tokenizer, max_len=512)

# Create a DataLoader for the test dataset; shuffle=False keeps predictions
# in the same order as the rows of test_df
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Iterate over the desired k_fold values (1 and 3)
for k_fold in [1, 3]:
    # Load the best model for the current k_fold. map_location remaps the
    # stored tensors onto the device used by this session, so checkpoints
    # saved on a GPU can also be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(f'model_{k_fold}.pt', map_location=device))

    # Perform inference on the test dataset and get the predicted labels
    # (the first return value holds the dummy labels and is discarded)
    _, test_predictions = eval(model, test_loader, device)

    # Assign the predicted labels to the 'label' column in the test DataFrame
    # (overwritten on each iteration, after the previous fold's CSV is saved)
    test_df['label'] = test_predictions

    # Save the 'id' and 'label' columns of the test DataFrame to a CSV file
    test_df[['id', 'label']].to_csv(f'inference_{k_fold}.csv', index=False)




import pandas as pd
from scipy.stats import mode

# Read the selected inference CSV files into separate DataFrames
inference_files = [f'inference_{i}.csv' for i in [1, 3]]
inference_dfs = [pd.read_csv(file) for file in inference_files]

# Voting is done row by row, so every file must list the same ids in the same
# order; otherwise labels of different samples would be mixed.
reference_ids = inference_dfs[0]['id']
for file, df in zip(inference_files[1:], inference_dfs[1:]):
    if not df['id'].equals(reference_ids):
        raise ValueError(f"'id' column of {file} does not match {inference_files[0]}")

# Extract the 'label' column from each DataFrame
label_columns = [df['label'] for df in inference_dfs]

# Perform the hard voting for each row using the mode() function.
# The labels are laid out as an explicit (n_rows, n_models) array and the
# result is flattened, because the shape of `.mode` depends on the SciPy
# version: releases that keep the reduced axis return (n_rows, 1), newer ones
# return (n_rows,). Indexing `.mode[0]` on a 1-D result would pick one scalar
# and assign it to every row.
# NOTE(review): with only two voters every disagreement is a tie; mode()
# returns a single one of the tied values (the smallest, per older SciPy
# docs), so confirm this tie-break is intended.
label_matrix = pd.concat(label_columns, axis=1, ignore_index=True).to_numpy()
voted_labels = mode(label_matrix, axis=1).mode.reshape(-1)

# Combine the 'id' column from the first DataFrame with the hard-voted labels into a new DataFrame
ensemble_submission = pd.DataFrame({'id': reference_ids, 'label': voted_labels})

# Save the new DataFrame as the final ensemble submission file
ensemble_submission.to_csv('ensemble_submission.csv', index=False)
print("Ensemble submission file saved as 'ensemble_submission.csv'")