# Kaggle Templates: CV, ML, NLP
Ниже представлены шаблоны для быстрого старта в задачах Kaggle: компьютерное зрение (CV), классическое машинное обучение (ML) и обработка естественного языка (NLP).

## Computer Vision (CV) Template

In [None]:
# Библиотеки
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torchvision import datasets, models

# Dataset and DataLoader
# Per-channel statistics conventionally paired with torchvision's
# ImageNet-pretrained backbones (the model below uses resnet18).
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
# Resize to the 224x224 input, convert to a tensor, then normalize.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,
])
# ImageFolder is pointed at a placeholder path; replace with the real train dir.
train_dataset = datasets.ImageFolder(root='path/to/train', transform=transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Model prototype
class SimpleCNN(nn.Module):
    """ResNet-18 classifier whose final fully connected layer is replaced.

    Args:
        num_classes: Number of output units of the new ``fc`` layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=` - confirm against the installed version.
        backbone = models.resnet18(pretrained=True)
        # Swap the original head for one sized to `num_classes`.
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, num_classes)
        # The attribute keeps the name `features` even though it holds the
        # whole network, so state-dict keys stay unchanged.
        self.features = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.features(x)

# Training loop stub
# Picks the GPU when available, then fine-tunes SimpleCNN for 5 epochs with
# Adam and cross-entropy, printing the mean training loss of each epoch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(5):
    model.train()
    # Accumulate the loss on-device so no host sync is forced per batch.
    running_loss = torch.zeros((), device=device)
    num_samples = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight by batch size: the last batch may be smaller than the rest.
        batch_size = labels.size(0)
        running_loss += loss.detach() * batch_size
        num_samples += batch_size
    # Report the epoch mean rather than only the last mini-batch loss;
    # max(..., 1) keeps an empty loader from raising.
    epoch_loss = running_loss.item() / max(num_samples, 1)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")


## Machine Learning (ML) Template

In [None]:
# Библиотеки
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import lightgbm as lgb

# Data loading and preparation
# Read the CSV, split off the 'target' column, and hold out 20% for validation.
df = pd.read_csv('path/to/data.csv')
y = df['target']
X = df.drop(columns='target')
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling
# Statistics are fitted on the training split only and reused for validation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

# RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
preds_rf = rf.predict(X_valid)
print('Random Forest Classification Report:')
print(classification_report(y_valid, preds_rf))

# LightGBM
# Binary classifier trained with AUC-monitored early stopping on the
# validation split, then scored with ROC AUC on that same split.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
}
# Early stopping is passed as a callback: the `early_stopping_rounds=` keyword
# of lgb.train() is not accepted by LightGBM 4.x.
model_lgb = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)
# Predict with the best iteration found by early stopping.
preds_lgb = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
# roc_auc_score comes from sklearn.metrics (imported at the top of this cell).
print(f'LightGBM AUC: {roc_auc_score(y_valid, preds_lgb):.4f}')


## Natural Language Processing (NLP) Template

In [None]:
# Библиотеки
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Data loading
# Read the 'text' and 'label' columns as plain Python lists and hold out 20%
# of the rows for validation.
df = pd.read_csv('path/to/text_data.csv')
texts, labels = df['text'].tolist(), df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenization
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Run the module-level tokenizer over ``batch``.

    Padding and truncation are enabled and ``max_length`` is set to 128.
    Returns whatever the tokenizer call returns.
    """
    options = dict(padding=True, truncation=True, max_length=128)
    return tokenizer(batch, **options)


# Each split is tokenized in a separate call.
train_encodings, val_encodings = tokenize(train_texts), tokenize(val_texts)

class DatasetTorch(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per example.
        labels: Indexable sequence of labels; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits as torch datasets.
train_dataset = DatasetTorch(train_encodings, train_labels)
val_dataset = DatasetTorch(val_encodings, val_labels)

# Model and training
# The classification head is sized to the number of distinct labels in the data.
num_labels = len(set(labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_kwargs = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
