In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import re
import json
from collections import Counter
import gc

# Fixed seed so NumPy-based randomness is repeatable between runs.
SEED = 993
np.random.seed(SEED)

# Both splits are read from CSV files in the current working directory.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Text preprocessing
def quick_preprocess(df):
    """Return a cleaned copy of ``df`` for feature extraction.

    Free-text columns (``query``, ``product_title``, ``product_description``,
    ``product_bullet_point``) get missing values replaced by ``''``, are
    lower-cased, have every character that is neither a word character nor
    whitespace replaced by a space, and have whitespace runs collapsed to a
    single space. Categorical columns (``product_brand``, ``product_color``)
    only get missing values replaced by ``'unknown'``.

    Any of these columns that is absent from ``df`` is skipped, so frames
    with a reduced schema can still be cleaned.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    df = df.copy()

    text_cols = ['query', 'product_title', 'product_description', 'product_bullet_point']
    cat_cols = ['product_brand', 'product_color']

    # Compiled once instead of on every cell.
    non_word = re.compile(r'[^\w\s]')

    def clean_text(text):
        lowered = str(text).lower()
        # split()/join collapses whitespace runs and trims both ends.
        return ' '.join(non_word.sub(' ', lowered).split())

    for col in text_cols:
        if col in df.columns:
            df[col] = df[col].fillna('').apply(clean_text)

    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')

    return df

# Run the same cleaning routine over both splits, train first.
train, test = (quick_preprocess(frame) for frame in (train, test))

# Feature engineering
def _overlap_stats(query, title):
    """Return (overlap, overlap / query size, Jaccard, contains-all) on word sets."""
    query_words = set(query.split())
    if not query_words:
        # Same arity as the normal path so every row yields four values.
        return 0, 0.0, 0.0, 0
    title_words = set(title.split())
    shared = len(query_words & title_words)
    union = len(query_words | title_words)
    return (
        shared,
        shared / len(query_words),
        shared / union if union > 0 else 0.0,
        int(query_words.issubset(title_words)),
    )


def _any_token_in_query(value, query):
    """Return 1 if any token of ``value`` longer than 2 chars is a substring of ``query``."""
    tokens = str(value).lower().split()
    return int(any(token in query for token in tokens if len(token) > 2))


def _bm25_like_score(query, title, k1=1.5, b=0.75):
    """Return a BM25-style score of ``title`` for ``query``, averaged over query tokens.

    NOTE(review): this is not textbook BM25. The "average document length"
    is the mean character length of the title's own tokens and the IDF term
    uses the title's own token counts instead of corpus statistics. The
    formula is kept numerically unchanged because the downstream model is
    trained on these values.
    """
    query_words = query.split()
    title_words = title.split()
    if not query_words or not title_words:
        return 0.0

    term_freq = Counter(title_words)
    n_tokens = len(title_words)
    mean_token_chars = sum(len(word) for word in title_words) / n_tokens
    length_norm = k1 * (1 - b + b * n_tokens / mean_token_chars)

    score = 0.0
    for word in query_words:
        tf = term_freq.get(word, 0)
        if tf:
            idf = np.log((n_tokens + 1) / (tf + 0.5))
            score += idf * (tf * (k1 + 1)) / (tf + length_norm)

    return float(score / len(query_words))


def _rowwise_cosine(left, right):
    """Return the cosine similarity of row i of ``left`` with row i of ``right``.

    Both arguments are sparse matrices of identical shape. Only the paired
    rows are multiplied, so no all-pairs similarity matrix is built. Rows
    with zero norm get similarity 0.
    """
    def row_sums(matrix):
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    dot = row_sums(left.multiply(right))
    denom = np.sqrt(row_sums(left.multiply(left)) * row_sums(right.multiply(right)))
    result = np.zeros_like(dot)
    np.divide(dot, denom, out=result, where=denom > 0)
    return result


def _char_ngram_overlap(text1, text2, n=3):
    """Return the share of distinct character n-grams of ``text1`` found in ``text2``."""
    if len(text1) < n or len(text2) < n:
        return 0
    ngrams1 = {text1[i:i + n] for i in range(len(text1) - n + 1)}
    ngrams2 = {text2[i:i + n] for i in range(len(text2) - n + 1)}
    return len(ngrams1 & ngrams2) / len(ngrams1)


def _number_match(query, title):
    """Return the share of distinct integers in ``query`` that also occur in ``title``."""
    query_numbers = {int(n) for n in re.findall(r'\d+', str(query))}
    title_numbers = {int(n) for n in re.findall(r'\d+', str(title))}
    return len(query_numbers & title_numbers) / max(len(query_numbers), 1)


def _first_word_match(query, title):
    """Return 1 if both texts are non-empty and start with the same token."""
    query_words = query.split()
    title_words = title.split()
    if query_words and title_words:
        return int(query_words[0] == title_words[0])
    return 0


def create_optimal_features(df, vectorizer=None, is_train=True):
    """Return a copy of ``df`` extended with query/product matching features.

    Added columns: exact/prefix substring flags, word-set overlap statistics,
    length and word-count statistics, brand/color-in-query flags (when those
    columns exist), a BM25-style score, TF-IDF cosine similarity, character
    3-gram overlap, a hand-weighted ``composite_score``, per-``query_id``
    z-score and min-max normalisations of selected scores (when ``query_id``
    exists), numeric-token overlap, first-word match and description
    features (when ``product_description`` exists).

    An empty query never counts as a substring/prefix match, consistent with
    ``contains_all`` being 0 for empty queries.

    Args:
        df: Frame with string columns ``query`` and ``product_title``.
        vectorizer: Fitted vectorizer exposing ``transform``; used when
            ``is_train`` is False. If omitted, the module-level
            ``tfidf_vectorizer`` stored by a previous training call is used.
        is_train: When True a new ``TfidfVectorizer`` is fitted on the
            distinct queries and titles of ``df`` and stored in the
            module-level ``tfidf_vectorizer`` (any ``vectorizer`` argument is
            ignored in that case).

    Returns:
        A new DataFrame; ``df`` itself is left untouched.

    Raises:
        RuntimeError: If ``is_train`` is False, no ``vectorizer`` is given and
            no training call has stored one yet.
    """
    global tfidf_vectorizer

    df = df.copy()
    queries = df['query'].tolist()
    titles = df['product_title'].tolist()
    pairs = list(zip(queries, titles))

    # Text matches. `'' in s` and `s.startswith('')` are always true, so the
    # query must be non-empty to count as a match.
    df['query_in_title'] = [int(bool(q) and q in t) for q, t in pairs]

    overlap = np.array(
        [_overlap_stats(q, t) for q, t in pairs], dtype=float
    ).reshape(-1, 4)
    overlap_cols = ('word_overlap', 'word_overlap_ratio', 'jaccard', 'contains_all')
    for position, col in enumerate(overlap_cols):
        df[col] = overlap[:, position]

    df['title_starts_with_query'] = [
        int(bool(q) and t.startswith(q)) for q, t in pairs
    ]

    # Length statistics
    df['query_len'] = df['query'].str.len()
    df['title_len'] = df['product_title'].str.len()
    df['query_word_count'] = df['query'].str.split().str.len()
    df['title_word_count'] = df['product_title'].str.split().str.len()

    # The small epsilon avoids division by zero for empty queries.
    df['len_ratio'] = df['title_len'] / (df['query_len'] + 1e-5)
    df['word_count_ratio'] = df['title_word_count'] / (df['query_word_count'] + 1e-5)

    # Brand and color
    if 'product_brand' in df.columns:
        df['brand_in_query'] = [
            _any_token_in_query(brand, q)
            for brand, q in zip(df['product_brand'], queries)
        ]
        df['has_brand'] = (~df['product_brand'].isin(['unknown', ''])).astype(int)

    if 'product_color' in df.columns:
        df['color_in_query'] = [
            _any_token_in_query(color, q)
            for color, q in zip(df['product_color'], queries)
        ]
        df['has_color'] = (df['product_color'] != 'unknown').astype(int)

    # BM25-style score
    df['bm25_score'] = [_bm25_like_score(q, t) for q, t in pairs]

    # TF-IDF cosine similarity
    if is_train:
        corpus = pd.concat([df['query'], df['product_title']]).unique()
        vectorizer = TfidfVectorizer(
            max_features=100,
            ngram_range=(1, 2),
            stop_words='english',
            min_df=2
        )
        vectorizer.fit(corpus)
        # Published so later non-training calls can reuse the same vocabulary.
        tfidf_vectorizer = vectorizer
    elif vectorizer is None:
        try:
            vectorizer = tfidf_vectorizer
        except NameError as err:
            raise RuntimeError(
                "No fitted vectorizer available: pass `vectorizer=` or call "
                "create_optimal_features(..., is_train=True) first."
            ) from err

    df['tfidf_cosine'] = _rowwise_cosine(
        vectorizer.transform(df['query']),
        vectorizer.transform(df['product_title']),
    )

    # Character n-gram overlap
    df['char_3gram_overlap'] = [_char_ngram_overlap(q, t, 3) for q, t in pairs]

    # Hand-weighted composite score
    df['composite_score'] = (
        0.3 * df['query_in_title'] +
        0.25 * df['contains_all'] +
        0.15 * df['title_starts_with_query'] +
        0.1 * df['jaccard'] +
        0.08 * df['tfidf_cosine'] +
        0.07 * df['bm25_score'] +
        0.05 * df.get('brand_in_query', 0)
    )

    # Per-query normalisation. Single-row groups have an undefined standard
    # deviation, so their *_group_norm value is NaN.
    if 'query_id' in df.columns:
        for feat in ['composite_score', 'tfidf_cosine', 'bm25_score', 'word_overlap_ratio']:
            if feat in df.columns:
                per_query = df.groupby('query_id')[feat]
                group_mean = per_query.transform('mean')
                group_std = per_query.transform('std')
                group_min = per_query.transform('min')
                group_max = per_query.transform('max')
                df[f'{feat}_group_norm'] = (df[feat] - group_mean) / (group_std + 1e-8)
                df[f'{feat}_group_minmax'] = (
                    (df[feat] - group_min) / (group_max - group_min + 1e-8)
                )

    # Numeric token matches
    df['number_match'] = [_number_match(q, t) for q, t in pairs]

    # Additional features
    df['first_word_match'] = [_first_word_match(q, t) for q, t in pairs]

    if 'product_description' in df.columns:
        df['desc_len'] = df['product_description'].str.len()
        df['desc_has_query'] = [
            int(bool(q) and q in desc)
            for q, desc in zip(queries, df['product_description'])
        ]

    return df

print("Создание фич...")
# Train is processed first; the test call runs with is_train=False.
train_features = create_optimal_features(train, is_train=True)
test_features = create_optimal_features(test, is_train=False)

# Feature preparation
print("\nПодготовка данных для модели...")

# Identifier, raw-text and categorical columns that must not reach the model.
non_feature_cols = [
    'id', 'query_id', 'query', 'product_title', 'product_description',
    'product_bullet_point', 'product_brand', 'product_color', 'product_locale'
]

# The target is excluded as well whenever it is present.
if 'relevance' in train_features.columns:
    non_feature_cols.append('relevance')

# Keep every remaining numeric column, in the frame's column order.
feature_cols = []
for col in train_features.columns:
    if col in non_feature_cols:
        continue
    if pd.api.types.is_numeric_dtype(train_features[col]):
        feature_cols.append(col)

print(f"Количество фич: {len(feature_cols)}")
print(f"Примеры фич: {feature_cols[:10]}")

# Missing values become 0 and every feature is stored as float32.
for col in feature_cols:
    for frame in (train_features, test_features):
        frame[col] = frame[col].fillna(0).astype(np.float32)

# Model training
print("\n" + "="*60)
print("ОБУЧЕНИЕ МОДЕЛИ")
print("="*60)

X = train_features[feature_cols].values
y = train_features['relevance'].values
groups = train_features['query_id'].values

X_test = test_features[feature_cols].values

# Row/column subsampling is configured only through the primary parameter
# names 'bagging_fraction' / 'feature_fraction'. Do not additionally set
# their aliases 'subsample' / 'colsample_bytree': LightGBM ignores an alias
# when the primary name is present, so a second, different value would be
# silently dropped.
best_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'max_depth': 8,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'min_split_gain': 0.01,
    'verbosity': -1,
    'n_jobs': -1,
    'seed': SEED,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
}

# Folds are split by query_id so a query never appears in both train and
# validation parts of the same fold.
n_folds = 5
group_kfold = GroupKFold(n_splits=n_folds)

# Out-of-fold predictions for train rows, fold-averaged predictions for test
# rows, and one nDCG value per fold.
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
fold_scores = []

# Grouped cross-validation: train one model per fold, collect out-of-fold
# predictions, average test predictions and report per-query nDCG@10.
for fold, (train_idx, val_idx) in enumerate(group_kfold.split(X, y, groups)):
    print(f"\nFold {fold + 1}/{n_folds}")
    print("-" * 40)

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    print(f"Train samples: {len(X_train)}")
    print(f"Val samples: {len(X_val)}")

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # NOTE(review): early stopping is driven by the same fold that is scored
    # below, so the reported fold metric may be optimistic.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(100, verbose=False),
            lgb.log_evaluation(0)
        ]
    )

    val_pred = model.predict(X_val)
    oof_predictions[val_idx] = val_pred
    test_predictions += model.predict(X_test) / n_folds

    # Query ids of the validation rows, sliced once instead of per query.
    val_groups = groups[val_idx]
    fold_ndcg_scores = []
    skipped_queries = 0

    for query_id in np.unique(val_groups):
        mask = val_groups == query_id
        # nDCG needs at least two documents to rank.
        if mask.sum() > 1:
            y_true_sub = y_val[mask].reshape(1, -1)
            y_pred_sub = val_pred[mask].reshape(1, -1)
            try:
                ndcg = ndcg_score(y_true_sub, y_pred_sub, k=min(10, len(y_true_sub[0])))
            except ValueError:
                # Scoring stays best-effort for inputs the metric rejects, but
                # the number of dropped queries is reported rather than hidden.
                skipped_queries += 1
            else:
                fold_ndcg_scores.append(ndcg)

    if skipped_queries:
        print(f"Пропущено запросов при расчёте nDCG: {skipped_queries}")

    fold_ndcg = np.mean(fold_ndcg_scores) if fold_ndcg_scores else 0
    fold_scores.append(fold_ndcg)
    print(f"Fold {fold + 1} nDCG@10: {fold_ndcg:.4f}")

    # Feature importance is printed for the first fold only.
    if fold == 0:
        importance = pd.DataFrame({
            'feature': feature_cols,
            'gain': model.feature_importance(importance_type='gain')
        }).sort_values('gain', ascending=False)

        print("\nТоп-10 важных фич:")
        for _, row in importance.head(10).iterrows():
            print(f"  {row['feature']}: {row['gain']:.2f}")

    # Free the per-fold objects before the next fold allocates its own.
    del model, train_data, val_data
    gc.collect()

# Out-of-fold evaluation
print("\n" + "="*60)
print("РЕЗУЛЬТАТЫ ВАЛИДАЦИИ")
print("="*60)

train_features['oof_pred'] = oof_predictions
oof_ndcg_scores = []
oof_skipped_queries = 0

# One grouping pass instead of filtering the whole frame once per query.
# sort=False keeps queries in order of first appearance.
oof_by_query = train_features.groupby('query_id', sort=False)[['relevance', 'oof_pred']]
for _, query_data in oof_by_query:
    # nDCG needs at least two documents to rank.
    if len(query_data) > 1:
        y_true = query_data['relevance'].values.reshape(1, -1)
        y_pred = query_data['oof_pred'].values.reshape(1, -1)
        try:
            ndcg = ndcg_score(y_true, y_pred, k=min(10, len(query_data)))
        except ValueError:
            # Best-effort scoring: count rejected queries instead of hiding them.
            oof_skipped_queries += 1
        else:
            oof_ndcg_scores.append(ndcg)

if oof_skipped_queries:
    print(f"Пропущено запросов при расчёте nDCG: {oof_skipped_queries}")

print(f"Средний nDCG@10 по фолдам: {np.mean(fold_scores):.4f} (±{np.std(fold_scores):.4f})")
print(f"OOF nDCG@10: {np.mean(oof_ndcg_scores):.4f}")

# Mean and spread of the out-of-fold prediction for each relevance label.
print("\nАнализ предсказаний по relevance:")
for rel in sorted(train_features['relevance'].unique()):
    mask = train_features['relevance'] == rel
    if mask.sum() > 0:
        avg_pred = train_features.loc[mask, 'oof_pred'].mean()
        std_pred = train_features.loc[mask, 'oof_pred'].std()
        print(f"  Relevance {rel}: {mask.sum():5d} samples, pred: {avg_pred:.3f} ± {std_pred:.3f}")

# Post-processing
print("\nПрименяем постобработку...")

final_test_predictions = test_predictions.copy()

# Additive boosts applied to rows whose binary match feature equals 1:
# (feature column, boost amount, label used in the progress message).
boost_rules = [
    ('query_in_title', 0.5, 'exact matches'),
    ('contains_all', 0.3, 'contains_all matches'),
    ('title_starts_with_query', 0.2, 'title_starts_with_query'),
    ('brand_in_query', 0.15, 'brand_in_query'),
]

for feature_name, boost_amount, label in boost_rules:
    if feature_name not in test_features.columns:
        continue
    # Converted to a plain boolean array so indexing the prediction array is
    # purely positional and does not depend on the frame's index.
    hit_mask = (test_features[feature_name] == 1).to_numpy()
    if hit_mask.any():
        final_test_predictions[hit_mask] += boost_amount
        print(f"Усилено {hit_mask.sum()} {label} (+{boost_amount})")

if 'query_id' in test_features.columns:
    print("\nПрименяем group-wise enhancement...")
    test_features['model_score'] = final_test_predictions

    # Row positions grouped by query in one pass. sort=False keeps queries in
    # order of first appearance, so the seeded noise draws below happen in a
    # fixed order.
    row_positions = pd.Series(np.arange(len(test_features)))
    query_keys = test_features['query_id'].to_numpy()

    for _, positions in row_positions.groupby(query_keys, sort=False):
        idx = positions.to_numpy()
        if len(idx) <= 1:
            continue

        # Snapshot taken before any adjustment; the tie check below is based
        # on these values.
        scores = final_test_predictions[idx]

        # Stretch nearly flat groups around their mean.
        score_range = scores.max() - scores.min()
        if score_range < 0.1:
            mean_score = scores.mean()
            final_test_predictions[idx] = mean_score + (scores - mean_score) * 2.0

        # Break exact ties with a tiny amount of noise.
        if len(set(scores)) < len(scores):
            noise = np.random.uniform(-0.0001, 0.0001, size=len(idx))
            final_test_predictions[idx] += noise

# Build and save the submission
print("\n" + "="*60)
print("СОЗДАНИЕ SUBMISSION")
print("="*60)

# One row per test item: its id and the post-processed score.
submission = pd.DataFrame({
    'id': test['id'],
    'prediction': final_test_predictions
})

# Summary statistics of the final scores.
print(f"\nСтатистика предсказаний:")
print(f"Min: {final_test_predictions.min():.6f}")
print(f"Max: {final_test_predictions.max():.6f}")
print(f"Mean: {final_test_predictions.mean():.6f}")
print(f"Std: {final_test_predictions.std():.6f}")
print(f"Median: {np.median(final_test_predictions):.6f}")

print(f"\nПерцентили:")
percentiles = [0, 1, 5, 25, 50, 75, 95, 99, 100]
# All percentiles are computed in one call and printed in the listed order.
percentile_values = np.percentile(final_test_predictions, percentiles)
for p, value in zip(percentiles, percentile_values):
    print(f"  {p:3d}%: {value:.4f}")

# NOTE(review): hard-coded Windows drive path; adjust when running elsewhere.
submission_path = 'E:\\submission.csv'
submission.to_csv(submission_path, index=False)

print(f"\n{'='*60}")
print(f"✓ SUBMISSION СОХРАНЕН: {submission_path}")
print(f"Размер: {submission.shape}")
print(f"{'='*60}")

print(f"\nПервые 5 строк:")
print(submission.head())

print("\nРешение готово!")

Train shape: (49496, 11)
Test shape: (21184, 10)
Создание фич...

Подготовка данных для модели...
Количество фич: 32
Примеры фич: ['query_in_title', 'word_overlap', 'word_overlap_ratio', 'jaccard', 'contains_all', 'title_starts_with_query', 'query_len', 'title_len', 'query_word_count', 'title_word_count']

ОБУЧЕНИЕ МОДЕЛИ

Fold 1/5
----------------------------------------
Train samples: 39596
Val samples: 9900
Fold 1 nDCG@10: 0.8372

Топ-10 важных фич:
  char_3gram_overlap: 23713.62
  bm25_score: 3662.18
  composite_score_group_norm: 3609.98
  word_overlap_ratio_group_norm: 3094.06
  query_len: 2997.73
  len_ratio: 2897.79
  title_len: 2808.67
  composite_score: 2374.66
  bm25_score_group_norm: 2342.96
  query_word_count: 2237.65

Fold 2/5
----------------------------------------
Train samples: 39597
Val samples: 9899
Fold 2 nDCG@10: 0.8440

Fold 3/5
----------------------------------------
Train samples: 39597
Val samples: 9899
Fold 3 nDCG@10: 0.8391

Fold 4/5
------------------------