<a href="https://colab.research.google.com/github/kumkvattiinc-lab/kumkvatti/blob/main/Untitled27.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -o overwrites existing files without prompting, so re-running this cell
# (e.g. Restart & Run All) does not stall on unzip's "replace ...?" question.
!unzip -o stage1_individual_data.zip

Archive:  stage1_individual_data.zip
  inflating: book_descriptions.csv   
  inflating: book_genres.csv         
  inflating: books.csv               
  inflating: genres.csv              
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: users.csv               


In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import gc
import pickle

def load_data():
    """Read the training and test tables from the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` parsed from ``train.csv`` and ``test.csv``
        respectively, in that order.
    """
    # Tuple elements are evaluated left to right: train is read before test.
    return pd.read_csv('train.csv'), pd.read_csv('test.csv')

def create_features(train, test):
    """Add per-user and per-book rating statistics to both frames.

    For each of ``user_id`` and ``book_id`` the rating ``mean``, ``std``,
    ``count``, ``min``, ``max`` and ``range`` (max - min) are computed from
    ``train`` and mapped onto both ``train`` and ``test``. Two interaction
    columns, ``user_book_diff`` and ``user_book_avg``, are then derived from
    the user and book means.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``user_id``, ``book_id`` and ``rating``.
    test : pandas.DataFrame
        Must contain ``user_id`` and ``book_id``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with the feature columns appended. The
        frames passed in are left untouched.

    Notes
    -----
    * Ids that do not occur in ``train`` (and undefined statistics such as
      the std of a single rating) are filled with 0.
    * The statistics are computed from every row of ``train``, so each
      training row's features include that row's own rating.
      NOTE(review): this can make validation scores optimistic - confirm
      whether out-of-fold statistics are wanted.
    """
    # Work on copies so the caller's frames are not modified as a side effect.
    train = train.copy()
    test = test.copy()

    stat_names = ['mean', 'std', 'count', 'min', 'max']
    for key, prefix in (('user_id', 'user'), ('book_id', 'book')):
        stats = train.groupby(key)['rating'].agg(stat_names).fillna(0)
        stats.columns = [f'{prefix}_{name}' for name in stat_names]
        stats[f'{prefix}_range'] = stats[f'{prefix}_max'] - stats[f'{prefix}_min']

        for col in stats.columns:
            # Ids missing from the lookup map to NaN; replace those with 0.
            train[col] = train[key].map(stats[col]).fillna(0)
            test[col] = test[key].map(stats[col]).fillna(0)

    for frame in (train, test):
        frame['user_book_diff'] = frame['user_mean'] - frame['book_mean']
        frame['user_book_avg'] = (frame['user_mean'] + frame['book_mean']) / 2

    return train, test

def prepare_data(train, test):
    """Turn the raw train/test frames into model-ready arrays.

    Training rows are restricted to ``has_read == 1``, features are added via
    ``create_features``, and every ``int64``/``float64`` column that is not an
    identifier, the target or a bookkeeping column is used as a feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``has_read``, ``rating`` and ``user_id`` plus whatever
        ``create_features`` requires.
    test : pandas.DataFrame
        Frame to score; must contain whatever ``create_features`` requires.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, feature_cols, groups)`` where the ``X_*``
        values are NumPy arrays with NaNs replaced by 0, ``feature_cols`` is
        the ordered list of feature names and ``groups`` holds the
        ``user_id`` of every training row.
    """
    train = train[train['has_read'] == 1].copy()
    train, test = create_features(train, test)

    excluded = {'user_id', 'book_id', 'rating', 'has_read', 'timestamp', 'Unnamed: 0'}
    feature_cols = [
        col for col in train.columns
        if col not in excluded
        and train[col].dtype in ['int64', 'float64']
        # A feature the test frame lacks cannot be used at prediction time;
        # skipping it avoids a KeyError when selecting test[feature_cols].
        and col in test.columns
    ]

    X_train = train[feature_cols].fillna(0).values
    y_train = train['rating'].values
    X_test = test[feature_cols].fillna(0).values

    return X_train, y_train, X_test, feature_cols, train['user_id'].values

def train_model(X_train, y_train, X_test, groups, feature_cols, n_splits=5):
    """Train one random forest per GroupKFold fold and average the test predictions.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and targets; indexed positionally by the fold indices.
    X_test : numpy.ndarray
        Features of the rows to predict.
    groups : array-like
        Group label per training row; rows sharing a label are kept in the
        same fold by ``GroupKFold``.
    feature_cols : list of str
        Feature names, in the column order of ``X_train``.
    n_splits : int, default 5
        Number of folds. The same value is used to average the per-fold test
        predictions, so the two can no longer drift apart.

    Returns
    -------
    tuple
        ``(test_predictions, models, importance_df, oof_predictions, rmse)``:
        fold-averaged test predictions, the fitted models, a DataFrame of
        mean feature importances sorted descending, the out-of-fold
        predictions for the training rows, and the RMSE of those out-of-fold
        predictions against ``y_train``.

    Side effects
    ------------
    Writes ``feature_importance.csv`` and ``models.pkl`` to the working
    directory.
    """
    oof_predictions = np.zeros(len(X_train))
    test_predictions = np.zeros(len(X_test))
    models = []

    gkf = GroupKFold(n_splits=n_splits)

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_train, y_train, groups)):
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            bootstrap=True,
            random_state=42 + fold,  # a different seed for each fold
            n_jobs=-1,
            verbose=0
        )

        model.fit(X_train[train_idx], y_train[train_idx])

        oof_predictions[val_idx] = model.predict(X_train[val_idx])
        # Each fold contributes an equal share of the final test prediction.
        test_predictions += model.predict(X_test) / n_splits
        models.append(model)

        gc.collect()

    # Average the per-model importances into a single ranking.
    feature_importance = np.zeros(len(feature_cols))
    for model in models:
        feature_importance += model.feature_importances_
    feature_importance /= len(models)

    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': feature_importance
    })
    importance_df = importance_df.sort_values('importance', ascending=False)
    importance_df.to_csv('feature_importance.csv', index=False)

    with open('models.pkl', 'wb') as f:
        pickle.dump(models, f)

    rmse = np.sqrt(mean_squared_error(y_train, oof_predictions))

    return test_predictions, models, importance_df, oof_predictions, rmse

def create_submission(test, predictions):
    """Write the predictions to ``submission.csv`` and return them as a frame.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``user_id`` and ``book_id``; other columns are ignored.
    predictions : array-like
        One predicted rating per row of ``test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``book_id`` and ``rating_predict``, where the
        predictions have been clipped to the interval [0, 10]. The same
        frame is written to ``submission.csv`` without the index and with
        floats formatted to six decimals.
    """
    bounded = np.clip(predictions, 0.0, 10.0)
    submission = test[['user_id', 'book_id']].assign(rating_predict=bounded)
    submission.to_csv('submission.csv', index=False, float_format='%.6f')
    return submission

def main():
    """Run the pipeline end to end: load, featurize, train, write submission.

    Prints the out-of-fold RMSE returned by ``train_model`` once the
    submission has been created.
    """
    train, test = load_data()
    X_train, y_train, X_test, feature_cols, groups = prepare_data(train, test)

    # train_model returns a 5-tuple; only the first (test predictions) and
    # the last (RMSE) elements are needed here.
    results = train_model(X_train, y_train, X_test, groups, feature_cols)
    test_predictions, rmse = results[0], results[-1]

    create_submission(test, test_predictions)
    print(f"RMSE: {rmse:.4f}")

# Run the whole pipeline when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__":
    main()

RMSE: 2.0740
