# 🧪 Train/Test Split, Cross-Validation, and Feature Store Usage
This notebook demonstrates how to split data for training, apply cross-validation, and persist features and models — first with a simple file-based mock feature store, then with a local Feast feature repository.

In [None]:
from pathlib import Path

import joblib  # for saving/loading models and features
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score

## 📄 Create Sample Dataset

In [None]:
# Build a tiny synthetic customer table: two numeric features plus a binary target.
data = dict(
    age=[25, 30, 35, 40, 45, 50, 55, 60],
    income=[50000, 60000, 70000, 80000, 85000, 90000, 95000, 100000],
    purchased=[0, 0, 1, 0, 1, 1, 1, 1],
)
df = pd.DataFrame(data)

# Separate the feature matrix (X) from the label vector (y).
feature_columns = ['age', 'income']
X = df[feature_columns]
y = df['purchased']

df

## ✂️ Train/Test Split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print('Training data shape:', X_train.shape)
print('Testing data shape:', X_test.shape)

## 🔁 Cross-Validation

In [None]:
# Score a logistic-regression estimator with shuffled 4-fold cross-validation
# over the full dataset (X, y), using accuracy as the metric.
model = LogisticRegression()
cv = KFold(n_splits=4, shuffle=True, random_state=1)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

mean_accuracy = scores.mean()
print('Cross-validation scores:', scores)
print('Average accuracy:', mean_accuracy)

## 💾 Train and Save Model to Feature Store

In [None]:
# Make sure the target directories exist before writing: neither joblib.dump
# nor DataFrame.to_csv creates missing parent directories, so on a fresh
# checkout the saves below would otherwise fail.
Path('model_store').mkdir(parents=True, exist_ok=True)
Path('feature_store').mkdir(parents=True, exist_ok=True)

# Train the model on the training split
model.fit(X_train, y_train)

# Save the fitted model
joblib.dump(model, 'model_store/logreg_model.pkl')

# Save the training features (index dropped so the CSV holds only feature columns)
X_train.to_csv('feature_store/train_features.csv', index=False)
print('Model and features saved.')

## 📥 Load Features and Model from Store

In [None]:
# Restore the persisted estimator and the saved training features from disk.
model_file = 'model_store/logreg_model.pkl'
features_file = 'feature_store/train_features.csv'

loaded_model = joblib.load(model_file)
loaded_features = pd.read_csv(features_file)

print('Loaded model and features:')
print(loaded_features.head())

## ✅ Predict with Loaded Model

In [None]:
# Evaluate the reloaded estimator on the held-out test rows.
y_pred = loaded_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', test_accuracy)

## 🏪 Storing Features in a Feature Store with Feast

In [None]:
# Install Feast if it is not already available (uncomment if running interactively).
# Prefer the %pip magic over !pip so the package lands in the running kernel's environment.
# %pip install feast

In [None]:
# Import Feast libraries
from feast import FeatureStore, Entity, FeatureView, Field
from feast.types import Int64, Float64
from datetime import datetime
from pathlib import Path
import os

In [None]:
# Create a local repo directory for Feast (if missing) and make it the working
# directory, since the following cells write repo files with relative paths.
# The guard keeps this cell idempotent: once the kernel is already inside
# feast_repo, re-running must not create and enter a nested feast_repo/feast_repo.
repo_path = Path('feast_repo')
if Path.cwd().name != repo_path.name:
    repo_path.mkdir(exist_ok=True)
    os.chdir(repo_path)

# Absolute path of the repo, captured after the (possible) directory change.
repo_path_absolute = str(Path.cwd())

In [None]:
# Create sample feature data: one row per user, keyed by a synthetic user_id.
# A single timezone-aware UTC timestamp is shared by all rows; this replaces
# datetime.utcnow(), which is deprecated and returns naive datetimes.
event_time = pd.Timestamp.now(tz='UTC')
entity_df = pd.DataFrame({
    'user_id': range(len(X)),
    'age': X['age'],
    'income': X['income'],
    'event_timestamp': event_time,
})

# Persist the rows as the parquet file that the Feast file source points at.
entity_df.to_parquet('feature_data.parquet')

In [None]:
# Feast repo configuration: local provider, a registry and online store under
# data/, and a file-based offline store. Written into the current directory.
feature_repo_yaml = '''
project: feast_demo
registry: data/registry.db
provider: local
online_store:
    path: data/online_store.db
offline_store:
    type: file
'''
config_file = Path('feature_store.yaml')
config_file.write_text(feature_repo_yaml)

In [None]:
# Define the entity, data source and feature view in a Python module inside the
# repo directory so that `feast apply` can discover and register them.
#
# Fixes relative to the earlier definition, for the Field/schema-based Feast API:
#   * FileSource is imported from the top-level `feast` package
#     (there is no `feast.file_source` module).
#   * FileSource takes `timestamp_field` (the replacement for
#     `event_timestamp_column`).
#   * FeatureView.entities receives the Entity object rather than its name.
#   * age/income are declared Int64 to match the integer columns written to
#     feature_data.parquet.
feature_view_code = '''
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource
from feast.types import Int64

# Parquet file with one row per user; event_timestamp marks when each row was observed.
user_df_source = FileSource(
    path="feature_data.parquet",
    timestamp_field="event_timestamp",
)

user = Entity(name="user_id", join_keys=["user_id"])

user_features_view = FeatureView(
    name="user_features",
    entities=[user],
    ttl=timedelta(days=1),
    schema=[
        Field(name="age", dtype=Int64),
        Field(name="income", dtype=Int64),
    ],
    source=user_df_source,
    online=True,
)
'''
Path('user_features.py').write_text(feature_view_code)

In [None]:
# Initialize and apply the feature repo
!feast init .
!feast apply

In [None]:
# Materialize features
!feast materialize-incremental $(date +%F)

In [None]:
# Read the online feature values back from Feast for every user_id in the dataset.
store = FeatureStore(repo_path_absolute)

feature_refs = ["user_features:age", "user_features:income"]
entity_rows = [{"user_id": user_id} for user_id in range(len(X))]

online_response = store.get_online_features(
    features=feature_refs,
    entity_rows=entity_rows,
)
retrieved = online_response.to_df()
retrieved.head()