# 🎯 Buyer Intent Prediction Model
Predict whether a user will place an order based on behavior and persona.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 📁 Load Data

In [None]:
# Read the four raw tables from CSV files given by relative path.
# `returns` is loaded here but is not referenced by any other visible cell.
users, orders, views, returns = (
    pd.read_csv(csv_name)
    for csv_name in ('users.csv', 'orders.csv', 'product_views.csv', 'returns.csv')
)

## 🛠️ Feature Engineering

In [None]:
# Per-user order counts. Every user_id that appears in `orders` has at least
# one row there, so the constant flag marks those users as buyers; this flag
# becomes the prediction target.
order_counts = (
    orders.groupby('user_id')
    .size()
    .reset_index(name='total_orders')
    .assign(order_placed=1)
)

# Per-user number of rows in the product-view table.
view_counts = views.groupby('user_id').size().reset_index(name='total_views')

# Left joins keep every row of `users`. Users with no orders or no views get
# NaN in the joined columns, which is replaced by 0.
# NOTE: total_orders > 0 exactly when order_placed == 1, so total_orders
# should not be used as a predictor of order_placed.
user_summary = (
    users
    .merge(order_counts, on='user_id', how='left')
    .merge(view_counts, on='user_id', how='left')
    .fillna(dict.fromkeys(['total_orders', 'order_placed', 'total_views'], 0))
)

# One-hot encode the two categorical columns; drop_first omits one level of
# each so the dummy columns are not perfectly collinear.
user_summary = pd.get_dummies(user_summary, columns=['persona', 'age_group'], drop_first=True)

## 🔍 Model Training and Evaluation

In [None]:
# Features: every column except the identifier, the target, and `total_orders`.
# `total_orders` is excluded because the target is derived from it
# (order_placed == 1 exactly when total_orders > 0); keeping it would let the
# model read the answer directly and inflate every reported metric.
# NOTE(review): any remaining non-numeric column coming from users.csv would
# make fit() fail — confirm the users schema.
X = user_summary.drop(['user_id', 'order_placed', 'total_orders'], axis=1)
y = user_summary['order_placed']

# Hold out 20% of the users for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well, so metrics and feature importances are
# reproducible from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out users.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

## 📊 Feature Importance

In [None]:
# Pair each feature column with the importance score from the fitted forest
# and rank them from most to least important.
features = X.columns
importances = model.feature_importances_
imp_df = pd.DataFrame({'Feature': features, 'Importance': importances})
imp_df = imp_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=imp_df.head(10), x='Importance', y='Feature', ax=ax)
ax.set_title("Top 10 Feature Importances")
fig.tight_layout()
plt.show()

## 🧠 Conclusion
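- Accuracy: not fixed; it depends on the input data. See the classification report printed in the evaluation cell for the values of a given run.
- Most important features: typically `total_views`, the persona types, and the age group. Note that `total_orders` determines `order_placed` exactly, so it has to be left out of the feature set for these importances to be meaningful.
- Limitation: the model is trained once on a static snapshot of the CSV files, so it does not reflect behavior that changes after the export.
- Future scope: live retraining with real-time logs.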