In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import xgboost as xgb
import pickle
from sklearn.tree import DecisionTreeClassifier

# Load Data
customers = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_customers_dataset.csv')
geolocation = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_geolocation_dataset.csv')
order_items = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_order_items_dataset.csv')
order_payments = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_order_payments_dataset.csv')
order_reviews = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_order_reviews_dataset.csv')
orders = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_orders_dataset.csv')
products = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_products_dataset.csv')
sellers = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/olist_sellers_dataset.csv')
category_translation = pd.read_csv('/home/faial/code/ifai2/E_commerce_Sales_Insights/product_category_name_translation.csv')

print("All datasets loaded successfully")

# Merge datasets
def prepare_seller_data():
    orders_payments = orders.merge(
        order_payments[['order_id', 'payment_value']],
        on='order_id',
        how='left'
    )
    orders_items_payments = order_items.merge(
        orders_payments[['order_id', 'order_purchase_timestamp', 'payment_value']],
        on='order_id',
        how='left'
    ).merge(
        order_reviews[['order_id', 'review_score']],
        on='order_id',
        how='left'
    )

    seller_stats = orders_items_payments.groupby('seller_id').agg({
        'order_id': 'nunique',
        'order_item_id': 'count',
        'price': 'mean',
        'freight_value': 'mean',
        'payment_value': 'sum',
        'review_score': 'mean'
    }).reset_index()

    seller_stats.columns = [
        'seller_id', 'total_orders', 'total_products_sold',
        'avg_price', 'avg_freight', 'total_revenue', 'avg_review_score'
    ]

    seller_data = seller_stats.merge(sellers, on='seller_id', how='left')
    seller_data.fillna({
        'avg_review_score': seller_data['avg_review_score'].mean(),
        'avg_price': seller_data['avg_price'].mean(),
        'avg_freight': seller_data['avg_freight'].mean(),
        'total_revenue': 0
    }, inplace=True)

    return seller_data

seller_data = prepare_seller_data()
print("Seller data prepared successfully")

# Create target variable
median_revenue = seller_data['total_revenue'].median()
seller_data['profitable'] = (seller_data['total_revenue'] > median_revenue).astype(int)
print("Target variable created:")
print(seller_data['profitable'].value_counts())

# Select features and split data
features = ['total_orders', 'total_products_sold', 'avg_price', 'avg_freight', 'avg_review_score']
X = seller_data[features]
y = seller_data['profitable']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Models to try
models = {
    "RandomForest": RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'),
    "XGBoost": xgb.XGBClassifier(n_estimators=150, max_depth=10, random_state=42, scale_pos_weight=len(y_train) / sum(y_train)),
    "LogisticRegression": LogisticRegression(random_state=42, class_weight='balanced'),
    "SVM": SVC(kernel='rbf', random_state=42, class_weight='balanced'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "AdaBoost": AdaBoostClassifier(  # No base_estimator, defaults to DecisionTreeClassifier
        n_estimators=150,
        random_state=42
    )
}

# Train and evaluate each model
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    results[name] = {
        "accuracy": accuracy,
        "confusion_matrix": confusion,
        "classification_report": report,
        "cross_val_mean_accuracy": cv_scores.mean(),
        "cross_val_scores": cv_scores
    }

    print(f"{name} - Accuracy: {accuracy:.4f}")
    print(f"Confusion Matrix:\n{confusion}")
    print(f"Classification Report:\n{report}")
    print(f"Cross-validation mean accuracy: {cv_scores.mean():.4f}")

# Visualizing feature importance and confusion matrix for Random Forest
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
rf = models['RandomForest']
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values('Importance', ascending=True)

axes[0].barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
axes[0].set_title('Feature Importance')
axes[0].set_xlabel('Importance')

ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, ax=axes[1], cmap='Blues')
axes[1].set_title('Confusion Matrix')

plt.tight_layout()
plt.show()

# Save model and data
with open('seller_profitability_model.pkl', 'wb') as f:
    pickle.dump(models['RandomForest'], f)

seller_data.to_csv('cleaned_seller_data.csv', index=False)

print("Model and data saved successfully")

# Visualization of top 5 best sellers and other insights
top_5_sellers = seller_data.sort_values(by='total_revenue', ascending=False).head(5)
print(f"üèÜ Top 5 Best Sellers:\n{top_5_sellers[['seller_id', 'seller_city', 'seller_state', 'total_revenue', 'total_orders', 'avg_review_score']]}")


All datasets loaded successfully
Seller data prepared successfully
Target variable created:
profitable
0    1548
1    1547
Name: count, dtype: int64


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [7]:
import sklearn
print(sklearn.__version__)


1.6.1
