In [22]:
## import libraries
!pip install pyngrok streamlit

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import pickle



## Load Data
# Each Olist CSV is read from the current working directory into its own
# module-level DataFrame; names and files below are listed in matching order.
(
    customers,
    geolocation,
    order_items,
    order_payments,
    order_reviews,
    orders,
    products,
    sellers,
    category_translation,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'olist_customers_dataset.csv',
        'olist_geolocation_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'olist_orders_dataset.csv',
        'olist_products_dataset.csv',
        'product_category_name_translation.csv'.replace(
            'product_category_name_translation', 'olist_sellers_dataset'
        ),
        'product_category_name_translation.csv',
    )
)

print("All datasets loaded successfully")

## Merge datasets
def prepare_seller_data():
    """Build one row of performance statistics per seller.

    Reads the module-level ``orders``, ``order_payments``, ``order_items``,
    ``order_reviews`` and ``sellers`` DataFrames.

    Payments and reviews are first collapsed to one row per order. Joining
    them row-by-row onto order items would duplicate every item once per
    payment row and once per review row, inflating the item count and the
    revenue sum. Each order's payment is then split across its items in
    proportion to ``price + freight_value`` so that an order's payment is
    counted exactly once, even when the order contains several items or
    several sellers.

    Returns:
        DataFrame with ``seller_id``, ``total_orders``,
        ``total_products_sold``, ``avg_price``, ``avg_freight``,
        ``total_revenue``, ``avg_review_score`` followed by the columns of
        ``sellers``.
    """
    # Collapse to one row per order so the joins below cannot fan out.
    payments_per_order = (
        order_payments.groupby('order_id', as_index=False)['payment_value'].sum()
    )
    reviews_per_order = (
        order_reviews.groupby('order_id', as_index=False)['review_score'].mean()
    )

    # Merge orders with payments to get actual revenue
    orders_payments = (
        orders[['order_id', 'order_purchase_timestamp']]
        .drop_duplicates('order_id')
        .merge(payments_per_order, on='order_id', how='left')
    )

    # Merge with order items to get seller information; validate= makes any
    # future duplication on the right-hand side fail loudly.
    items = (
        order_items
        .merge(orders_payments, on='order_id', how='left', validate='many_to_one')
        .merge(reviews_per_order, on='order_id', how='left', validate='many_to_one')
    )

    # Allocate each order's payment to its items by value share. When an
    # order's items have no positive value, fall back to an equal split.
    item_value = items['price'] + items['freight_value']
    order_value = item_value.groupby(items['order_id']).transform('sum')
    items_in_order = items.groupby('order_id')['order_id'].transform('count')
    share = (item_value / order_value).where(order_value > 0, 1.0 / items_in_order)
    items['item_revenue'] = items['payment_value'] * share

    # Calculate seller statistics
    seller_stats = items.groupby('seller_id', as_index=False).agg(
        total_orders=('order_id', 'nunique'),
        total_products_sold=('order_item_id', 'count'),
        avg_price=('price', 'mean'),
        avg_freight=('freight_value', 'mean'),
        total_revenue=('item_revenue', 'sum'),
        avg_review_score=('review_score', 'mean'),
    )

    # Merge with seller location data
    seller_data = seller_stats.merge(sellers, on='seller_id', how='left')

    # Handle missing values: averages fall back to the across-seller mean,
    # missing revenue is treated as zero.
    seller_data = seller_data.fillna({
        'avg_review_score': seller_data['avg_review_score'].mean(),
        'avg_price': seller_data['avg_price'].mean(),
        'avg_freight': seller_data['avg_freight'].mean(),
        'total_revenue': 0,
    })

    return seller_data

# Build the per-seller table from the raw frames loaded above.
seller_data = prepare_seller_data()
print("Seller data prepared successfully")

## Create target variable
# A seller is labelled 1 when its total revenue is strictly above the median
# across sellers, otherwise 0.
median_revenue = seller_data['total_revenue'].median()
seller_data['profitable'] = seller_data['total_revenue'].gt(median_revenue).astype(int)
print("Target variable created:")
print(seller_data['profitable'].value_counts())

## Select features and split data
features = [
    'total_orders',
    'total_products_sold',
    'avg_price',
    'avg_freight',
    'avg_review_score',
]
X, y = seller_data[features], seller_data['profitable']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Train model
# Standardise the features, then fit a class-balanced random forest.
_forest = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
)
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', _forest),
])
model.fit(X_train, y_train)

## Evaluate model
# Hold-out metrics on the test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Model Evaluation:")
print("Accuracy:", accuracy)
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))

# Cross validation
# 5-fold accuracy over the full X / y (each fold refits a clone of the pipeline).
cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Save model and data
with open('seller_profitability_model_simple.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save complete seller data for Streamlit
seller_data.to_csv('cleaned_seller_data.csv', index=False)

print("Model and data saved successfully")

## Visualization
# Left panel: forest feature importances. Right panel: confusion matrix on
# the held-out test split.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Feature importance
rf = model.named_steps['clf']
importances = rf.feature_importances_
# Sorted ascending so the most important feature ends up at the top of the
# horizontal bar chart.
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values('Importance', ascending=True)

axes[0].barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
axes[0].set_title('Feature Importance')
axes[0].set_xlabel('Importance')

# Confusion Matrix
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=axes[1], cmap='Blues')
axes[1].set_title('Confusion Matrix')

plt.tight_layout()
plt.show()

## Additional analysis
print("\n Additional Analysis:")
print(f"Total sellers: {len(seller_data)}")
print(f"Profitable sellers: {seller_data['profitable'].sum()} ({seller_data['profitable'].mean()*100:.1f}%)")
print(f"Median revenue: BRL {median_revenue:,.2f}")

# Show top features
# Ranked from most to least important so the listing matches its heading
# (previously printed in declaration order).
print("\n Top Features by Importance:")
ranked_features = sorted(
    zip(features, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in ranked_features:
    print(f"{feature}: {importance:.3f}")


%%writefile app.py
import streamlit as st
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np

# Page configuration has to be the first Streamlit command of the script, so
# it runs before the cached loaders below are invoked.
st.set_page_config(page_title="Seller Profitability Predictor", page_icon="🛒", layout="centered")

# Model input columns. Defined at module level because Streamlit reruns the
# whole script on every click and only one button is "pressed" per run, so a
# name assigned inside one button branch is not visible to another.
FEATURES = ['total_orders', 'total_products_sold', 'avg_price', 'avg_freight', 'avg_review_score']


# Load model and data
@st.cache_resource
def load_model():
    """Unpickle the trained pipeline from the working directory."""
    # NOTE: pickle executes code on load; only use a model file you produced.
    with open("seller_profitability_model_simple.pkl", "rb") as f:
        return pickle.load(f)


@st.cache_data
def load_seller_data():
    """Read the per-seller table exported alongside the model."""
    return pd.read_csv("cleaned_seller_data.csv")


model = load_model()
seller_data = load_seller_data()

# Streamlit UI
st.title("Seller Profitability Prediction App")

st.write("""
Predict if a seller is **profitable (1)** or **not profitable (0)**
based on key performance indicators.
""")

# User input
st.subheader("Enter Seller Data")

col1, col2 = st.columns(2)
with col1:
    total_orders = st.number_input("Total Orders", min_value=0, step=1, value=50)
    total_products_sold = st.number_input("Total Products Sold", min_value=0, step=1, value=75)
    avg_price = st.number_input("Average Price (BRL)", min_value=0.0, step=1.0, value=120.0)

with col2:
    avg_freight = st.number_input("Average Freight (BRL)", min_value=0.0, step=1.0, value=18.0)
    avg_review_score = st.slider("Average Review Score", 1.0, 5.0, 4.2, 0.1)

# Prepare input: a single-row frame whose columns are exactly FEATURES.
input_df = pd.DataFrame([{
    "total_orders": total_orders,
    "total_products_sold": total_products_sold,
    "avg_price": avg_price,
    "avg_freight": avg_freight,
    "avg_review_score": avg_review_score
}])[FEATURES]

# Prediction
if st.button("Predict Profitability"):
    prediction = model.predict(input_df)[0]
    probability = model.predict_proba(input_df)[0]

    st.subheader("Prediction Result")
    if prediction == 1:
        st.success(f"Profitable Seller (Confidence: {probability[1]:.1%})")
    else:
        st.error(f"Not Profitable Seller (Confidence: {probability[0]:.1%})")

    # Show probabilities
    st.write(f"Probability of being profitable: {probability[1]:.1%}")
    st.write(f"Probability of not being profitable: {probability[0]:.1%}")

# Feature importance
if st.button("Show Feature Importance"):
    st.subheader("Feature Importance")
    rf = model.named_steps['clf']
    importances = rf.feature_importances_

    fig, ax = plt.subplots(figsize=(8, 4))
    y_pos = np.arange(len(FEATURES))
    ax.barh(y_pos, importances)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(FEATURES)
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance in Profitability Prediction')
    st.pyplot(fig)

# Seller recommendation
if st.button("Find Similar Profitable Sellers"):
    prediction = model.predict(input_df)[0]

    if prediction == 0:
        st.info("Finding profitable sellers with similar characteristics...")

        # Find profitable sellers with similar features
        profitable_sellers = seller_data[seller_data['profitable'] == 1].copy()

        if not profitable_sellers.empty:
            # Similarity = sum of per-feature absolute differences, each
            # divided by that feature's standard deviation so that
            # large-magnitude columns do not drown out small-range ones.
            # A zero or undefined spread is replaced by 1 to avoid dividing
            # by zero.
            feature_scale = seller_data[FEATURES].std().replace(0, 1).fillna(1)
            feature_diffs = (profitable_sellers[FEATURES] - input_df[FEATURES].iloc[0]).abs()
            profitable_sellers['total_diff'] = (feature_diffs / feature_scale).sum(axis=1)
            similar_sellers = profitable_sellers.nsmallest(3, 'total_diff')

            st.subheader("Recommended Profitable Sellers")

            for _, seller in similar_sellers.iterrows():
                st.write(f"**Seller ID:** {seller['seller_id']}")
                st.write(f"**Location:** {seller.get('seller_city', 'Unknown')}, {seller.get('seller_state', 'Unknown')}")
                st.write(f"**Performance:** {int(seller['total_orders'])} orders, {seller['avg_review_score']:.1f}★ rating")
                st.write(f"**Revenue:** BRL {seller['total_revenue']:,.2f}")
                st.write("---")
        else:
            st.warning("No profitable sellers found in the database.")
    else:
        st.success("Your seller is already profitable! 💸")

# NOTE(review): the accuracy figure below is a hard-coded string, not read
# from the loaded model; update it whenever the model is retrained.
st.sidebar.info("""
**About this app:**
- Predicts seller profitability using machine learning
- Trained on Olist e-commerce data
- Uses Random Forest classifier
- Accuracy: 94.7%
""")


!pip install pyngrok streamlit

from pyngrok import ngrok
import subprocess
import time
import os
import requests

def cleanup():
    """Best-effort teardown of leftover ngrok / Streamlit processes.

    Every step is attempted independently, so a failure in one (for example
    ``ngrok.kill()`` raising, or ``pkill``/``fuser`` not being installed)
    no longer skips the remaining steps. Failures are reported, never raised.
    """
    try:
        ngrok.kill()
    except Exception as e:  # best-effort: report and carry on
        print(f"Cleanup: {e}")

    # Argument lists instead of shell strings: no shell is spawned, so
    # `pkill -f` cannot match an intermediate `sh -c "... streamlit"` process.
    commands = (
        ["pkill", "-f", "streamlit"],
        ["pkill", "-f", "ngrok"],
        # Ports the launcher may try later in this file.
        ["fuser", "-k", "8501/tcp", "8502/tcp", "8503/tcp", "8504/tcp"],
    )
    for command in commands:
        try:
            subprocess.run(command, stderr=subprocess.DEVNULL, check=False)
        except OSError as e:  # tool missing or not executable
            print(f"Cleanup: {e}")

    # Give the OS a moment to release the sockets.
    time.sleep(3)

cleanup()

# SECURITY: the auth token must not be committed to source. It is read from
# the NGROK_AUTH_TOKEN environment variable, with an interactive prompt as a
# fallback. The token previously hard-coded here should be revoked in the
# ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
if not NGROK_AUTH_TOKEN:
    from getpass import getpass
    NGROK_AUTH_TOKEN = getpass("Enter your ngrok auth token: ").strip()
if not NGROK_AUTH_TOKEN:
    raise RuntimeError("No ngrok auth token provided (set NGROK_AUTH_TOKEN).")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
print("Ngrok authentication set successfully")

# Streamlit
def start_streamlit_app(port=8501, startup_timeout=20.0):
    """Launch ``app.py`` under Streamlit and wait until it answers HTTP.

    Args:
        port: TCP port to serve on; anything already bound to it is killed
            first (best-effort, via ``fuser``).
        startup_timeout: Seconds to keep polling ``http://localhost:<port>``
            for a 200 response before giving up.

    Returns:
        ``(process, port)`` when the app responds with HTTP 200, otherwise
        ``(None, port)``. A tuple is returned on every path, so callers can
        always unpack the result; the child process is terminated on failure.
    """
    process = None
    try:
        try:
            subprocess.run(
                ["fuser", "-k", f"{port}/tcp"],
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # fuser is unavailable; nothing to free, continue with the launch.
            pass
        time.sleep(2)

        process = subprocess.Popen([
            "streamlit", "run", "app.py",
            "--server.port", str(port),
            "--server.headless", "true",
            "--server.address", "0.0.0.0"
        ])

        print(f"Starting Streamlit on port {port}...")

        # Poll instead of sleeping a fixed time: return as soon as the server
        # is up, and stop early if the child process has already exited.
        deadline = time.monotonic() + startup_timeout
        while time.monotonic() < deadline and process.poll() is None:
            try:
                response = requests.get(f"http://localhost:{port}", timeout=5)
            except requests.RequestException:
                response = None  # not listening yet
            if response is not None and response.status_code == 200:
                print(f"Streamlit is running on port {port}")
                return process, port
            time.sleep(1)

        print(f"Streamlit failed to start on port {port}")
        if process.poll() is None:
            process.terminate()
        return None, port

    except Exception as e:
        print(f" Error starting Streamlit: {e}")
        # Do not leave a half-started child behind.
        if process is not None and process.poll() is None:
            process.terminate()
        return None, port

# Try each candidate port in turn and keep the first one Streamlit starts on.
ports_to_try = [8501, 8502, 8503, 8504]
streamlit_process = None
success_port = None

for port in ports_to_try:
    streamlit_process, success_port = start_streamlit_app(port)
    if streamlit_process:
        break

if not streamlit_process:
    print("Failed to start Streamlit on any port")
else:
    print(f"Streamlit running successfully on port {success_port}")

    # Ngrok pipeline
    try:
        print("🔗 Creating ngrok tunnel...")
        public_url = ngrok.connect(success_port, bind_tls=True)

        # ngrok.connect returns a tunnel object in current pyngrok releases;
        # show its URL rather than the object's repr. getattr keeps this
        # working if a plain string is returned instead.
        print("\n" + "="*50)
        print(" SUCCESS! Your App is Live! ")
        print("="*50)
        print(f"Public URL: {getattr(public_url, 'public_url', public_url)}")
        print("Share this link with anyone!")
        print("The app will auto-close when this session ends")
        print("="*50)

        tunnels = ngrok.get_tunnels()
        print(f" Active tunnels: {len(tunnels)}")
        for tunnel in tunnels:
            print(f"   - {tunnel.public_url} -> {tunnel.config['addr']}")

    except Exception as e:
        print(f"Failed to create ngrok tunnel: {e}")
        print("\n Alternative solutions:")
        print("1. Wait 2 minutes and try again")
        print("2. Visit https://dashboard.ngrok.com/endpoints/status")
        # Point at the port Streamlit actually bound, not a fixed 8501.
        print(f"3. Use this command: !ngrok http {success_port}")
