In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

In [2]:
# Load the modelling dataset into `df_model_data`.
# A missing file is reported by raising: calling exit() inside a notebook ends
# the whole kernel session instead of surfacing an error in this cell.
DATASET_PATH = 'dataset/model1_dataset.csv'
try:
    df_model_data = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: '{DATASET_PATH}' not found. Please ensure the file is uploaded."
    ) from err
print("Dataset 'model1_dataset.csv' loaded successfully.")

Dataset 'model1_dataset.csv' loaded successfully.


In [3]:
# Name of the label column; every later cell refers to it through this constant.
TARGET_COLUMN = 'has_purchased'

# Stop with a regular exception when the label is absent. exit() would end the
# kernel session rather than report the problem in this cell.
if TARGET_COLUMN not in df_model_data.columns:
    raise KeyError(f"Error: Target column '{TARGET_COLUMN}' not found in the dataset.")

In [4]:
# Target vector: the column of the loaded frame named by TARGET_COLUMN.
y = df_model_data[TARGET_COLUMN]

In [5]:
# Columns that must not reach the feature matrix. Entries missing from the
# loaded frame are filtered out in a later cell, so listing extras is harmless.
columns_to_drop_from_features = [
    # The label itself
    TARGET_COLUMN,
    # Identifiers
    'customer_id',
    'product_id',
    # Free-text fields
    'product_name',
    'product_description',
    # CRITICAL: removed because it leaks the target
    'purchase_count',
    # Possibly left over from the original merge; not intended as direct features
    'customer_name',
    'customer_email',
    'customer_address',
    'customer_phone_number',
]

In [6]:
# Keep only the drop candidates that are really present in the loaded frame,
# preserving their order in columns_to_drop_from_features.
actual_columns_to_drop = list(
    filter(lambda name: name in df_model_data.columns, columns_to_drop_from_features)
)

In [7]:
# Feature matrix: the loaded frame minus the target and the excluded columns.
X = df_model_data.drop(columns=actual_columns_to_drop)

# Report which columns survived so the feature set can be checked by eye.
print(
    "\nColumns remaining in X (features set, after removing target and problematic/irrelevant columns):",
    list(X.columns),
    sep="\n",
)


Columns remaining in X (features set, after removing target and problematic/irrelevant columns):
['age', 'city', 'preferred_product_id', 'category', 'brand', 'price', 'discount', 'storage', 'color', 'release_year', 'year_since_release', 'sales_factor']


In [8]:
# Partition the columns of X into numerical and categorical feature lists,
# driven by dtype: int64/float64 columns are numerical unless named as an ID,
# everything else is categorical.
#
# NOTE(review): the hard-coded names below ('customer_age', 'product_base_price',
# 'customer_preferred_product_id', ...) do not match the columns actually in X
# ('age', 'price', 'preferred_product_id', ...). Neither name check can match, so
# every int64/float64 column falls through to the final inner `else`, and
# 'preferred_product_id' ends up in numerical_features instead of
# categorical_features. Confirm the intended treatment of the ID column before
# renaming: moving it changes what the downstream encoder and classifier receive.
numerical_features = []
categorical_features = []

for col in X.columns:
    if X[col].dtype in ['int64', 'float64']:
    # Explicitly define based on business logic and data type
        if col in ['customer_age', 'product_base_price', 'product_default_discount','product_release_year', 'product_year_since_release', 'product_sales_factor']:
            numerical_features.append(col)
    # Assuming 'customer_preferred_product_id' is an ID and should be treated as categorical if it's not a true continuous number
        elif col in ['customer_preferred_product_id']:
            categorical_features.append(col)
        else: # Catch any other numerical columns
            numerical_features.append(col)
    else: # Treat as categorical (object, bool, etc.)
        categorical_features.append(col)

In [9]:
# Median-impute every numerical feature that has at least one missing value.
# The median is taken over the whole of X (this cell runs before the split).
for col in numerical_features:
    if col not in X.columns or not X[col].isnull().any():
        continue  # nothing to fill for this column
    X[col] = X[col].fillna(X[col].median())

In [10]:
# Fill gaps in categorical features with an explicit 'Missing' level
# (the column mode, X[col].mode()[0], would be the alternative).
for col in categorical_features:
    if col not in X.columns or not X[col].isnull().any():
        continue  # nothing to fill for this column
    X[col] = X[col].fillna('Missing')

# Show the final feature typing that the preprocessing pipeline will use.
print(f"\nIdentified Numerical features: {numerical_features}")
print(f"Identified Categorical features: {categorical_features}")


Identified Numerical features: ['age', 'preferred_product_id', 'price', 'discount', 'release_year', 'year_since_release', 'sales_factor']
Identified Categorical features: ['city', 'category', 'brand', 'storage', 'color']


In [11]:
# Numerical branch of the preprocessing: a single standard-scaling step.
numerical_transformer = Pipeline([('scaler', StandardScaler())])

In [12]:
# Categorical branch: one-hot encoding; levels not seen during fit are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [13]:
# Route each feature group through its own transformer and stack the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',  # Drop any columns not specified
    # The one-hot branch yields a sparse matrix, and by default the stacked
    # result stays sparse whenever its overall density is below 0.3. The
    # downstream GaussianNB needs a dense array, so always return dense output
    # rather than relying on the data happening to be dense enough.
    sparse_threshold=0,
)

In [14]:
# End-to-end estimator: preprocessing followed by a Gaussian Naive Bayes classifier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
])

In [15]:
# Gabungkan X dan y menjadi satu DataFrame
df_encoded = X.copy()
df_encoded[TARGET_COLUMN] = y

# Simpan ke CSV
df_encoded.to_csv('dataset/encoded_model1_dataset.csv', index=False)
print("Encoded dataset saved as 'encoded_model1_dataset.csv'.")

Encoded dataset saved as 'encoded_model1_dataset.csv'.


In [16]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTrain set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")


Train set size: 274707 rows
Test set size: 68677 rows


In [17]:
# Fit the whole pipeline (preprocessing + classifier) on the training split only.
print("\nTraining Naive Bayes model...")
model.fit(X=X_train, y=y_train)
print("Model training complete.")


Training Naive Bayes model...
Model training complete.


In [18]:
# Hard class predictions for the held-out test split.
y_pred = model.predict(X_test)

In [19]:
# Positive-class probabilities are needed for ROC-AUC; stay at None when the
# estimator cannot provide them.
y_proba = None
if hasattr(model, 'predict_proba'):
    y_proba = model.predict_proba(X_test)[:, 1]  # Probability of the positive class (1)
else:
    print("Warning: predict_proba is not available for this model, skipping ROC-AUC calculation.")

# Threshold-based metrics, all computed from the hard predictions in y_pred.
print("\n--- Model Evaluation ---")
for label, scorer in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{label}: {scorer(y_test, y_pred):.4f}")

# Ranking metric, only when probabilities were available.
if y_proba is None:
    print("ROC-AUC not calculated.")
else:
    print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Closing summary for the reader (messages are in Indonesian).
print("\nSintaks untuk model Naive Bayes telah dibuat dan dieksekusi.")
print("Dataset yang digunakan adalah 'model1_dataset.csv'.")
print("Kolom 'has_purchased' adalah target, dan kolom-kolom lain yang relevan digunakan sebagai fitur.")


--- Model Evaluation ---
Accuracy: 0.4978
Precision: 0.4981
Recall: 0.5864
F1-Score: 0.5387
ROC-AUC: 0.4990

Sintaks untuk model Naive Bayes telah dibuat dan dieksekusi.
Dataset yang digunakan adalah 'model1_dataset.csv'.
Kolom 'has_purchased' adalah target, dan kolom-kolom lain yang relevan digunakan sebagai fitur.


In [None]:
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Collect the evaluation metrics in one dictionary (insertion order is the
# order in which they are later written out and displayed).
metric_functions = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}
evaluation_results = {
    name: scorer(y_test, y_pred) for name, scorer in metric_functions.items()
}
if y_proba is not None:
    evaluation_results["roc_auc"] = roc_auc_score(y_test, y_proba)

# Bundle the fitted pipeline together with its metrics.
model_and_metrics = {"model": model, "evaluation": evaluation_results}

# Save the bundle to a .pkl file
with open('naive_bayes_model.pkl', 'wb') as model_file:
    pickle.dump(model_and_metrics, model_file)
print("Model dan hasil evaluasi berhasil disimpan ke 'naive_bayes_model.pkl'.")

# Save the evaluation results to a text file, one "name: value" line per metric
with open('dataset/model_evaluation.txt', 'w') as report_file:
    report_file.writelines(
        f"{metric}: {value:.4f}\n" for metric, value in evaluation_results.items()
    )
print("Hasil evaluasi model berhasil disimpan ke 'model_evaluation.txt'.")

Model dan hasil evaluasi berhasil disimpan ke 'naive_bayes_model.pkl'.
Hasil evaluasi model berhasil disimpan ke 'model_evaluation.txt'.


In [21]:
import pickle
import numpy as np
import pandas as pd
from flask import Flask, render_template_string, request

In [22]:
# Load model and evaluation results
with open('naive_bayes_model.pkl', 'rb') as f:
    model_and_metrics = pickle.load(f)
model = model_and_metrics['model']
evaluation = model_and_metrics['evaluation']

In [23]:
# Ambil nama fitur dari model pipeline
feature_names = model.named_steps['preprocessor'].transformers_[0][2] + model.named_steps['preprocessor'].transformers_[1][2]

In [27]:
!ngrok config add-authtoken 2pbnehUeLOtB0htlrVKOrlYmUH0_2UDLiDAH3cZpPjP896ubA

Authtoken saved to configuration file: C:\Users\User\AppData\Local/ngrok/ngrok.yml


In [29]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
     ---------------------------------------- 9.9/9.9 MB 2.9 MB/s eta 0:00:00
Collecting altair<6,>=4.0
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
     -------------------------------------- 731.2/731.2 kB 4.2 MB/s eta 0:00:00
Collecting pyarrow>=7.0
  Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl (25.8 MB)
     ---------------------------------------- 25.8/25.8 MB 3.1 MB/s eta 0:00:00
Collecting tenacity<10,>=8.1.0
  Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Collecting toml<2,>=0.10.1
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.44-py3-none-any.whl (207 kB)
     ------------------------------------ 207.6/207.6 kB 972.2 kB/s eta 0:00:00
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
     ---------------------------------------- 6.9/6.9 MB 2.2 MB/s eta 0:00:00
Collect


[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
import streamlit as st
import pandas as pd
import numpy as np
import pickle

# Year used to derive "years since release" from the release year.
# NOTE(review): presumably this should match the reference year used when the
# training data's 'year_since_release' column was built - confirm.
REFERENCE_YEAR = 2025

# Set page configuration
st.set_page_config(
    page_title="Customer Purchase Prediction",
    page_icon="🛍️",
    layout="wide"
)


# Load the model
@st.cache_resource
def load_model():
    """Read the pickled bundle and return it.

    Returns:
        dict with the fitted pipeline under 'model' and the metric dictionary
        under 'evaluation', as written by the training cells.
    """
    # pickle.load can execute arbitrary code: only load a file you produced.
    with open('naive_bayes_model.pkl', 'rb') as f:
        model_and_metrics = pickle.load(f)
    return model_and_metrics


# Load model and evaluation metrics
model_and_metrics = load_model()
model = model_and_metrics['model']
evaluation = model_and_metrics['evaluation']

# Recover the exact input schema from the fitted preprocessor. The pipeline
# selects its columns by name, so the frame passed to predict() must carry the
# training column names ('age', 'price', 'city', ...) for every feature.
fitted_transformers = {
    name: (transformer, columns)
    for name, transformer, columns in model.named_steps['preprocessor'].transformers_
}
numerical_features = list(fitted_transformers['num'][1])
categorical_features = list(fitted_transformers['cat'][1])
expected_features = numerical_features + categorical_features

# Levels learned by the one-hot encoder, used as the choices for each
# categorical input (categories_ is ordered like categorical_features).
onehot_encoder = fitted_transformers['cat'][0].named_steps['onehot']
category_options = {
    feature: levels.tolist()
    for feature, levels in zip(categorical_features, onehot_encoder.categories_)
}

# App title
st.title("🛍️ Customer Purchase Prediction")
st.markdown("### Predict if a customer will make a purchase based on their characteristics")

# Values entered by the user, keyed by training column name.
feature_values = {}

# Create two columns
col1, col2 = st.columns(2)

with col1:
    st.subheader("Customer Information")
    feature_values['age'] = st.number_input("Customer Age", min_value=18, max_value=100, value=30)
    feature_values['preferred_product_id'] = st.number_input("Customer Preferred Product ID", value=1)

with col2:
    st.subheader("Product Information")
    feature_values['price'] = st.number_input("Product Base Price", min_value=0.0, value=100.0)
    # NOTE(review): assumes the training 'discount' column is expressed in percent - confirm.
    feature_values['discount'] = st.number_input("Product Default Discount (%)", min_value=0.0, max_value=100.0, value=0.0)
    feature_values['release_year'] = st.number_input("Product Release Year", min_value=2000, max_value=REFERENCE_YEAR, value=2023)
    feature_values['year_since_release'] = REFERENCE_YEAR - feature_values['release_year']
    feature_values['sales_factor'] = st.number_input("Product Sales Factor", min_value=0.0, max_value=10.0, value=1.0)

# Any feature the model expects that has no dedicated widget above (all the
# categorical ones, plus anything added to the training data later) gets a
# generic input so the prediction frame is always complete.
remaining_features = [name for name in expected_features if name not in feature_values]
if remaining_features:
    st.subheader("Additional Model Features")
    for feature in remaining_features:
        widget_label = feature.replace('_', ' ').title()
        if feature in category_options:
            feature_values[feature] = st.selectbox(widget_label, category_options[feature])
        else:
            feature_values[feature] = st.number_input(widget_label, value=0.0)

# Create predict button
if st.button("Predict Purchase Likelihood"):
    # Single-row frame restricted to, and ordered like, the training columns.
    input_data = pd.DataFrame([feature_values], columns=expected_features)

    # Make prediction
    prediction = model.predict(input_data)
    prediction_proba = model.predict_proba(input_data)

    # Show results
    st.markdown("### Prediction Results")
    if prediction[0] == 1:
        st.success("This customer is likely to make a purchase! 🎯")
    else:
        st.error("This customer is unlikely to make a purchase 😔")

    # Show probability of the positive class (second predict_proba column)
    st.markdown("### Purchase Probability")
    prob_yes = float(prediction_proba[0][1])

    st.progress(prob_yes)
    st.write(f"Probability of Purchase: {prob_yes:.2%}")

# Show model metrics
st.sidebar.header("Model Performance Metrics")
for metric, value in evaluation.items():
    st.sidebar.metric(
        label=metric.replace('_', ' ').title(),
        value=f"{value:.4f}"
    )

# Add information about the model
st.sidebar.markdown("---")
st.sidebar.markdown("""
### About this Model
This is a Naive Bayes classifier trained to predict customer purchases based on various features.
The model takes into account:
- Customer age
- Product pricing
- Product age
- Sales factors
""")

# Footer
st.markdown("---")
st.markdown("Made with ❤️ using Streamlit")

2025-06-04 14:20:00.503 
  command:

    streamlit run C:\Users\User\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-06-04 14:20:00.521 Session state does not function when running a script without `streamlit run`


DeltaGenerator()