In [None]:
import pandas as pd

# Read the raw supply-chain CSV into `df` and preview its first rows.
# NOTE(review): if the file is missing, only a message is printed and `df` is
# not assigned by this cell, so later cells that use `df` will fail.
try:
    df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin1')
except FileNotFoundError:
    print("Error: CSV file not found. Please ensure the file is in the same folder.")
else:
    # Reached only when the read succeeded.
    print("Dataset loaded successfully!")
    display(df.head())

Dataset loaded successfully!


Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [None]:
# Build the binary target column 'is_delayed':
# 1 when the real shipping time exceeds the scheduled one, 0 otherwise.
actual_days = df['Days for shipping (real)']
planned_days = df['Days for shipment (scheduled)']
df['is_delayed'] = actual_days.gt(planned_days).astype(int)

# Headline figures: row count, number of delayed rows, and their share in percent.
total_orders = df.shape[0]
delayed_orders = df['is_delayed'].sum()
delay_rate = (delayed_orders / total_orders) * 100

# Emit the summary as one block of text (same four lines as before).
summary_lines = [
    "--- Logistics Analysis ---",
    f"Total Orders: {total_orders:,}",
    f"Delayed Orders: {delayed_orders:,}",
    f"Delay Rate: {delay_rate:.2f}%",
]
print("\n".join(summary_lines))


--- Logistics Analysis ---
Total Orders: 180,519
Delayed Orders: 103,400
Delay Rate: 57.28%


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Columns exported to Tableau. 'is_delayed' is the target created in the
# previous cell; this cell fails with a KeyError if that cell has not run.
columns_to_keep = [
    'Type', 'Order Region', 'Order Country', 'Order City',
    'Days for shipping (real)', 'Days for shipment (scheduled)',
    'Shipping Mode', 'Category Name', 'Customer Segment',
    'Sales', 'Order Item Quantity', 'is_delayed'
]

# Categorical predictors fed to the model (a subset of columns_to_keep).
feature_columns = ['Type', 'Order Region', 'Shipping Mode', 'Customer Segment']

# Create a clean dataframe for our dashboard.
# The explicit .copy() makes df_final an independent frame, so adding the
# 'Delay_Probability' column below is never treated by pandas as a write to a
# slice of `df` (which can emit SettingWithCopyWarning when rows are dropped).
df_final = df[columns_to_keep].dropna().copy()

# Simple Machine Learning Model to calculate "Delay Probability".
# One-hot encode the predictors; drop_first=True omits the first level of each
# column, and the resulting column list is what the model is trained on.
X = pd.get_dummies(df_final[feature_columns], drop_first=True)
y = df_final['is_delayed']

# Initialize and train the model (Random Forest); random_state fixes the seed
# so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X, y)

# Add the "AI Insight" (Probability of Delay) to the final dataset.
# Column 1 of predict_proba is taken as the probability of is_delayed == 1.
# NOTE: these probabilities are computed on the same rows the model was fitted
# on (in-sample); they are not a held-out estimate of predictive quality.
df_final['Delay_Probability'] = model.predict_proba(X)[:, 1]

# Save the processed data for Tableau (row index is not written).
df_final.to_csv('logistics_for_tableau.csv', index=False)

print("Success! 'logistics_for_tableau.csv' has been created.")
print("The dataset now includes AI-predicted delay probabilities.")

Success! 'logistics_for_tableau.csv' has been created.
The dataset now includes AI-predicted delay probabilities.


In [4]:
import joblib

# Persist what the downstream app needs: the fitted estimator and the exact
# list of feature columns it was trained on (so inputs can be aligned later).
# NOTE(review): the model file name says "logistic" although the estimator
# trained in this notebook is a RandomForestClassifier; the name is kept
# unchanged because other code may load it by this path.
model_columns = list(X.columns)

artifacts = (
    (model, 'logistic_delay_model.pkl'),
    (model_columns, 'model_columns.pkl'),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("Model and columns saved successfully!")

Model and columns saved successfully!
