In [1]:
#import necessary libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import f_oneway
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

THIS IS AN EXAMPLE AND PUT AT TOP OF FILE
Problem Statement:
Objective:
The objective of this analysis is to develop a predictive model capable of accurately forecasting the delivery status of orders within the supply chain. The "Delivery Status" column will serve as the target variable for the model. Through the implementation of diverse machine learning algorithms, the intention is to discern underlying patterns and trends in the data, thereby gaining insights into the determinants of delayed deliveries and enhancing the overall efficiency of the supply chain.
 




Hypothesis:
Our hypothesis posits that specific features in the dataset, namely "Scheduled Days for shipping", "Shipping Mode", "Order Region", "Payment type", "Order Item quantity", "order date", and "shipping date", exert a substantial influence on the delivery status of orders.
Additionally, we expect that certain customer categories or segments may exhibit higher susceptibility to experiencing late deliveries. Moreover, we anticipate that both the chosen shipping mode and delivery location will have an impact on the order delivery time. Through the utilization of these identified insights, we can construct a model with the capability to accurately forecast delivery status and provide actionable recommendations for optimizing the supply chain process.

In [None]:
# Load the file midterm_data from the Downloads folder - REPLACE THIS WITH PROPER DATASET
# The path is built from the current user's home directory so the notebook
# is not tied to one machine's account name.
DATA_PATH = Path.home() / 'Downloads' / 'midterm_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to get a quick feel for the columns and values
df.head(n=5)

We can see several things by using the head() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- We have a numerical column describing the user ID associated to each person and another numerical column describing their age
- There is another numerical column called sessions, which is the session ID for each user
- We have a numerical time_spent column, which is the number of minutes they spent browsing, a numerical column pages_visited describing the number of pages they visited during their session, and another numerical column cart_items describing the number of items in their cart
- The next numerical column is cart_value, which lists the value of the items in each user's cart
- Checkout status lists a 1 if they did check out and a 0 if they did not
- Finally, there is a categorical column describing the device type and another categorical column describing the location of the user

In [None]:
# Next, use info() to check for structural issues:
# it lists each column's dtype and non-null count, so missing data shows up
# as a non-null count lower than the number of entries.
df.info()

We can see several things using the info() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- There are 5000 entries
- Many of the columns do not have missing entries, with the exception of the device and location columns
- Device has (5000 - 4900) 100 missing entries, or about 2% missing data
- Location has (5000 - 4970) 30 missing entries, or about 0.6% missing data
- These are relatively small amounts of missing data

In [None]:
# Initial descriptive statistics for every column.
# include='all' is needed so categorical columns are summarised alongside numeric ones.
df.describe(include='all')

We can see several things using the describe() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- The mean age of users is 42 (rounded), which is the same as the median age of 42. This suggests a roughly symmetric distribution (consistent with, though not proof of, a normal distribution)
- The mean time spent (25) and median time spent (25) are the same, also suggesting a roughly symmetric distribution
- Mean/median for pages visited and cart items are also about the same
- The mean cart value is $149.44 while the median is $143.44; the mean being slightly higher than the median may indicate a small amount of right skew
- There are 3 unique device types and 5 unique locations
- The most common device type is a desktop and the most common location of users is location 4

In [2]:
# Data cleaning: the following cells handle unwanted columns, missing values, outliers, and duplicates

In [None]:
# If there are unwanted columns, drop them with this code.
# List of columns to drop
columns_to_drop = [
    'Customer Zipcode', 'Customer Password', 'Benefit per order', 'Category Id', 'Category Name', 'Customer City',
    'Customer Country', 'Customer Id', 'Customer Segment', 'Customer State',
    'Department Id', 'Order Zipcode', 'Customer Street',
    'Product Category Id',
    'Customer Fname', 'Customer Lname',
    'Order City',
]

# Report any listed column that is absent so a typo in the list is still noticed
missing_columns = [col for col in columns_to_drop if col not in df.columns]
if missing_columns:
    print(f'Columns not found (skipped): {missing_columns}')

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone, and without it drop() would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Find rows that are missing data in two columns at once (here: device AND location)
both_missing = df['device'].isnull() & df['location'].isnull()
rows_missing_both = df[both_missing]

# Drop exactly the rows that were found, rather than a hard-coded userID,
# so the cell stays correct when the dataset changes.
df = df.drop(rows_missing_both.index)

# Last expression of the cell, so the notebook displays the rows that were removed
rows_missing_both

In [None]:
# Re-run the descriptive statistics to confirm that mean, median and mode
# have not shifted too much after the rows above were removed
df.describe(include='all')

In [None]:
# Count the null values in every column
df.isna().sum()

In [None]:
# If there are not too many null values, drop every row that contains one
df = df.dropna()

In [None]:
# Check again that no null values remain
df.isna().sum()

In [None]:
# Alternative to dropping: replace missing values in a column with 'Other'.
# NOTE: if the dropna() cell has already run, no nulls are left in 'device'
# and this has no effect - use one approach or the other.
device_filled = df['device'].fillna('Other')
df = df.assign(device=device_filled)

In [None]:
# Boxplot of a single column to check it for outliers
sns.boxplot(y='time_spent', data=df)

In [None]:
# Drop the outliers if there do not appear to be many:
# collect the index labels of rows above the cutoff, then remove those rows
outlier_rows = df.index[df['time_spent'] > 80]
df = df.drop(index=outlier_rows)

In [None]:
# Lastly, remove fully duplicated rows if needed
df = df.drop_duplicates()

In [3]:
# Make sure not too many observations have been dropped during cleaning:
# compare the entry count reported here with the original one; the loss
# should stay below 10% of observations.
df.info()

In [None]:
# Visualizations and Descriptive Statistics
# Pairwise scatter plots give a first look at how the columns relate to each other;
# other visualizations can be added here as needed.
pair_grid = sns.pairplot(data=df)
plt.show()

In [None]:
# Feature Engineering
# Feature Transformation and Scaling
# Numerical columns are standardised and categorical columns are one-hot encoded.
# Replace the placeholder names below with the real column names from df.
num_features = ['feature1', 'feature2', 'feature3']
cat_features = ['categorical_feature']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        # handle_unknown='ignore': a category that only appears in the test set
        # is encoded as all zeros instead of raising an error at predict time
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    # sparse_threshold=0 always returns a dense array; with many categories the
    # default can return a sparse matrix, which GaussianNB cannot be fit on
    sparse_threshold=0,
)

In [None]:
# Model Development
# Separate the predictors (every column except the target) from the target itself.
# X stays a DataFrame so the ColumnTransformer can select features by column name.
X = df.drop(columns=['target'])
y = df['target']

# Split the data into train and test sets (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize models as (display name, estimator) pairs.
# The tree-based models are randomised, so they get the same fixed seed as the
# train/test split; otherwise their scores change on every Restart & Run All.
models = [
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42))
]

In [None]:
# Fit every candidate model inside the shared preprocessing pipeline and
# record its test-set metrics as one tuple per model:
# (name, accuracy, precision, recall, f1)
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
results = []
for name, model in models:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    scores = [metric(y_test, y_pred) for metric in metric_functions]
    results.append((name, *scores))

In [None]:
# Collect the per-model metrics into a table and show it
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
evaluation_df = pd.DataFrame(data=results, columns=metric_columns)
print(evaluation_df)

In [None]:
# Model Evaluation
# Plot one ROC curve per model, with its AUC in the legend.
for name, model in models:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)

    # Not every classifier exposes predict_proba (SVC does not unless it is
    # created with probability=True), so fall back to decision_function;
    # roc_curve only needs a score that ranks the positive class higher.
    if hasattr(pipeline, 'predict_proba'):
        y_score = pipeline.predict_proba(X_test)[:, 1]  # Probability of positive class
    else:
        y_score = pipeline.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'{name} (AUC = {roc_auc:.2f})')
    # Diagonal reference line: the ROC of a classifier that guesses at random
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# Additional Analysis (if needed)
# You can perform further statistical tests or analysis using f_oneway or other methods
# Example:
# groups = df['group_column'].unique()
# anova_results = {}
# for group in groups:
#     data = df[df['group_column'] == group]['feature_to_compare']
#     anova_results[group] = data
# f_statistic, p_value = f_oneway(*anova_results.values())
# print(f'F-statistic: {f_statistic}, p-value: {p_value}')

# Select the best model based on evaluation metrics
# You can use the evaluation results to decide on the best model for your problem

# Final Thoughts
# Interpret the results and draw conclusions about the problem statement and hypothesis.