In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import f_oneway

THIS IS AN EXAMPLE AND PUT AT TOP OF FILE
Problem Statement:
Objective:
The objective of this analysis is to develop a predictive model capable of accurately forecasting the delivery status of orders within the supply chain. The "Delivery Status" column will serve as the target variable for the model. Through the implementation of diverse machine learning algorithms, the intention is to discern underlying patterns and trends in the data, thereby gaining insights into the determinants of delayed deliveries and enhancing the overall efficiency of the supply chain.
 




Hypothesis:
Our hypothesis posits that specific features in the dataset, namely "Scheduled Days for shipping", "Shipping Mode", "Order Region", "Payment type", "Order Item quantity", "order date" and "shipping date", exert a substantial influence on the delivery status of orders.
Additionally, we expect that certain customer categories or segments may exhibit higher susceptibility to experiencing late deliveries. Moreover, we anticipate that both the chosen shipping mode and delivery location will have an impact on the order delivery time. Through the utilization of these identified insights, we can construct a model with the capability to accurately forecast delivery status and provide actionable recommendations for optimizing the supply chain process.

In [None]:
# Load midterm_data.csv from the current user's Downloads folder.
# REPLACE THIS WITH PROPER DATASET: point DATA_PATH at the real file.
from pathlib import Path

# Path.home() resolves to the profile directory of whoever runs the notebook,
# so the cell no longer depends on one specific machine's user name.
DATA_PATH = Path.home() / 'Downloads' / 'midterm_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Use head() to preview the first rows and get a quick feel for the columns
# and the kind of values they hold.
df.head()

We can see several things by using the head() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- We have a numerical column describing the user ID associated to each person and another numerical column describing their age
- There is another numerical column called sessions, which is the session ID for each user
- We have a numerical time_spent column, which is the number of minutes they spent browsing, a numerical column pages_visited describing the number of pages they visited during their session, and another numerical column cart_items describing the number of items in their cart
- The next numerical column is cart_value, which lists the value of the items in each user's cart
- Checkout status lists a 1 if they did check out and a 0 if they did not
- Finally, there is a categorical column describing the device type and another categorical column describing the location of the user

In [None]:
# Next use info() to list every column with its dtype and non-null count,
# which exposes structural issues such as missing values or unexpected types.
df.info()

We can see several things using the info() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- There are 5000 entries
- Many of the columns do not have missing entries, with the exception of the device and location columns
- Device has (5000 - 4900) 100 missing entries, or about 2% missing data
- Location has (5000 - 4970) 30 missing entries, or about 0.6% missing data
- These are relatively small amounts of missing data

In [None]:
# Next, use describe() for some initial descriptive statistics.
# include='all' is passed because the data also contains categorical columns,
# which would otherwise be left out of the summary.
df.describe(include = 'all')

We can see several things using the describe() function: REPLACE THIS WITH RELEVANT OBSERVATIONS
- The mean age of users is 42 (rounded), which is the same as the median age of 42. This suggests a roughly symmetric distribution (consistent with, but not proof of, a normal distribution)
- The mean time spent (25) and median time spent (25) are the same, which also suggests a roughly symmetric distribution
- Mean/median for pages visited and cart items are also about the same
- The mean cart value is $149.44 while the median is $143.44; the mean being slightly above the median may indicate a small amount of right skew
- There are 3 unique device types and 5 unique locations
- The most common device type is a desktop and the most common location of users is location 4

In [2]:
#MOVE ON TO DATA CLEANING

In [None]:
# Remove columns that are not wanted for the analysis.
# One name per line so individual entries are easy to add or remove.
columns_to_drop = [
    'Customer Zipcode',
    'Customer Password',
    'Benefit per order',
    'Category Id',
    'Category Name',
    'Customer City',
    'Customer Country',
    'Customer Id',
    'Customer Segment',
    'Customer State',
    'Department Id',
    'Order Zipcode',
    'Customer Street',
    'Product Category Id',
    'Customer Fname',
    'Customer Lname',
    'Order City',
]

# drop() returns a new frame without the listed columns; rebind df to it.
df = df.drop(columns=columns_to_drop)

In [None]:
# Check whether any rows are missing data in two or more columns at once.
# The mask is True where BOTH 'device' and 'location' are null.
both_missing = df['device'].isnull() & df['location'].isnull()

# Jupyter only renders the LAST expression of a cell, so the affected rows are
# shown explicitly with display() (a builtin inside IPython/Jupyter kernels).
display(df[both_missing])

# Drop exactly the rows found above rather than a hard-coded userID, so the
# cell stays correct when the data changes; it is a no-op when nothing matched.
df = df[~both_missing]

In [None]:
# Re-run the descriptive statistics after dropping rows to confirm that the
# mean, median and mode have not shifted too much.
df.describe(include = 'all')

In [None]:
# Count the null values in every column (one total per column).
df.isnull().sum()

In [None]:
# If only a few rows contain nulls, drop every row that has at least one.
# Rebinding df (instead of inplace=True) follows the pandas recommendation and
# keeps the operation explicit about which frame the later cells work on.
df = df.dropna()

In [None]:
# Count nulls per column again to confirm that none are left.
df.isnull().sum()

In [None]:
# Fill missing values in the 'device' column with the placeholder 'Other'.
# NOTE(review): once the dropna() cell has been run no nulls remain, which makes
# this a no-op — presumably the two cells are alternatives; confirm and keep one.
df['device'] = df['device'].fillna('Other')

In [None]:
# Draw a boxplot of 'time_spent' to check the column visually for outliers.
sns.boxplot(data = df, y='time_spent')

In [None]:
# Drop the outliers when the boxplot shows that there are only a few of them.
# Upper bound above which a 'time_spent' value is treated as an outlier.
TIME_SPENT_MAX = 80

# Keep every row that is NOT above the bound. Rows whose time_spent is NaN
# compare as False and are therefore kept, exactly as with the previous drop().
df = df[~(df['time_spent'] > TIME_SPENT_MAX)]

In [None]:
# Lastly, drop fully duplicated rows if needed (the first occurrence is kept).
# Rebinding df avoids inplace=True, which pandas discourages.
df = df.drop_duplicates()

In [3]:
# Check the remaining row count: cleaning should have removed fewer than 10%
# of the original observations.
df.info()

In [None]:
# Visualizations and descriptive statistics.
# A pair plot is one of several views that help to understand the data: it
# draws a scatter plot for every pair of columns in the frame.
pair_grid = sns.pairplot(data=df)
plt.show()

In [None]:
# Feature engineering: feature transformation and scaling.
# Placeholder column names — replace with the numerical and categorical
# features of the actual dataset.
num_features = ['feature1', 'feature2', 'feature3']
cat_features = ['categorical_feature']

# Numerical columns are standardised, categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, which otherwise breaks predict() whenever the
# test split contains a value missing from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

In [None]:
# Model development: split the data into train and test sets.
# 'target' is a placeholder for the label column. X holds every other column
# (the features) and y holds the label; previously X was assigned the label
# itself and y was never defined, so the split raised a NameError.
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers, each paired with the label used when reporting results.
# Every estimator is created with its default settings.
models = list(zip(
    [
        'Naive Bayes',
        'K-Nearest Neighbors',
        'Logistic Regression',
        'Support Vector Machine',
        'Decision Tree',
        'Random Forest',
    ],
    [
        GaussianNB(),
        KNeighborsClassifier(),
        LogisticRegression(),
        SVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
    ],
))

In [None]:
# Fit every candidate behind the shared preprocessor and collect its test-set
# metrics as one (name, accuracy, precision, recall, f1) tuple per model.
results = []
for name, model in models:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    # fit() returns the pipeline itself, so training and prediction chain.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Apply the four scorers in reporting order to the same predictions.
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    results.append((name, accuracy, precision, recall, f1))

In [None]:
# Turn the collected result tuples into a table (one row per model) and print it.
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
evaluation_df = pd.DataFrame.from_records(results, columns=metric_columns)
print(evaluation_df)

In [None]:
# Model evaluation: plot one ROC curve (with its AUC) per candidate model.
for name, model in models:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)

    # roc_curve needs a continuous score for the positive class. Most models
    # expose predict_proba, but SVC() created without probability=True does
    # not; its signed distance from decision_function ranks samples equally well.
    if hasattr(pipeline, 'predict_proba'):
        y_score = pipeline.predict_proba(X_test)[:, 1]  # Probability of positive class
    else:
        y_score = pipeline.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'{name} (AUC = {roc_auc:.2f})')
    # Diagonal reference line: the performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# Additional Analysis (if needed)
# You can perform further statistical tests or analysis using f_oneway or other methods
# Example:
# groups = df['group_column'].unique()
# anova_results = {}
# for group in groups:
#     data = df[df['group_column'] == group]['feature_to_compare']
#     anova_results[group] = data
# f_statistic, p_value = f_oneway(*anova_results.values())
# print(f'F-statistic: {f_statistic}, p-value: {p_value}')

# Select the best model based on evaluation metrics
# You can use the evaluation results to decide on the best model for your problem

# Final Thoughts
# Interpret the results and draw conclusions about the problem statement and hypothesis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 1. Import data
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('your_dataset.csv')

# 2. Dataframe checks
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()
print(df.describe())

# 3. Data cleaning
# Columns with more than 10% missing values are dropped; the rest are imputed.
missing_percentage = df.isnull().mean() * 100
cols_to_drop = missing_percentage[missing_percentage > 10].index
cols_to_fill = missing_percentage[missing_percentage <= 10].index

df_cleaned = df.drop(cols_to_drop, axis=1)

# The median only exists for numeric columns, so those are filled with it ...
numeric_cols = df_cleaned[cols_to_fill].select_dtypes(include='number').columns
if len(numeric_cols) > 0:
    df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(df_cleaned[numeric_cols].median())

# ... while every other column (text, categories, dates) gets its most frequent value.
for col in cols_to_fill.difference(numeric_cols):
    most_common = df_cleaned[col].mode()
    if not most_common.empty:
        df_cleaned[col] = df_cleaned[col].fillna(most_common.iloc[0])

# 4. Data exploration
# ... (Exploratory analysis and visualization for each variable)

# 5. Feature engineering
# ... (Feature engineering, binning, dummy variables, etc.)
# NOTE: the models below need numeric input, so categorical columns must be
# encoded in this step before continuing.

# 6. Data model
# Assuming 'X' contains your feature columns and 'y' contains your target variable
X = df_cleaned.drop('target_column_name', axis=1)
y = df_cleaned['target_column_name']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Cross-validation and model selection: keep the model with the best mean F1
results = {}
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1')
    results[model_name] = cv_scores.mean()

best_model = max(results, key=results.get)
selected_model = models[best_model]

# Hyperparameter tuning using RandomizedSearchCV
# One search space per model. The keys are plain parameter names of the
# estimator: a '<step>__' prefix is only valid when tuning a Pipeline, and
# parameters of a different model are rejected by the search.
param_grids = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10]},
    'Naive Bayes': {'var_smoothing': [1e-9, 1e-8, 1e-7]},
    'KNN': {'n_neighbors': [3, 5, 7, 9]},
    'SVM': {'C': [0.1, 1, 10]},  # Example: tuning C parameter for SVM
    'Decision Tree': {'max_depth': [None, 5, 10]},
    'Random Forest': {'n_estimators': [50, 100, 150]},  # Example: tuning n_estimators for Random Forest
    # Add more hyperparameters as needed for each model
}
param_dist = param_grids[best_model]

# Never ask for more candidates than the search space actually contains.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

random_search = RandomizedSearchCV(selected_model, param_distributions=param_dist,
                                   n_iter=min(100, n_candidates), cv=5, scoring='f1', random_state=42)
random_search.fit(X_train, y_train)
best_tuned_model = random_search.best_estimator_

# Model evaluation
y_pred = best_tuned_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC AUC measures how well the model RANKS samples, so it is computed from
# continuous scores rather than from the hard 0/1 predictions. SVC() without
# probability=True has no predict_proba; its decision_function is used instead.
if hasattr(best_tuned_model, 'predict_proba'):
    y_score = best_tuned_model.predict_proba(X_test)[:, 1]
else:
    y_score = best_tuned_model.decision_function(X_test)
roc_auc = roc_auc_score(y_test, y_score)

# 7. Explaining your results
# ... (Explanation of your process, choices, and results)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Exploratory Analysis
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()
print(df.describe())

# Explore conversion rate by demographic and behavioral variables
conversion_by_gender = df.groupby('gender')['converted_Fri'].mean()
conversion_by_channel = df.groupby('marketing_channel')['converted_Fri'].mean()
# ... more exploratory analysis ...

# Data Preparation
# Drop irrelevant columns and handle missing values
# ...

# Feature Engineering
# Calculate average ratings and prices for the week
df['avg_rating_week'] = (df['avg_rating_Mon'] + df['avg_rating_Tue'] +
                         df['avg_rating_Wed'] + df['avg_rating_Thu'] + df['avg_rating_Fri']) / 5
df['avg_price_week'] = (df['avg_price_Mon'] + df['avg_price_Tue'] +
                        df['avg_price_Wed'] + df['avg_price_Thu'] + df['avg_price_Fri']) / 5

# Model Building
# Prepare data for modeling: everything except the id and the label is a candidate feature
X = df.drop(['visitor_id', 'converted_Fri'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for different types of columns.
# Columns of X that appear in neither list are dropped by the ColumnTransformer.
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'marketing_channel', 'gender']

numeric_transformer = StandardScaler()
# handle_unknown='ignore' encodes a category missing from the training split
# as all zeros instead of raising during predict() on the test split.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build a Random Forest model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Explanation to Business Executive
# ...

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


In [None]:
# Weekly product views: no other cell creates this column, so build it from
# the daily columns unless the dataset already provides it.
if 'product_views_week' not in df.columns:
    df['product_views_week'] = (df['product_views_Mon'] + df['product_views_Tue'] +
                                df['product_views_Wed'] + df['product_views_Thu'] + df['product_views_Fri'])

# Product views per rating point. A weekly rating of 0 is mapped to NaN first,
# so the division yields NaN (missing) instead of inf, which would break scaling.
df['views_to_ratings_ratio'] = df['product_views_week'] / df['avg_rating_week'].replace(0, np.nan)


In [None]:
# Total time on site across the five weekdays. The sum is wrapped in
# parentheses so the expression can continue on the next line; a trailing '+'
# without them is a SyntaxError.
df['total_time_spent_week'] = (df['time_on_site_Mon'] + df['time_on_site_Tue'] +
                               df['time_on_site_Wed'] + df['time_on_site_Thu'] + df['time_on_site_Fri'])


In [None]:
# Change between the first and last day of the week (Friday minus Monday)
# for product views and for the average rating.
df['product_views_change'] = df['product_views_Fri'].sub(df['product_views_Mon'])
df['avg_rating_change'] = df['avg_rating_Fri'].sub(df['avg_rating_Mon'])


In [None]:
# Average number of pages visited per day: add the five daily columns one
# after another (Monday first), then divide by the number of days.
pages_total = df['pages_visited_Mon']
for weekday in ('Tue', 'Wed', 'Thu', 'Fri'):
    pages_total = pages_total + df['pages_visited_' + weekday]
df['avg_pages_visited_per_day'] = pages_total / 5


In [None]:
# 1/0 indicator columns: 1 when the visitor's marketing channel equals the
# given name, 0 otherwise (missing channels compare unequal and become 0).
df['interacted_with_facebook'] = (df['marketing_channel'] == 'Facebook').astype('int64')
df['interacted_with_google'] = (df['marketing_channel'] == 'Google').astype('int64')
# Repeat for other marketing channels


In [None]:
# Weekly product views: no other cell creates this column, so build it from
# the daily columns unless the dataset already provides it.
if 'product_views_week' not in df.columns:
    df['product_views_week'] = (df['product_views_Mon'] + df['product_views_Tue'] +
                                df['product_views_Wed'] + df['product_views_Thu'] + df['product_views_Fri'])

# Interaction feature: weekly product views for visitors whose gender is 'M',
# and 0 for everyone else (the 1/0 gender flag multiplies the view count).
df['gender_views_ratio'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0) * df['product_views_week']
# Repeat for other demographic attributes


In [None]:
# Using pandas get_dummies() function to one-hot encode 'visitor_location'.
# NOTE: df_dummies is rebuilt from df here, so the get_dummies cells do not
# accumulate — each one overwrites df_dummies with a single encoded column set.
df_dummies = pd.get_dummies(df, columns=['visitor_location'], drop_first=True)

# Resulting columns carry the source column name as a prefix, e.g.
# 'visitor_location_USA', 'visitor_location_Canada', 'visitor_location_UK'
# (example values — TODO confirm against the data); the first category is
# omitted because of drop_first=True.


In [None]:
# Using pandas get_dummies() function to one-hot encode 'marketing_channel'.
# NOTE: df_dummies is rebuilt from df here, so any encoding stored in
# df_dummies by another get_dummies cell is overwritten.
df_dummies = pd.get_dummies(df, columns=['marketing_channel'], drop_first=True)

# Resulting columns: 'marketing_channel_Facebook', 'marketing_channel_Google', ...
# (one per category except the first, which drop_first=True omits)


In [None]:
# Using pandas get_dummies() function to one-hot encode 'gender'.
# NOTE: df_dummies is rebuilt from df here, so any encoding stored in
# df_dummies by another get_dummies cell is overwritten.
df_dummies = pd.get_dummies(df, columns=['gender'], drop_first=True)

# Resulting columns: 'gender_M'
# (assumes the categories are 'F' and 'M', with 'F' dropped as the first — TODO confirm)


In [None]:
# using drop_first=True will avoid multicollinearity issues by dropping the first category, which serves as the reference category.

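# Note: pd.get_dummies(df, columns=[...]) already replaces the listed categorical columns with their dummy columns in the returned DataFrame, so the originals only need to be dropped manually when the dummies are built separately and joined back. Keeping both would be redundant and can hurt model performance. Also, make sure to incorporate these new features into your data preparation and model building process.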

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# The freshly loaded frame does not contain the engineered weekly total, so it
# is derived from the daily columns here (unless the file already provides it).
if 'total_time_spent_week' not in df.columns:
    df['total_time_spent_week'] = (df['time_on_site_Mon'] + df['time_on_site_Tue'] +
                                   df['time_on_site_Wed'] + df['time_on_site_Thu'] + df['time_on_site_Fri'])

# Weekly time on site for each outcome group. Missing values are removed once
# here because both the box plot and the t-test produce NaN results otherwise.
time_converted = df.loc[df['converted_Fri'] == 1, 'total_time_spent_week'].dropna()
time_not_converted = df.loc[df['converted_Fri'] == 0, 'total_time_spent_week'].dropna()

# Calculate average time spent for users who converted and didn't convert
avg_time_spent_converted = time_converted.mean()
avg_time_spent_not_converted = time_not_converted.mean()

# Visualize distributions using box plots
plt.figure(figsize=(8, 6))
plt.boxplot([time_converted, time_not_converted])
# The boxes sit at x positions 1 and 2; labelling them through xticks avoids
# the boxplot 'labels' keyword, which newer Matplotlib releases deprecate.
plt.xticks([1, 2], ['Converted', 'Not Converted'])
plt.title('Distribution of Total Time Spent by Conversion Status')
plt.ylabel('Total Time Spent')
plt.show()

# Perform a t-test to test the hypothesis
t_stat, p_value = ttest_ind(time_converted, time_not_converted)

print("T-Statistic:", t_stat)
print("P-Value:", p_value)

if p_value < 0.05:
    print("The difference in time spent is statistically significant.")
else:
    print("There is no statistically significant difference in time spent.")

#based on hypothesis
#Users who spend more time on the website throughout the week are more likely to convert on Friday.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# The freshly loaded frame does not contain the engineered weekly columns used
# below, so they are derived from the daily columns (unless already present).
DAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
if 'total_time_spent_week' not in df.columns:
    df['total_time_spent_week'] = df[[f'time_on_site_{day}' for day in DAYS]].sum(axis=1, skipna=False)
for metric in ('avg_rating', 'avg_price'):
    if f'{metric}_week' not in df.columns:
        df[f'{metric}_week'] = df[[f'{metric}_{day}' for day in DAYS]].mean(axis=1, skipna=False)

# Weekly time on site for each outcome group. Missing values are removed once
# here because both the box plot and the t-test produce NaN results otherwise.
time_converted = df.loc[df['converted_Fri'] == 1, 'total_time_spent_week'].dropna()
time_not_converted = df.loc[df['converted_Fri'] == 0, 'total_time_spent_week'].dropna()

# Calculate average time spent for users who converted and didn't convert
avg_time_spent_converted = time_converted.mean()
avg_time_spent_not_converted = time_not_converted.mean()

# Visualize distributions using box plots
plt.figure(figsize=(8, 6))
plt.boxplot([time_converted, time_not_converted])
# The boxes sit at x positions 1 and 2; labelling them through xticks avoids
# the boxplot 'labels' keyword, which newer Matplotlib releases deprecate.
plt.xticks([1, 2], ['Converted', 'Not Converted'])
plt.title('Distribution of Total Time Spent by Conversion Status')
plt.ylabel('Total Time Spent')
plt.show()

# Perform a t-test to test the hypothesis
t_stat, p_value = ttest_ind(time_converted, time_not_converted)

print("T-Statistic:", t_stat)
print("P-Value:", p_value)

if p_value < 0.05:
    print("The difference in time spent is statistically significant.")
else:
    print("There is no statistically significant difference in time spent.")

# Model Building
X = df.drop(['visitor_id', 'converted_Fri'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps.
# Columns of X that appear in neither list are dropped by the ColumnTransformer.
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'marketing_channel', 'gender']

numeric_transformer = StandardScaler()

# Defined here so the cell runs on a fresh kernel instead of relying on a
# variable left behind by another cell. handle_unknown='ignore' encodes a
# category missing from the training split as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build a Random Forest model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy:", accuracy)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# Mean of the 0/1 conversion flag per group = conversion rate of that group
conversion_by_gender = df.groupby('gender')['converted_Fri'].mean()
conversion_by_channel = df.groupby('marketing_channel')['converted_Fri'].mean()

# Bar chart: conversion rate per gender
fig_gender, ax_gender = plt.subplots(figsize=(8, 6))
sns.barplot(x=conversion_by_gender.index, y=conversion_by_gender.values, ax=ax_gender)
ax_gender.set_title('Conversion Rate by Gender')
ax_gender.set_xlabel('Gender')
ax_gender.set_ylabel('Conversion Rate')
plt.show()

# Bar chart: conversion rate per marketing channel
fig_channel, ax_channel = plt.subplots(figsize=(10, 6))
sns.barplot(x=conversion_by_channel.index, y=conversion_by_channel.values, ax=ax_channel)
ax_channel.set_title('Conversion Rate by Marketing Channel')
ax_channel.set_xlabel('Marketing Channel')
ax_channel.set_ylabel('Conversion Rate')
# Tilt the channel names so long labels do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# The freshly loaded frame does not contain the engineered weekly averages
# used by the model below, so they are derived here (unless already present).
DAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
for metric in ('avg_rating', 'avg_price'):
    if f'{metric}_week' not in df.columns:
        df[f'{metric}_week'] = df[[f'{metric}_{day}' for day in DAYS]].mean(axis=1, skipna=False)

# Explore conversion rate by gender
conversion_by_gender = df.groupby('gender')['converted_Fri'].mean()

# Visualize conversion rate by gender
plt.figure(figsize=(8, 6))
sns.barplot(x=conversion_by_gender.index, y=conversion_by_gender.values)
plt.title('Conversion Rate by Gender')
plt.xlabel('Gender')
plt.ylabel('Conversion Rate')
plt.show()

# Perform a t-test to test the hypothesis
converted_females = df[df['gender'] == 'F']['converted_Fri']
converted_males = df[df['gender'] == 'M']['converted_Fri']

t_stat, p_value = ttest_ind(converted_females, converted_males)

print("T-Statistic:", t_stat)
print("P-Value:", p_value)

# A small p-value only says the two rates differ; the SIGN of the statistic
# says which group is higher (females are the first sample, so t_stat > 0
# means the female conversion rate is the larger one).
if p_value < 0.05:
    if t_stat > 0:
        print("Females are more likely to convert than males.")
    else:
        print("Males are more likely to convert than females.")
else:
    print("There is no statistically significant difference in conversion rates between genders.")

# Model Building
X = df.drop(['visitor_id', 'converted_Fri'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps.
# Columns of X that appear in neither list (including 'gender' in this cell)
# are dropped by the ColumnTransformer.
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'marketing_channel']

numeric_transformer = StandardScaler()

# Defined here so the cell runs on a fresh kernel instead of relying on a
# variable left behind by another cell. handle_unknown='ignore' encodes a
# category missing from the training split as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build a Random Forest model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy:", accuracy)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# The freshly loaded frame does not contain the engineered weekly averages
# listed in numeric_features, so they are derived here (unless already present).
DAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
for metric in ('avg_rating', 'avg_price'):
    if f'{metric}_week' not in df.columns:
        df[f'{metric}_week'] = df[[f'{metric}_{day}' for day in DAYS]].mean(axis=1, skipna=False)

# Model Building
X = df.drop(['visitor_id', 'converted_Fri'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps.
# Columns of X that appear in neither list are dropped by the ColumnTransformer.
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'marketing_channel', 'gender']

numeric_transformer = StandardScaler()
# drop='first' leaves out one reference category per feature.
categorical_transformer = OneHotEncoder(drop='first')

# sparse_threshold=0 forces a dense output matrix: the one-hot block is sparse,
# and GaussianNB cannot be fitted on sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', model)])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    # Predict on test data
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    print("=" * 50)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Exploration
# ... (Exploring conversion rates by marketing channels, demographics, etc.)

# Prepare Data for Modeling
# ... (Data cleaning, feature engineering, creating dummy variables, etc.)

# The freshly loaded frame does not contain the engineered weekly averages
# listed in numeric_features, so they are derived here (unless already present).
DAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
for metric in ('avg_rating', 'avg_price'):
    if f'{metric}_week' not in df.columns:
        df[f'{metric}_week'] = df[[f'{metric}_{day}' for day in DAYS]].mean(axis=1, skipna=False)

# Focus on marketing channels Facebook, Instagram, Google, and YouTube:
# 1 when the visitor's channel is one of them, 0 otherwise.
channels_of_interest = ['Facebook', 'Instagram', 'Google', 'YouTube']
df['exposed_to_channels'] = df['marketing_channel'].apply(lambda x: 1 if x in channels_of_interest else 0)

# Model Building
X = df.drop(['visitor_id', 'converted_Fri', 'marketing_channel'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'gender']
# Already-binary columns that are handed to the model unchanged.
binary_features = ['exposed_to_channels']

numeric_transformer = StandardScaler()
# drop='first' leaves out one reference category per feature.
categorical_transformer = OneHotEncoder(drop='first')

# A ColumnTransformer drops every column that is not listed, so the exposure
# flag must be passed through explicitly — otherwise the feature this analysis
# is about never reaches the models. sparse_threshold=0 forces dense output,
# which GaussianNB requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features)
    ],
    sparse_threshold=0)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', model)])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    # Predict on test data
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    print("=" * 50)

# Explanation of Results
# ... (Explain findings from data exploration, model evaluation, and how the hypothesis is supported or not)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Import the dataset
df = pd.read_csv('your_dataset.csv')

# Data Preparation
# ... (Drop irrelevant columns, handle missing values, etc.)

# Feature Engineering
# The freshly loaded frame does not contain the engineered weekly averages
# listed in numeric_features, so they are derived here (unless already present).
DAYS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
for metric in ('avg_rating', 'avg_price'):
    if f'{metric}_week' not in df.columns:
        df[f'{metric}_week'] = df[[f'{metric}_{day}' for day in DAYS]].mean(axis=1, skipna=False)

# 1 when the visitor's channel is one of the channels of interest, 0 otherwise.
channels_of_interest = ['Facebook', 'Instagram', 'Google', 'YouTube']
df['exposed_to_channels'] = df['marketing_channel'].apply(lambda x: 1 if x in channels_of_interest else 0)

# Data Exploration
# Conversion Rate by Exposed Channels
conversion_by_channels = df.groupby('exposed_to_channels')['converted_Fri'].mean()

# Create a bar plot for conversion rate by exposed channels
plt.figure(figsize=(8, 6))
sns.barplot(x=conversion_by_channels.index, y=conversion_by_channels.values)
plt.title('Conversion Rate by Exposed Channels')
plt.xlabel('Exposed to Channels')
plt.ylabel('Conversion Rate')
plt.xticks([0, 1], ['Not Exposed', 'Exposed'])
plt.show()

# Prepare Data for Modeling
# ... (Data cleaning, feature engineering, creating dummy variables, etc.)

# Model Building
X = df.drop(['visitor_id', 'converted_Fri', 'marketing_channel'], axis=1)
y = df['converted_Fri']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = ['product_views_Mon', 'product_views_Tue', 'product_views_Wed', 'product_views_Thu',
                    'product_views_Fri', 'time_on_site_Mon', 'time_on_site_Tue', 'time_on_site_Wed',
                    'time_on_site_Thu', 'time_on_site_Fri', 'pages_visited_Mon', 'pages_visited_Tue',
                    'pages_visited_Wed', 'pages_visited_Thu', 'pages_visited_Fri', 'avg_rating_week', 'avg_price_week']
categorical_features = ['visitor_location', 'gender']
# Already-binary columns that are handed to the model unchanged.
binary_features = ['exposed_to_channels']

numeric_transformer = StandardScaler()
# drop='first' leaves out one reference category per feature.
categorical_transformer = OneHotEncoder(drop='first')

# A ColumnTransformer drops every column that is not listed, so the exposure
# flag must be passed through explicitly — otherwise the feature this analysis
# is about never reaches the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features)
    ])

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', model)])

    # Train the model
    model_pipeline.fit(X_train, y_train)

    # Predict on test data
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    print("=" * 50)

# Explanation of Results
# ... (Explain findings from data exploration, model evaluation, and how the hypothesis is supported or not)
