In [21]:
import pandas as pd
import statsmodels.api as sm

try:
    df = pd.read_csv('/content/StumptownWholesale.csv')
except FileNotFoundError:
    print("Error: StumptownWholesale.csv not found in /content/. Please upload your data file.")
    # Exit or handle the error appropriately
    df = None # Assign None to df if file not found

dependent_variable = 'buyer'

# Check if df is loaded successfully before proceeding
if df is not None:
  # Get all other variables except 'customer id' and the dependent variable
  independent_variables = [col for col in df.columns if col not in ['customer id', dependent_variable]]

  # Add a constant to the independent variables
  X = sm.add_constant(df[independent_variables])
  y = df[dependent_variable]

  model = sm.Logit(y, X)
  result = model.fit()


  print(result.summary())
else:
  print("Data not loaded. Please upload the file and try again.")

Optimization terminated successfully.
         Current function value: 0.246530
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  buyer   No. Observations:                15000
Model:                          Logit   Df Residuals:                    14987
Method:                           MLE   Df Model:                           12
Date:                Fri, 31 Oct 2025   Pseudo R-squ.:                  0.1672
Time:                        19:09:59   Log-Likelihood:                -3697.9
converged:                       True   LL-Null:                       -4440.2
Covariance Type:            nonrobust   LLR p-value:                8.584e-311
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                                 -2.6725      0.178    

 Q1: Being a cafe is strongly associated with a higher likelihood of being a buyer. This makes sense as cafes would likely be wholesale customers. Higher total spending is associated with a higher likelihood of being a buyer. This makes sense – customers who spend more are more likely to be wholesale buyers.

 When its comes to the types of coffee:hair_bender_cases and guatemala_el_injerto_bourbon_cases are spositive which mean that an increase in hair bender cases is associated with a higher likelihood of being a buyer. This seems plausible if Hair Bender and guatemala el injerto bourbon is a popular wholesale product.

 The other blends, for example, cold_brew_cases and colombia_el_jordan_cases however, show the opposite effect.This could mean that these two arent a popular wholesale product.

In [22]:
# Calculate predicted probabilities
df['predicted_prob'] = result.predict(X)

# Create deciles based on predicted probabilities
# We want decile 1 to have the highest probabilities, so we'll sort in descending order
df['logit_decile'] = pd.qcut(df['predicted_prob'], q=10, labels=False, duplicates='drop')

# Reverse the decile order so that decile 1 has the highest probabilities
# The labels are 0-9, so we map 0 to 10, 1 to 9, ..., 9 to 1
df['logit_decile'] = 10 - df['logit_decile']

# Report the response rate (proportion of buyers) in each decile
response_rate_by_decile = df.groupby('logit_decile')['buyer'].mean()

print("Response Rate by Decile:")
print(response_rate_by_decile)

Response Rate by Decile:
logit_decile
1     0.350000
2     0.141333
3     0.094667
4     0.089333
5     0.057333
6     0.040667
7     0.036667
8     0.030000
9     0.020000
10    0.012000
Name: buyer, dtype: float64


Q4: The breakeven response rate

In [30]:
sample_cost = 20
# Calculate the profit for a successful reorder
profit = 440 - 30 - 140
print(f"profit: ${profit}")
# Calculate the breakeven response rate
breakeven_res_rate = sample_cost/profit
print(f"breakeven response rate: {breakeven_res_rate * 100 :.2f}%")

profit: $270
breakeven response rate: 7.41%


Q5:

In [24]:
# Calculate the number of customers to be targeted based on breakeven response rate
# Find those with the predicted probability of buying above the break-even response rate among 15000 customers
targeted_df = df.query('predicted_prob >= @breakeven_res_rate')
targeted_rate = len(targeted_df)/len(df)
print(f"Targeted rate: {targeted_rate * 100 :.2f}%")
targeted_df

Targeted rate: 37.05%


Unnamed: 0,customer_id,is_cafe,facility_sqft,first,last,spend_total,cold_brew_cases,hair_bender_cases,holler_mountain_cases,colombia_el_jordan_cases,guatemala_el_injerto_bourbon_cases,seasonal_single_origin_cases,buyer,predicted_prob,logit_decile
3,14680,1,1175,43,19,2161,1,3,0,0,2,2,1,0.488192,1
5,19186,0,775,5,6,354,0,0,0,1,1,0,0,0.077907,4
6,67727,1,1072,29,1,1434,0,1,1,1,0,1,1,0.145538,2
9,94950,1,1127,12,6,737,0,0,1,0,0,1,0,0.091023,3
10,40684,0,1184,18,9,721,0,0,0,0,2,0,0,0.182012,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14991,64884,1,1049,8,7,728,0,1,1,0,0,1,0,0.114073,3
14993,60989,1,1179,26,7,1812,1,1,2,0,0,2,0,0.087584,4
14995,48199,1,1415,17,2,2128,1,0,3,2,2,0,0,0.136580,2
14998,52176,0,1120,5,2,361,0,1,0,0,1,1,0,0.195726,1


In [25]:
# Calculate the number of estimated customers to be targeted for the remaining 45000 customers
remaining_customer = 45000
estimated_targeted_customer = remaining_customer * targeted_rate
print(f"Number of customers to be targeted: {estimated_targeted_customer}")

Number of customers to be targeted: 16671.0


In [26]:
# The expected response rate would be the average of predicted probability among those targeted
expected_response_rate = targeted_df['predicted_prob'].mean()
print(f"Expected response rate: {expected_response_rate * 100 :.2f}%")

Expected response rate: 17.70%


In [32]:
# Expected profit among those targeted
expected_profit = estimated_targeted_customer * expected_response_rate * profit - estimated_targeted_customer * sample_cost
print(f"Expected profit: ${expected_profit:.2f}")

Expected profit: $463210.43


Q6:

In [37]:
# Calculate the average of response rate in test data set (the 15000 customers)
response_rate = df['predicted_prob'].mean()
print(f"Response rate: {response_rate * 100 :.2f}%")
# Calculate the profit if targeting all remaining customers
profit_remaining = remaining_customer * response_rate * profit - remaining_customer * sample_cost
print(f"Profit if target all remaining customers: ${profit_remaining:.2f}")
improvement = expected_profit-profit_remaining
print(f"Improvement: ${improvement:.2f}")

Response rate: 8.72%
Profit if target all remaining customers: $159480.00
Improvement: $303730.43


Based on calculations above, it can be seen that using the predictive model to target customers is significantly more profitable than targeting all customers without a model. The $303730 profit improvement is a clear indicator of the model's value, proving that it is essential for maximizing return on investment in customer marketing campaigns by targeting the right customers. This is far more important than targeting all customers.