## Budget Allocation Analysis
This notebook analyzes budget allocation using statistical models.

## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.optimize import minimize
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args
from sklearn.model_selection import train_test_split, GridSearchCV


## Statistical Analysis
Applying regression models and other statistical techniques to analyze budget allocation.


In [2]:
# Loading data
data = pd.read_csv("Monthdata.csv")
marketing_features = ['TV', 'Digital', 'Sponsorship', 'Content Marketing', 
                      'Online marketing', ' Affiliates', 'SEM', 'Radio', 'Other']

existing_features = [col for col in marketing_features if col in data.columns]
X = data[existing_features]
y = data['gmv']

# Normalizing features to prevent scale-related issues
X = X / X.max()

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitting Ridge and Lasso models
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Predictions
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# Model performance evaluation
ridge_score = ridge.score(X_test, y_test)
lasso_score = lasso.score(X_test, y_test)

# Baseline GMV
baseline_gmv = np.mean(y_test)

# Feature importance (weights of each channel)
channel_weights = np.abs(ridge.coef_)
normalized_weights = channel_weights / np.sum(channel_weights)
channel_weights_dict = pd.Series(channel_weights, index=existing_features).to_dict()
normalized_weights_dict = pd.Series(normalized_weights, index=existing_features).to_dict()

# Bayesian Optimization for Budget Allocation
def objective(params):
    budget_allocation = np.array(params)
    predicted_gmv = ridge.predict([budget_allocation * X.max()])[0]  # Scale back allocation
    return -predicted_gmv

space = [Real(0, 1, name=col) for col in X.columns]

@use_named_args(space)
def bayesian_objective(**params):
    return objective(list(params.values()))

res = gp_minimize(bayesian_objective, space, n_calls=50, random_state=42)
best_budget_allocation = np.array(res.x) * X.max()
optimized_gmv_bayesian = -res.fun

# Quadratic Programming for Constrained Budget Optimization
def quadratic_objective(x):
    return -ridge.predict([x * X.max()])[0]  # Scale back allocation

constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})  # Budget constraint
bounds = [(0, 1) for _ in range(len(X.columns))]

res_qp = minimize(quadratic_objective, np.ones(len(X.columns)) / len(X.columns), bounds=bounds, constraints=constraints)
best_qp_allocation = res_qp.x * X.max()
optimized_gmv_qp = -res_qp.fun

# Compute percentage increase in GMV
increase_bayesian = ((optimized_gmv_bayesian - baseline_gmv) / baseline_gmv) * 100
increase_qp = ((optimized_gmv_qp - baseline_gmv) / baseline_gmv) * 100

# Compare results
models_scores = {
    "Ridge": ridge_score,
    "Lasso": lasso_score,
    "Bayesian Optimization": optimized_gmv_bayesian,
    "Quadratic Programming": optimized_gmv_qp
}
print(models_scores)

best_model = max(models_scores, key=models_scores.get)

# Output results
results = {
    "Best Model": best_model,
    "Baseline GMV": baseline_gmv,
    "Optimized GMV (Bayesian)": optimized_gmv_bayesian,
    "Optimized GMV (QP)": optimized_gmv_qp,
    "Percentage Increase (Bayesian)": increase_bayesian,
    "Percentage Increase (QP)": increase_qp,
    "Best Bayesian Allocation": pd.DataFrame({
        "Channel": existing_features,
        "Allocation": best_budget_allocation,
        "Weight": [channel_weights_dict[channel] for channel in existing_features],
        "Normalized Weight": [normalized_weights_dict[channel] for channel in existing_features]
    }).to_string(index=False),
    "Channel Weights": channel_weights_dict
}

print("\nOptimized Budget Allocation Results:\n")
for key, value in results.items():
    if isinstance(value, pd.DataFrame):
        print(f"{key}:\n{value}\n")
    else:
        print(f"{key}: {value}\n")


  model = cd_fast.enet_coordinate_descent(


{'Ridge': 0.9223787009771746, 'Lasso': 0.6611607024334565, 'Bayesian Optimization': 518473457.80655193, 'Quadratic Programming': 177141114.60808295}

Optimized Budget Allocation Results:

Best Model: Bayesian Optimization

Baseline GMV: 294061569.0

Optimized GMV (Bayesian): 518473457.80655193

Optimized GMV (QP): 177141114.60808295

Percentage Increase (Bayesian): 76.31459274657952

Percentage Increase (QP): -39.7605354516479

Best Bayesian Allocation:           Channel  Allocation       Weight  Normalized Weight
               TV         1.0 5.535402e+07           0.144152
          Digital         1.0 1.140094e+07           0.029690
      Sponsorship         1.0 7.654264e+07           0.199330
Content Marketing         1.0 2.686592e+07           0.069964
 Online marketing         1.0 8.336787e+07           0.217104
       Affiliates         1.0 8.583886e+07           0.223539
              SEM         1.0 2.111998e+07           0.055000
            Radio         1.0 1.049728e+07    



In [None]:
# Loading data
data = pd.read_csv("Monthdata.csv")

# Marketing features for budget allocation 
marketing_features = ['TV', 'Digital', 'Sponsorship', 'Content Marketing', 
                      'Online marketing', ' Affiliates', 'SEM', 'Radio', 'Other']

# Ensure all features exist in the dataset
existing_features = [col for col in marketing_features if col in data.columns]
X = data[existing_features]
y = data['gmv']

# Normalizing features to prevent scale-related issues
X = X / X.max()

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Ridge and Lasso
param_grid = {'alpha': np.logspace(-3, 3, 20)}  # Search in range 0.001 to 1000
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=5, scoring='r2')
lasso_grid = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
ridge_grid.fit(X_train, y_train)
lasso_grid.fit(X_train, y_train)
best_alpha_ridge = ridge_grid.best_params_['alpha']
best_alpha_lasso = lasso_grid.best_params_['alpha']

# Training Ridge and Lasso with best alpha
ridge = Ridge(alpha=best_alpha_ridge).fit(X_train, y_train)
lasso = Lasso(alpha=best_alpha_lasso).fit(X_train, y_train)

# Predictions
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

# Model performance evaluation
ridge_score = ridge.score(X_test, y_test)
lasso_score = lasso.score(X_test, y_test)

# Baseline GMV
baseline_gmv = np.mean(y_test)

# Feature importance (weights of each channel)
channel_weights = np.abs(ridge.coef_)
normalized_weights = channel_weights / np.sum(channel_weights)
channel_weights_dict = pd.Series(channel_weights, index=existing_features).to_dict()
normalized_weights_dict = pd.Series(normalized_weights, index=existing_features).to_dict()

# Bayesian Optimization for Budget Allocation
def objective(params):
    budget_allocation = np.array(params)
    predicted_gmv = ridge.predict([budget_allocation * X.max()])[0]  # Scale back allocation
    return -predicted_gmv

space = [Real(0, 1, name=col) for col in X.columns]

@use_named_args(space)
def bayesian_objective(**params):
    return objective(list(params.values()))

res = gp_minimize(bayesian_objective, space, n_calls=50, random_state=42)
best_budget_allocation = np.array(res.x) * X.max()
optimized_gmv_bayesian = -res.fun

# Quadratic Programming for Constrained Budget Optimization
def quadratic_objective(x):
    return -ridge.predict([x * X.max()])[0]  # Scale back allocation

constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})  # Budget constraint
bounds = [(0, 1) for _ in range(len(X.columns))]

res_qp = minimize(quadratic_objective, np.ones(len(X.columns)) / len(X.columns), bounds=bounds, constraints=constraints)
best_qp_allocation = res_qp.x * X.max()
optimized_gmv_qp = -res_qp.fun

# Compute percentage increase in GMV
increase_bayesian = ((optimized_gmv_bayesian - baseline_gmv) / baseline_gmv) * 100
increase_qp = ((optimized_gmv_qp - baseline_gmv) / baseline_gmv) * 100

# Compare results
models_scores = {
    "Ridge (Optimized Alpha)": ridge_score,
    "Lasso (Optimized Alpha)": lasso_score,
    "Bayesian Optimization": optimized_gmv_bayesian,
    "Quadratic Programming": optimized_gmv_qp
}
print(models_scores)

best_model = max(models_scores, key=models_scores.get)

# Output results
results = {
    "Best Model": best_model,
    "Best Alpha (Ridge)": best_alpha_ridge,
    "Best Alpha (Lasso)": best_alpha_lasso,
    "Baseline GMV": baseline_gmv,
    "Optimized GMV (Bayesian)": optimized_gmv_bayesian,
    "Optimized GMV (QP)": optimized_gmv_qp,
    "Percentage Increase (Bayesian)": increase_bayesian,
    "Percentage Increase (QP)": increase_qp,
    "Best Bayesian Allocation": pd.DataFrame({
        "Channel": existing_features,
        "Allocation": best_budget_allocation,
        "Weight": [channel_weights_dict[channel] for channel in existing_features],
        "Normalized Weight": [normalized_weights_dict[channel] for channel in existing_features]
    }).to_string(index=False),
    "Channel Weights": channel_weights_dict
}

print("\nOptimized Budget Allocation Results:\n")
for key, value in results.items():
    if isinstance(value, pd.DataFrame):
        print(f"{key}:\n{value}\n")
    else:
        print(f"{key}: {value}\n")


 nan nan]
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
 

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.optimize import minimize
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args

# Loading data
data = pd.read_csv("Monthdata.csv")

# Marketing features for budget allocation
marketing_features = ['TV', 'Digital', 'Sponsorship', 'Content Marketing', 
                      'Online marketing', ' Affiliates', 'SEM', 'Radio', 'Other']

existing_features = [col for col in marketing_features if col in data.columns]
X = data[existing_features]
y = data['gmv']

# Normalizing features to prevent scale-related issues
X = X / X.max()

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Ridge and Lasso
param_grid = {'alpha': np.logspace(-3, 3, 20)}  # Search in range 0.001 to 1000
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=5, scoring='r2')
lasso_grid = GridSearchCV(Lasso(), param_grid, cv=5, scoring='r2')
ridge_grid.fit(X_train, y_train)
lasso_grid.fit(X_train, y_train)

best_alpha_ridge = ridge_grid.best_params_['alpha']
best_alpha_lasso = lasso_grid.best_params_['alpha']

# Train Ridge and Lasso with best alpha
ridge = Ridge(alpha=best_alpha_ridge).fit(X_train, y_train)

# Feature importance 
channel_weights = np.abs(ridge.coef_)
normalized_weights = channel_weights / np.sum(channel_weights)
channel_weights_dict = pd.Series(channel_weights, index=existing_features).to_dict()

# Baseline total budget
prev_total_budget = data['Total Investment'].mean()

# Function for GMV prediction with constrained budget allocation
def optimize_budget(total_budget):
    def objective(x):
        return -ridge.predict([x * X.max()])[0]  # Scale back allocation
    
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})  # Budget constraint
    bounds = [(0, 1) for _ in range(len(X.columns))]

    res = minimize(objective, np.ones(len(X.columns)) / len(X.columns), bounds=bounds, constraints=constraints)
    best_allocation = res.x * total_budget  # Scale budget
    optimized_gmv = -res.fun

    return best_allocation, optimized_gmv

# Iterate over budget levels (from 10% to 100% of previous year)
budget_levels = np.linspace(0.1, 1.0, 10) * prev_total_budget
results = []

for budget in budget_levels:
    allocation, gmv = optimize_budget(budget)
    results.append([budget, gmv] + list(allocation))

# Create DataFrame
columns = ['Total Budget', 'Optimized GMV'] + existing_features
budget_gmv_df = pd.DataFrame(results, columns=columns)

# Display Results
print("\nOptimized Budget Allocation Results for Varying Budgets:\n")
print(budget_gmv_df)


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.optimize import minimize
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args
from sklearn.model_selection import GridSearchCV

# Loading data
data = pd.read_csv("/kaggle/input/monthhhdataaa/Monthdata.csv")

# Marketing features for budget allocation (excluding 'Total Investment')
marketing_features = ['TV', 'Digital', 'Sponsorship', 'Content Marketing', 
                      'Online marketing', ' Affiliates', 'SEM', 'Radio', 'Other']

# Ensure features exist in dataset
existing_features = [col for col in marketing_features if col in data.columns]
X = data[existing_features]
y = data['gmv']

# Normalize features
X = X / X.max()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Ridge and Lasso
ridge_params = {'alpha': np.logspace(-3, 3, 10)}
lasso_params = {'alpha': np.logspace(-3, 3, 10)}

ridge_model = GridSearchCV(Ridge(), ridge_params, cv=5)
lasso_model = GridSearchCV(Lasso(), lasso_params, cv=5)

ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

best_ridge = ridge_model.best_estimator_
best_lasso = lasso_model.best_estimator_

# Predictions
y_pred_ridge = best_ridge.predict(X_test)
y_pred_lasso = best_lasso.predict(X_test)

# Model scores
ridge_score = best_ridge.score(X_test, y_test)
lasso_score = best_lasso.score(X_test, y_test)

# Baseline GMV
baseline_gmv = np.mean(y_test)

# Feature importance
channel_weights = np.abs(best_ridge.coef_)
normalized_weights = channel_weights / np.sum(channel_weights)
channel_weights_dict = pd.Series(channel_weights, index=existing_features).to_dict()
normalized_weights_dict = pd.Series(normalized_weights, index=existing_features).to_dict()

# Bayesian Optimization for Budget Allocation
def objective(params):
    budget_allocation = np.array(params)
    predicted_gmv = best_ridge.predict([budget_allocation * X.max()])[0]  
    return -predicted_gmv

space = [Real(0, 1, name=col) for col in X.columns]

@use_named_args(space)
def bayesian_objective(**params):
    return objective(list(params.values()))

res = gp_minimize(bayesian_objective, space, n_calls=50, random_state=42)
best_budget_allocation = np.array(res.x) * X.max()
optimized_gmv_bayesian = -res.fun

# Quadratic Programming for Budget Optimization
def quadratic_objective(x):
    return -best_ridge.predict([x * X.max()])[0]  

constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
bounds = [(0, 1) for _ in range(len(X.columns))]

res_qp = minimize(quadratic_objective, np.ones(len(X.columns)) / len(X.columns), bounds=bounds, constraints=constraints)
best_qp_allocation = res_qp.x * X.max()
optimized_gmv_qp = -res_qp.fun

# Compute percentage increase in GMV
increase_bayesian = ((optimized_gmv_bayesian - baseline_gmv) / baseline_gmv) * 100
increase_qp = ((optimized_gmv_qp - baseline_gmv) / baseline_gmv) * 100

# Budget scaling from 10% to 100%
budget_factors = np.linspace(0.1, 1.0, 10)

budget_scaled = []
gmv_bayesian = []
gmv_qp = []
increase_bayesian_list = []
increase_qp_list = []

for factor in budget_factors:
    scaled_budget_bayesian = best_budget_allocation * factor
    scaled_budget_qp = best_qp_allocation * factor

    predicted_gmv_bayesian = best_ridge.predict([scaled_budget_bayesian])[0]
    predicted_gmv_qp = best_ridge.predict([scaled_budget_qp])[0]

    increase_bayesian = ((predicted_gmv_bayesian - baseline_gmv) / baseline_gmv) * 100
    increase_qp = ((predicted_gmv_qp - baseline_gmv) / baseline_gmv) * 100

    budget_scaled.append(factor)
    gmv_bayesian.append(predicted_gmv_bayesian)
    gmv_qp.append(predicted_gmv_qp)
    increase_bayesian_list.append(increase_bayesian)
    increase_qp_list.append(increase_qp)

# Create DataFrame
budget_impact_df = pd.DataFrame({
    "Budget Factor": budget_scaled,
    "Optimized GMV (Bayesian)": gmv_bayesian,
    "Optimized GMV (QP)": gmv_qp,
    "Percentage Increase (Bayesian)": increase_bayesian_list,
    "Percentage Increase (QP)": increase_qp_list
})

# Print results
print("\nModel Scores:")
print(f"Ridge Score: {ridge_score:.4f}")
print(f"Lasso Score: {lasso_score:.4f}")

print("\nOptimized Budget Allocation Results:")
print(budget_impact_df)

print("\nBest Model:", "Ridge" if ridge_score > lasso_score else "Lasso")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Define independent variables (marketing channels) and dependent variable (ROI)
df['ROI'] = (df['gmv'] - df['Total Investment']) / df['Total Investment']

# Drop irrelevant or redundant columns
X = df.drop(columns=['ROI', 'Unnamed: 0', 'Year', 'Month', 'NPS', 'Stock Index', 'gmv', 'units',
                     'Mean Temp (°C)', 'Total Rain (mm)', 'Total Snow (cm)', 'Total Precip (mm)', 
                     'payday_week', 'holiday_week', 'Discount%', 'SEM', ' Affiliates', 'Total Investment'])

y = df["ROI"]

# Ensure numeric values
X = X.apply(pd.to_numeric, errors='coerce')
X = X.fillna(0)  # Handle missing values

# Check VIF to detect multicollinearity
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("\nVariance Inflation Factor (VIF) for Features:")
print(vif_data)

# Drop features with VIF > 10 (high multicollinearity)
X = X.drop(columns=vif_data[vif_data["VIF"] > 50]["Feature"].values)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train Linear Regression on selected features
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"\nR-squared: {r2:.4f}")

# Get feature importance
feature_importance = pd.Series(model.coef_, index=X.columns).sort_values(key=abs, ascending=False)
print("\nTop Influential Marketing Channels on ROI:")
print(feature_importance)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance.values, y=feature_importance.index, palette="coolwarm")
plt.xlabel("Impact on ROI")
plt.ylabel("Marketing Channels")
plt.title("Marketing Channels Impact on ROI")
plt.show()