<a href="https://colab.research.google.com/github/hamzariffic/A-Red-Teamer-diaries/blob/master/Franchises_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the dataset

In [None]:
import pandas as pd

# Read the franchise workbook into a DataFrame.
# The path sits under /content, the working directory used by Colab sessions.
dataset_path = '/content/Franchises Dataset.xlsx'
data = pd.read_excel(dataset_path)

# Checking the data description and looking for missing values

In [None]:
# Summarise the frame's columns, dtypes and non-null counts; a non-null count
# lower than the row count would indicate missing values in that column.
data.info()

In [None]:
# Descriptive statistics for the columns of the loaded frame,
# used here as a quick sanity check of value ranges.
data.describe()

# Decision Tree model for net profit

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Report how many values are missing in each column, then discard incomplete rows.
# (The author notes that no missing values were detected, so the drop is a safeguard.)
missing_per_column = data.isnull().sum()
print(missing_per_column)
data = data.dropna()

# Replace the two text columns with integer codes. One encoder object is refit
# per column, so after the loop `le` holds the classes of 'Location ' only.
le = LabelEncoder()
for column in ('Business Type', 'Location '):
    data[column] = le.fit_transform(data[column])

# Model inputs and the quantity being predicted
target = 'Net Profit'
features = ['Counter Sales', 'Drive-through Sales', 'number of customers', 'Business Type', 'Location ']
X, y = data[features], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the decision tree regressor on the training portion
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = dt_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# "Accuracy" here is simply R-squared expressed as a percentage
accuracy = r2 * 100

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Model Accuracy: {accuracy:.2f}%")

# Simulating and visualizing impacts

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, plot_tree
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Build a synthetic franchise dataset (seeded so every run produces the same rows).
# NOTE: this reassigns `data`, replacing the frame loaded from the Excel file.
np.random.seed(42)
n_samples = 1000

data = pd.DataFrame({
    'Counter_Sales': np.random.uniform(0, 100, n_samples),
    'Drive_through_Sales': np.random.uniform(0, 100, n_samples),
    'Number_of_Customers': np.random.randint(0, 500, n_samples),
    'Business_Type': np.random.choice(['Fast Food', 'Casual Dining', 'Fine Dining'], n_samples),
    'Location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_samples)
})

# Net profit is a weighted sum of the numeric features plus a small Gaussian noise term
data['Net_Profit'] = (
    0.3 * data['Counter_Sales'] +
    0.4 * data['Drive_through_Sales'] +
    0.1 * data['Number_of_Customers'] +
    np.random.normal(0, 0.1, n_samples)  # Adding some noise
)

# Scale net profit by business type and location (the factors compound when both apply)
data.loc[data['Business_Type'] == 'Fast Food', 'Net_Profit'] *= 1.2
data.loc[data['Business_Type'] == 'Fine Dining', 'Net_Profit'] *= 1.5
data.loc[data['Location'] == 'Urban', 'Net_Profit'] *= 1.3
data.loc[data['Location'] == 'Rural', 'Net_Profit'] *= 0.8

# Encode the categorical columns as integer codes
le = LabelEncoder()
data['Business_Type'] = le.fit_transform(data['Business_Type'])
data['Location'] = le.fit_transform(data['Location'])

# Prepare features and target
features = ['Counter_Sales', 'Drive_through_Sales', 'Number_of_Customers', 'Business_Type', 'Location']
X = data[features]
y = data['Net_Profit']

# Fit the tree on all simulated rows (no train/test split in this cell)
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X, y)

# Visualize the decision tree.
# The model is grown with no depth limit on 1000 noisy rows, so it can have on
# the order of a thousand leaves; drawing all of it yields an unreadable figure.
# Only the top levels are rendered here; the fitted model itself is unchanged.
PLOT_MAX_DEPTH = 3
plt.figure(figsize=(20,10))
plot_tree(dt_model, max_depth=PLOT_MAX_DEPTH, feature_names=features,
          filled=True, rounded=True, fontsize=10)
plt.title(f"Decision Tree for Net Profit Prediction (top {PLOT_MAX_DEPTH} levels shown)")
plt.show()

# Feature importance, one line per feature in the order of `features`
importance = dt_model.feature_importances_
for feature_name, score in zip(features, importance):
    print(f'Feature: {feature_name}, Score: {score}')

# Partial dependence plots
from sklearn.inspection import PartialDependenceDisplay

# A single Axes is passed; scikit-learn uses it as the bounding area and lays
# out one sub-plot per feature inside it.
fig, ax = plt.subplots(figsize=(12, 8))
display = PartialDependenceDisplay.from_estimator(dt_model, X, features, ax=ax)
plt.show()

# Developing Random Forest Prediction Model For Franchises Dataset

In [None]:
# Preview the first rows of `data`. When the notebook is run top to bottom,
# `data` at this point is the simulated frame built in the previous cell,
# not the Excel dataset (that is reloaded in the next cell).
data.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Loading the Franchises dataset again and dropping incomplete rows, as in the
# decision-tree cell. `.copy()` gives an independent frame so the column
# assignments below do not act on a view.
data = pd.read_excel('/content/Franchises Dataset.xlsx').dropna().copy()

# Encode the text columns as integer codes. One encoder is kept per column so
# each mapping can be inverted later if needed.
encoders = {}
for column in ['Business Type', 'Location ']:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Preparing the features and target.
# These are the workbook's column names as used by the decision-tree cell
# (note the trailing space in 'Location ').
features = ['Counter Sales', 'Drive-through Sales', 'number of customers', 'Business Type', 'Location ']
target = 'Net Profit'

# Build X and y from the frame loaded above. Previously this cell reused
# whatever `X`/`y` an earlier cell had left in the kernel (the simulated data),
# so the model was never trained on the Franchises dataset.
X = data[features]
y = data[target]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and training the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions on the test set
y_pred = rf_model.predict(X_test)

# MSE and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Model Accuracy: {r2 * 100:.2f}%")

# Feature importance, one line per feature in the order of `features`
importance = rf_model.feature_importances_
for feature_name, score in zip(features, importance):
    print(f'Feature: {feature_name}, Score: {score}')

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.bar(range(len(importance)), importance)
plt.xticks(range(len(features)), features, rotation=45)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Random Forest Model')
plt.tight_layout()
plt.show()

# Partial dependence plots
from sklearn.inspection import PartialDependenceDisplay

# A single Axes is passed; scikit-learn uses it as the bounding area and lays
# out one sub-plot per feature inside it.
fig, ax = plt.subplots(figsize=(12, 8))
display = PartialDependenceDisplay.from_estimator(rf_model, X_test, features, ax=ax)
plt.show()

# Simulating the model parameters, then visualizing and interpreting the results

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

# Rebuild a seeded synthetic dataset. The draws below happen in the same order
# as the column order of the frame, so the random stream is fully reproducible.
np.random.seed(42)
n_samples = 1000

business_types = ['Burger store', 'Pizza Store', 'Café']
locations = ['Urban', 'Suburban', 'Rural']
data = pd.DataFrame({
    'Counter_Sales': np.random.uniform(0, 100, n_samples),
    'Drive_through_Sales': np.random.uniform(0, 100, n_samples),
    'Number_of_Customers': np.random.randint(0, 500, n_samples),
    'Business_Type': np.random.choice(business_types, n_samples),
    'Location': np.random.choice(locations, n_samples)
})

# Net profit: weighted sum of the numeric columns plus small Gaussian noise
noise = np.random.normal(0, 0.1, n_samples)
data['Net_Profit'] = (
    0.3 * data['Counter_Sales'] +
    0.4 * data['Drive_through_Sales'] +
    0.1 * data['Number_of_Customers'] +
    noise
)

# Category-specific multipliers, applied one after another in this order
profit_multipliers = [
    ('Business_Type', 'Burger store', 1.2),
    ('Business_Type', 'Café', 1.5),
    ('Location', 'Urban', 1.3),
    ('Location', 'Rural', 0.8),
]
for column, category, factor in profit_multipliers:
    data.loc[data[column] == category, 'Net_Profit'] *= factor

# Integer-encode the categorical columns; the single encoder is refit per
# column, so it ends up holding the 'Location' classes.
le = LabelEncoder()
for column in ('Business_Type', 'Location'):
    data[column] = le.fit_transform(data[column])

# Model inputs and target
features = ['Counter_Sales', 'Drive_through_Sales', 'Number_of_Customers', 'Business_Type', 'Location']
X, y = data[features], data['Net_Profit']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate hyper-parameter values sampled by the randomized search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Try 20 sampled configurations with 5-fold cross-validation, using all cores
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# The estimator refit with the best configuration found
best_rf = random_search.best_estimator_

# Evaluate on the held-out rows
y_pred = best_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Model Accuracy: {r2 * 100:.2f}%")

# Per-feature importance of the tuned forest, in the order of `features`
importance = best_rf.feature_importances_
for feature_name, score in zip(features, importance):
    print(f'Feature: {feature_name}, Score: {score}')

# Bar chart of the same importances
bar_positions = range(len(importance))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importance)
plt.xticks(range(len(features)), features, rotation=45)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance in Random Forest Model')
plt.tight_layout()
plt.show()

# Partial dependence of the prediction on each feature, computed on the test rows
fig, ax = plt.subplots(figsize=(15, 10))
display = PartialDependenceDisplay.from_estimator(best_rf, X_test, features, ax=ax)
plt.tight_layout()
plt.show()

# Predicting with both models

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Seeded synthetic dataset on a larger monetary scale than the earlier cells
np.random.seed(42)
n_samples = 1000

business_types = ['Fast Food', 'Casual Dining', 'Fine Dining', 'Pizza']
locations = ['Urban', 'Suburban', 'Rural']
data = pd.DataFrame({
    'Counter_Sales': np.random.uniform(0, 1000000, n_samples),
    'Drive_through_Sales': np.random.uniform(0, 1000000, n_samples),
    'Number_of_Customers': np.random.randint(0, 5000, n_samples),
    'Business_Type': np.random.choice(business_types, n_samples),
    'Location': np.random.choice(locations, n_samples)
})

# Net profit: weighted sum of the numeric columns plus Gaussian noise.
# Business type and location do not enter this simplified formula.
noise = np.random.normal(0, 100000, n_samples)
data['Net_Profit'] = (
    0.3 * data['Counter_Sales'] +
    0.4 * data['Drive_through_Sales'] +
    0.1 * data['Number_of_Customers'] * 100 +
    noise
)

# A dedicated encoder per categorical column, so each one can later translate
# a label for the new data point.
le_business = LabelEncoder()
data['Business_Type'] = le_business.fit_transform(data['Business_Type'])
le_location = LabelEncoder()
data['Location'] = le_location.fit_transform(data['Location'])

# Model inputs and target
features = ['Counter_Sales', 'Drive_through_Sales', 'Number_of_Customers', 'Business_Type', 'Location']
X, y = data[features], data['Net_Profit']

# Fit both models on every simulated row
dt_model = DecisionTreeRegressor(random_state=42).fit(X, y)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

# The single franchise to score, encoded with the fitted encoders
pizza_code = le_business.transform(['Pizza'])
urban_code = le_location.transform(['Urban'])  # Richmond is assumed to count as urban
new_data = pd.DataFrame({
    'Counter_Sales': [500000],
    'Drive_through_Sales': [700000],
    # Customer count is unknown, so 0 is used as a placeholder.
    # NOTE(review): 0 is the lowest value in the simulated range; consider
    # whether a more central stand-in would be appropriate.
    'Number_of_Customers': [0],
    'Business_Type': pizza_code,
    'Location': urban_code
})

# One prediction per model for that row
dt_prediction = dt_model.predict(new_data)[0]
rf_prediction = rf_model.predict(new_data)[0]

print(f"Decision Tree Prediction: ${dt_prediction:.2f}")
print(f"Random Forest Prediction: ${rf_prediction:.2f}")