In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.impute import SimpleImputer
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
from io import StringIO
import graphviz
from sklearn import tree
from sklearn.metrics import r2_score

In [3]:
# Load the dataset from the CSV file in the working directory.
# `data` is the frame that the imputation cell reads from.
data = pd.read_csv('dataset.csv')

In [4]:
# Read the same CSV a second time into `all_ds` and show it as the cell output.
# NOTE(review): this duplicates the load into `data`; `all_ds` is not referenced
# again in the visible cells — confirm whether the second read is needed.
all_ds = pd.read_csv('dataset.csv')
all_ds

Unnamed: 0,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,sales_country,marital_status,gender,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",126.62
1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",59.86
2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,USA,S,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",84.16
3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,USA,M,F,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,In-Store Coupon,95.78
4,Breakfast Foods,Frozen Foods,Food,4.08,1.4280,3.0,Double Down Sale,USA,M,M,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,Radio,50.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60423,Specialty,Carousel,Non-Consumable,2.76,1.3248,1.0,You Save Days,USA,M,F,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,In-Store Coupon,95.25
60424,Specialty,Carousel,Non-Consumable,1.60,0.4960,1.0,Price Cutters,USA,S,F,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,Sunday Paper,69.42
60425,Specialty,Carousel,Non-Consumable,5.52,2.5392,2.0,Weekend Markdown,USA,M,M,...,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,"Sunday Paper, Radio, TV",67.51
60426,Specialty,Carousel,Non-Consumable,8.28,2.5668,3.0,Sales Days,Canada,S,M,...,27463.0,4193.0,2795.0,1.0,0.0,0.0,0.0,1.0,Sunday Paper,132.88


In [None]:
# Fill missing values with each column's most frequent value.
# The imputer's output is wrapped back into a DataFrame with the original
# column labels and row index. Because a frame mixing text and numbers comes
# back from the imputer as generic `object` values, every column is then cast
# back to the dtype it had in `data`, so numeric columns stay numeric for the
# rest of the notebook.
imputer = SimpleImputer(strategy='most_frequent')
data_filled = imputer.fit_transform(data)
data_filled = (
    pd.DataFrame(data_filled, columns=data.columns, index=data.index)
    .astype(data.dtypes.to_dict())
)

In [None]:
# Separate features and target.
# The three target columns must not stay among the features: leaving them in
# `X` lets the model read the answer directly (target leakage) and makes every
# score computed later meaningless.
target_columns = ['store_sales(in millions)', 'store_cost(in millions)', 'unit_sales(in millions)']
unused_columns = ['food_category', 'food_department', 'food_family', 'promotion_name', 'sales_country']

X = data_filled.drop(columns=unused_columns + target_columns)
# Cast to float so the targets are numeric even when `data_filled` holds
# generic object values after imputation.
# NOTE(review): a later cell labels the predictions as a campaign price
# ('Prediksi harga kampanye'), while `y` holds the sales columns — confirm
# which target is actually intended.
y = data_filled[target_columns].astype(float)

In [None]:
# Replace each categorical column that is still present in X with an
# integer-coded `<name>_encoded` column appended at the end of the frame.
columns_to_encode = [
    'food_category', 'food_department', 'food_family', 'promotion_name',
    'marital_status', 'gender', 'member_card', 'occupation', 'houseowner',
    'brand_name', 'store_type', 'store_city', 'store_state', 'media_type',
    'education', 'avg. yearly_income',
]

# A single encoder is re-fitted per column, so after the loop `le` only
# remembers the mapping of the last column it encoded.
le = LabelEncoder()

for col in columns_to_encode:
    if col not in X.columns:
        # Column was already removed from the feature frame; nothing to encode.
        continue
    encoded_values = le.fit_transform(X[col])
    X = X.drop(columns=[col])
    X[col + '_encoded'] = encoded_values

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the feature columns of the training set, i.e. what remains in X
# after the drop and label-encoding steps.
print(X_train.columns)

In [None]:
# Report how many rows ended up in the training and test partitions.
n_train_rows, n_test_rows = len(X_train), len(X_test)
print("Jumlah Data Latih:", n_train_rows)
print("Jumlah Data Uji:", n_test_rows)

In [None]:
# Create and train a shallow (depth-3) Decision Tree Regressor.
# `random_state` is fixed so that ties between equally good splits are broken
# the same way on every run, making the fitted tree reproducible.
model = DecisionTreeRegressor(max_depth=3, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict the target columns for the held-out test set with the model
# currently bound to `model`.
y_pred = model.predict(X_test)

In [None]:
# Display the raw predictions for the test set.
# NOTE(review): the printed label reads "campaign price prediction", but `y`
# is built from the store sales / store cost / unit sales columns — confirm
# which target is intended.
print('Prediksi harga kampanye:', y_pred)

In [None]:
# Mean Squared Error of the test-set predictions against the true targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Visualize the fitted decision tree: export it to DOT, render it to
# 'Tree.png', and show that image as the cell output.
dot_data = StringIO()
export_graphviz(
    model,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    # Pass a plain list of names rather than a pandas Index.
    feature_names=list(X.columns),
    # `class_names` is intentionally omitted: it only applies to
    # single-output classifiers, and this is a multi-output regressor.
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Render once to disk, then display the saved file instead of rendering the
# graph a second time.
graph.write_png('Tree.png')
Image(filename='Tree.png')

In [None]:
# Train a second decision tree with no depth limit.
# This rebinds `model`, so every later cell uses this unrestricted tree
# instead of the depth-3 one fitted earlier.
# `random_state` is fixed so the fitted tree is reproducible between runs.
model = tree.DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Export the structure of the fitted tree to Graphviz and save it to disk.
# The tree bound to `model` has no depth limit, so drawing every node can
# produce an enormous graph that is slow to render and unreadable. Only the
# top levels are exported; set EXPORT_MAX_DEPTH to None to draw the full tree.
EXPORT_MAX_DEPTH = 5
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    max_depth=EXPORT_MAX_DEPTH,
    filled=True,
    rounded=True,
    special_characters=True,
    # Pass a plain list of names rather than a pandas Index.
    feature_names=list(X.columns),
)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Save the decision tree to a file

In [None]:
# Show the Graphviz source object built in the previous cell as this cell's
# output.
graph

In [None]:
# Predict on both partitions with the current `model` so the training and
# test scores can be compared.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Score the training-set predictions. The value reported as "accuracy" is the
# R^2 coefficient of determination returned by r2_score.
train_accuracy = r2_score(y_true=y_train, y_pred=y_pred_train)
print("Akurasi pada Data Latih:", train_accuracy)

In [None]:
# Score the test-set predictions. The value reported as "accuracy" is the
# R^2 coefficient of determination returned by r2_score.
test_accuracy = r2_score(y_true=y_test, y_pred=y_pred_test)
print("Akurasi pada Data Uji:", test_accuracy)