In [12]:
#First, we import all the necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Raw-GitHub location of the weekly sales-transactions CSV, built from its path pieces
file_url = ''.join([
    'https://raw.githubusercontent.com/',
    'PacktWorkshops/',
    'The-Applied-Artificial-Intelligence-Workshop/',
    'master/Datasets/',
    'Sales_Transactions_Dataset_Weekly.csv',
])

# Download and parse the CSV; df is the frame every later sales cell starts from
df = pd.read_csv(filepath_or_buffer=file_url)

# Show the top rows as a quick sanity check of the columns
df.head()


Unnamed: 0,Product_Code,W0,W1,W2,W3,W4,W5,W6,W7,W8,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,P1,11,12,10,8,13,12,14,21,6,...,0.06,0.22,0.28,0.39,0.5,0.0,0.22,0.17,0.11,0.39
1,P2,7,6,3,2,7,1,6,3,3,...,0.2,0.4,0.5,0.1,0.1,0.4,0.5,0.1,0.6,0.0
2,P3,7,11,8,9,10,8,7,13,12,...,0.27,1.0,0.18,0.18,0.36,0.45,1.0,0.45,0.45,0.36
3,P4,12,8,13,5,9,6,9,13,13,...,0.41,0.47,0.06,0.12,0.24,0.35,0.71,0.35,0.29,0.35
4,P5,8,5,13,11,6,7,9,14,9,...,0.27,0.53,0.27,0.6,0.2,0.2,0.13,0.53,0.33,0.4


In [13]:
# Keep only the normalized weekly columns ("Normalized 0" ... "Normalized 51").
# They are selected by name rather than by dropping "the first 55 columns"
# (Product_Code + W0-W51 + MIN/MAX): this avoids handing a whole DataFrame to
# drop() as its `labels` argument and does not depend on column positions.
# filter() returns a new object, so the original df is left as it was.
df2 = df.filter(regex=r'^Normalized ')

# Fail fast if the expected columns are missing instead of clustering an empty frame
assert not df2.columns.empty, "no 'Normalized *' columns found in df"

# Display the new DataFrame with only normalized columns
df2.head()

#df2 contains only the normalized weekly sales columns (Normalized 0–Normalized 51).
#df itself is not modified by this cell.


Unnamed: 0,Normalized 0,Normalized 1,Normalized 2,Normalized 3,Normalized 4,Normalized 5,Normalized 6,Normalized 7,Normalized 8,Normalized 9,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,0.44,0.5,0.39,0.28,0.56,0.5,0.61,1.0,0.17,0.61,...,0.06,0.22,0.28,0.39,0.5,0.0,0.22,0.17,0.11,0.39
1,0.7,0.6,0.3,0.2,0.7,0.1,0.6,0.3,0.3,0.3,...,0.2,0.4,0.5,0.1,0.1,0.4,0.5,0.1,0.6,0.0
2,0.36,0.73,0.45,0.55,0.64,0.45,0.36,0.91,0.82,0.27,...,0.27,1.0,0.18,0.18,0.36,0.45,1.0,0.45,0.45,0.36
3,0.59,0.35,0.65,0.18,0.41,0.24,0.41,0.65,0.65,0.53,...,0.41,0.47,0.06,0.12,0.24,0.35,0.71,0.35,0.29,0.35
4,0.33,0.13,0.67,0.53,0.2,0.27,0.4,0.73,0.4,0.4,...,0.27,0.53,0.27,0.6,0.2,0.2,0.13,0.53,0.33,0.4


In [14]:
# KMeans is only needed from this cell onwards
from sklearn.cluster import KMeans

# Hyperparameters kept together so they are easy to spot and tune:
#   n_clusters   -> number of groups the products are split into
#   random_state -> fixed seed so repeated runs give the same clustering
kmeans_params = {"n_clusters": 8, "random_state": 8}

# Build the estimator, then fit it on the normalized weekly sales in df2.
# fit() is left as the last expression so the cell still displays the fitted model.
k_means_model = KMeans(**kmeans_params)
k_means_model.fit(df2)




In [15]:
# Cluster index assigned to each row of df2 by the fitted model
# (one entry per product, in the same row order as the data passed to fit())
labels = k_means_model.labels_

# Preview the first 10 labels
labels[:10]

# The model was created with n_clusters=8, so each label is an integer in 0–7.
# Rows sharing a label were assigned to the same cluster by K-Means.



array([6, 6, 2, 6, 6, 1, 6, 6, 2, 2])

In [16]:
# Rebuild df as: raw weekly sales columns (W0, W1, ...) + the cluster label.
# Columns are picked by name and df is rebound to a new frame instead of being
# changed with inplace drops, so the cell can be re-run safely; the previous
# version raised KeyError on a second run because 'Product_Code' was already gone.
# Everything else (Product_Code, MIN, MAX and the normalized columns) is left out.
df = df.filter(regex=r'^W\d+$').assign(label=labels)

# Display the updated DataFrame
df.head()

#df now contains weekly sales + cluster label for each product.
#This allows aggregation and analysis of products by cluster.


Unnamed: 0,W0,W1,W2,W3,W4,W5,W6,W7,W8,W9,...,W43,W44,W45,W46,W47,W48,W49,W50,W51,label
0,11,12,10,8,13,12,14,21,6,14,...,7,8,10,12,3,7,6,5,10,6
1,7,6,3,2,7,1,6,3,3,3,...,4,5,1,1,4,5,1,6,0,6
2,7,11,8,9,10,8,7,13,12,6,...,14,5,5,7,8,14,8,8,7,2
3,12,8,13,5,9,6,9,13,13,11,...,10,3,4,6,8,14,8,7,8,6
4,8,5,13,11,6,7,9,14,9,9,...,11,7,12,6,6,5,11,8,9,6


In [17]:
# Weekly sales totals for every cluster: one row per label, one column per week
df_agg = df.groupby('label').sum()

# Per-cluster summary built as a single chain:
#   count_product        -> number of non-null W0 entries, i.e. products in the cluster
#   total_sales          -> all weekly totals of the cluster added together
#   yearly_average_sales -> total_sales divided by count_product
# Rows are ordered from the highest to the lowest yearly average.
df_final = (
    df.groupby('label')['W0']
    .count()
    .to_frame(name='count_product')
    .assign(total_sales=df_agg.sum(axis=1))
    .assign(
        yearly_average_sales=lambda summary: (
            summary['total_sales'] / summary['count_product']
        )
    )
    .sort_values(by='yearly_average_sales', ascending=False)
)

# Display the final aggregated DataFrame
df_final

#groupby('label') groups the products by their cluster.
#sum() on the grouped frame gives each cluster's total sales for every week.
#yearly_average_sales is the average sales per product over all weeks in the data.
#The descending sort puts the best-selling clusters at the top.


Unnamed: 0_level_0,count_product,total_sales,yearly_average_sales
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,128,173808,1357.875
6,129,86341,669.310078
2,96,48778,508.104167
1,90,43112,479.022222
4,85,17390,204.588235
7,107,4348,40.635514
0,8,96,12.0
3,168,1414,8.416667


In [18]:


# Where the UCI Car Evaluation data file is hosted
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"

# Column names as per UCI documentation, passed to read_csv explicitly
columns = [
    "buying",
    "maintenance",
    "doors",
    "persons",
    "luggage_boot",
    "safety",
    "class",
]

# Read the file with no header row and the names above.
# Note: this rebinds df, replacing the sales DataFrame used in the earlier cells.
df = pd.read_csv(url, header=None, names=columns)

# Preview first 5 rows
df.head()

#Dataset has categorical features describing cars and a target variable (class).


Unnamed: 0,buying,maintenance,doors,persons,luggage_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [19]:
# Helper function to encode categorical columns
def encode(data_frame, column):
    """Return integer codes for the values of one DataFrame column.

    Args:
        data_frame: DataFrame holding the column to encode.
        column: Label of the column in ``data_frame``.

    Returns:
        Array with one integer code per row, produced by a LabelEncoder
        fitted on the column's distinct values.
    """
    from sklearn.preprocessing import LabelEncoder

    values = data_frame[column]
    # Learn the code table from the distinct values, then map every row through it
    encoder = LabelEncoder().fit(values.unique())
    return encoder.transform(values)

# Encode every column (features and target alike) in one step.
# assign() works on a copy, so df keeps its original string values.
df_encoded = df.assign(**{name: encode(df, name) for name in df.columns})

# Preview encoded dataset
df_encoded.head()

#scikit-learn estimators such as the decision tree used below require numeric input.
#LabelEncoder assigns integer codes in sorted (alphabetical) order of the values, not by their meaning (e.g., high → 0, low → 1, med → 2).


Unnamed: 0,buying,maintenance,doors,persons,luggage_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [20]:
# Separate the target from the features without modifying df_encoded.
# The previous pop("class") removed the column in place, so the frame shown
# by the earlier cell went stale and re-running this cell raised KeyError.
label = df_encoded["class"]

# All remaining columns are features; drop() returns a new frame
features = df_encoded.drop(columns="class")

# Split into training and testing sets (90% train, 10% test);
# random_state pins the shuffle so the split is the same on every run
features_train, features_test, label_train, label_test = train_test_split(
    features, label, test_size=0.1, random_state=88
)

# Preview shapes
print("Training features shape:", features_train.shape)
print("Testing features shape:", features_test.shape)
print("Training labels shape:", label_train.shape)
print("Testing labels shape:", label_test.shape)

#The target column is selected by name and dropped from a copy, so df_encoded stays intact.
#train_test_split holds out 10% of the rows so the model is evaluated on data it was not trained on.

Training features shape: (1555, 6)
Testing features shape: (173, 6)
Training labels shape: (1555,)
Testing labels shape: (173,)


In [21]:
# Create the classifier with a fixed seed and train it in one step
# (fit() returns the estimator itself, so the fitted tree is what gets stored)
decision_tree = DecisionTreeClassifier(random_state=88).fit(
    features_train, label_train
)

# Predict the class of every row in the held-out test set
predictions = decision_tree.predict(features_test)

# Show a small sample of the predicted values
print("First 10 predictions:", predictions[:10])

#A decision tree predicts the car class from learned rules on the feature values.
#The predictions are the integer codes of the encoded 'class' column, not the original strings.


First 10 predictions: [1 1 2 1 0 2 3 2 2 2]


In [22]:
# Compute both evaluation results first:
#   accuracy -> mean accuracy of the tree on the test features/labels
#   report   -> text table built from the test labels and the stored predictions
accuracy = decision_tree.score(features_test, label_test)
report = classification_report(label_test, predictions)

# Then print them in the same order as before
print("Decision Tree accuracy on test set:", accuracy)
print("Classification Report:\n", report)

#accuracy is the fraction of test rows classified correctly.
#classification_report lists precision, recall, f1-score, and support for each class.




Decision Tree accuracy on test set: 0.953757225433526
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92        42
           1       0.89      0.89      0.89         9
           2       0.99      0.96      0.98       114
           3       1.00      0.75      0.86         8

    accuracy                           0.95       173
   macro avg       0.94      0.89      0.91       173
weighted avg       0.96      0.95      0.95       173

