In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Read the sales export into the working DataFrame used by every later cell.
df = pd.read_csv('sales-data.csv')

# Preview the first five rows (the default for head) as a quick sanity check.
print(df.head(n=5))

  OrderDate  OrderID                                        ProductName  \
0  1/4/2024   103800  Message Book, Wirebound, Four 5 1/2 X 4 Forms/...   
1  1/5/2024   112326                                          Avery 508   
2  1/5/2024   112326         GBC Standard Plastic Binding Systems Combs   
3  1/5/2024   112326                      SAFCO Boltless Steel Shelving   
4  1/6/2024   141817  Avery Hi-Liter EverBold Pen Style Fluorescent ...   

   Profit  Quantity  Sales  SalesperCustomer  ProfitRatio  
0       6         2     16             16.45         33.8  
1       4         3     12             11.78         36.3  
2      -5         2      4              3.54       -155.0  
3     -65         3    273            272.74        -23.8  
4       5         3     20             19.54         25.0  


In [16]:
# Inspect the dtype pandas inferred for each column before doing any conversion.
column_dtypes = df.dtypes
print("Data types:\n", column_dtypes)

Data types:
 OrderDate            object
OrderID               int64
ProductName          object
Profit                int64
Quantity              int64
Sales                 int64
SalesperCustomer    float64
ProfitRatio         float64
dtype: object


In [17]:
# Force the measure columns to numeric dtypes. With errors='coerce', any value
# that cannot be parsed becomes NaN instead of raising, so the null count
# below doubles as a check for unparseable entries.
measure_columns = ['Profit', 'Quantity', 'Sales', 'SalesperCustomer', 'ProfitRatio']
for column in measure_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Report how many nulls each column holds after the conversion.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 OrderDate           0
OrderID             0
ProductName         0
Profit              0
Quantity            0
Sales               0
SalesperCustomer    0
ProfitRatio         0
dtype: int64


In [18]:
# Impute missing values in numeric columns with the column mean.
# The null check earlier in this notebook reported zero missing values, so on
# the current data this is a safety net rather than an active transformation.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Feature engineering: binary label, 1 when Profit is strictly positive.
# Vectorised comparison instead of a per-row lambda; NaN > 0 is False, so any
# NaN still maps to 0 exactly as the lambda did.
df['IsProfitable'] = (df['Profit'] > 0).astype(int)

# NOTE(review): target leakage. 'IsProfitable' is computed directly from
# 'Profit', and 'Profit' (plus 'ProfitRatio', presumably derived from it --
# TODO confirm) is also used as an input feature. The classifier can recover
# the label from a single threshold on 'Profit', which makes a perfect score
# attainable without the model learning anything useful. The feature list is
# left unchanged here because the prediction cell below builds its input with
# these same five columns; drop the profit-derived columns in both places
# together to get a meaningful evaluation.
features = ['Profit', 'Quantity', 'Sales', 'SalesperCustomer', 'ProfitRatio']
X = df[features]  # Features
y = df['IsProfitable']  # Target

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting the
# scaler on the full matrix (as was done previously) leaks test-set mean and
# variance into preprocessing. The row assignment is the same as before: it
# depends only on the number of rows and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: learn mean/std from the training rows only, then apply the
# same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later cell that
# reads it; it is not used for fitting.
X_scaled = scaler.transform(X)

# Initialize and train the Decision Tree model (seeded for reproducibility).
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the held-out rows and report per-class metrics.
y_pred = clf.predict(X_test)
print("Model Evaluation:\n", classification_report(y_test, y_pred))


Model Evaluation:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       414
           1       1.00      1.00      1.00      1585

    accuracy                           1.00      1999
   macro avg       1.00      1.00      1.00      1999
weighted avg       1.00      1.00      1.00      1999



In [19]:
# Score a few hand-written example rows with the fitted scaler and classifier.
# Column names and order match the feature list used for training, which is
# the layout the fitted scaler expects.
example_columns = ['Profit', 'Quantity', 'Sales', 'SalesperCustomer', 'ProfitRatio']
example_rows = [
    [8, 10, 50, 15, 20],
    [-2, 5, 20, 8, -100],
    [5, 8, 40, 12, 30],
]
new_data = pd.DataFrame(example_rows, columns=example_columns)

# Apply the already-fitted scaling (transform only, no refit) and classify.
new_data_scaled = scaler.transform(new_data)
new_predictions = clf.predict(new_data_scaled)
print("Predicted profitability for new products:", new_predictions)

Predicted profitability for new products: [1 0 1]
