In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from lazypredict.Supervised import LazyClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, LabelEncoder

import pre_processing_funcs as pre

In [2]:
# Read the partially cleaned dataset, then print a per-column summary
# (non-null counts and dtypes) to see where values are still missing.
df = pd.read_csv(filepath_or_buffer='data/half_cleaned.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38862 entries, 0 to 38861
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Promotion Name             33034 non-null  object 
 1   Store Kind                 33053 non-null  object 
 2   Store Sales                38847 non-null  float64
 3   Store Cost                 38855 non-null  float64
 4   Is Recyclable?             38862 non-null  bool   
 5   Store Area                 35296 non-null  float64
 6   Grocery Area               35255 non-null  float64
 7   Frozen Area                35309 non-null  float64
 8   Meat Area                  35313 non-null  float64
 9   Cost                       38831 non-null  float64
 10  Marriage                   38862 non-null  object 
 11  Gender                     38862 non-null  object 
 12  Children                   38862 non-null  int64  
 13  Degree                     38862 non-null  obj

In [3]:
# Amenities Score: every amenity column is cast to int and the columns are
# summed row-wise into a single feature.
cols_to_drop = ["Bar For Salad", "Ready Food", "Florist", "Coffee Bar", "Video Store"]
df['Amenities Score'] = sum(df[amenity].astype(int) for amenity in cols_to_drop)

# The individual amenity columns are summarised by the score, so remove them.
df.drop(columns=cols_to_drop, inplace=True)

In [4]:
# Fill missing values with the project helper `pre.fill_nulls` (defined outside this notebook).
# NOTE(review): judging by the summary printed below, the numeric columns come back fully
# populated while "Promotion Name" and "Store Kind" keep their nulls; confirm against the helper.
df = pre.fill_nulls(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38862 entries, 0 to 38861
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Promotion Name             33034 non-null  object 
 1   Store Kind                 33053 non-null  object 
 2   Store Sales                38862 non-null  float64
 3   Store Cost                 38862 non-null  float64
 4   Is Recyclable?             38862 non-null  bool   
 5   Store Area                 38862 non-null  float64
 6   Grocery Area               38862 non-null  float64
 7   Frozen Area                38862 non-null  float64
 8   Meat Area                  38862 non-null  float64
 9   Cost                       38862 non-null  float64
 10  Marriage                   38862 non-null  object 
 11  Gender                     38862 non-null  object 
 12  Children                   38862 non-null  int64  
 13  Degree                     38862 non-null  obj

## Split

In [16]:
# Tag unknown promotions with a "missing" placeholder, then separate the rows to
# impute (pro_test) from the labelled rows (pro_train). "Cost" is excluded from both.
df["Promotion Name"] = df["Promotion Name"].fillna("missing")
is_promo_missing = df["Promotion Name"].eq("missing")
pro_test = df.loc[is_promo_missing].drop(columns=["Promotion Name", "Cost"])
pro_train = df.loc[~is_promo_missing].drop(columns=["Cost"])

In [17]:
# Keep only fully populated training rows.
pro_train = pro_train.dropna(axis=0, how="any")

In [5]:
# Tag unknown store kinds with a "missing" placeholder, then separate the rows to
# impute (kind_test) from the labelled rows (kind_train). "Promotion Name" and
# "Cost" are excluded from both.
df["Store Kind"] = df["Store Kind"].fillna("missing")
is_kind_missing = df["Store Kind"].eq("missing")
kind_test = df.loc[is_kind_missing].drop(columns=["Promotion Name", "Cost", "Store Kind"])
kind_train = df.loc[~is_kind_missing].drop(columns=["Promotion Name", "Cost"])

In [6]:
# Keep only fully populated training rows.
kind_train = kind_train.dropna(axis=0, how="any")

## Promotion Name as Target

In [18]:
# Encode categorical features using Label Encoding.
#
# One encoder per column is fitted on the union of the train and test values and
# then applied to both frames. Fitting a separate encoder on each frame would give
# the same category different integer codes in pro_train and pro_test, so a model
# trained on pro_train would misread pro_test at prediction time.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back with `inverse_transform` later.
label_encoders = {}
categorical_columns = ['Marriage', 'Gender', 'Degree', 'Work', 'Store Code', 'Country ISO2', 'Order Brand', 'Order', 'Department', 'Store Kind']
for column in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([pro_train[column], pro_test[column]], ignore_index=True))
    pro_train[column] = encoder.transform(pro_train[column])
    pro_test[column] = encoder.transform(pro_test[column])
    label_encoders[column] = encoder

In [19]:
# Peek at the first rows of the encoded training frame.
pro_train.head(n=5)

Unnamed: 0,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,Marriage,...,Store Code,Country ISO2,Order Brand,Order,Department,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income,Amenities Score
0,Dimes Off,0,8760000.0,4292400.0,True,2842.23,2037.64,481.98,323.0,1,...,6,9,88,15,14,28.2,26.6,1.6,10000.0,5
1,Budget Bargains,4,6360000.0,1971600.0,False,2814.95,2049.72,457.36,305.02,1,...,14,7,77,41,19,16.57,14.97,1.6,50000.0,0
2,Shelf Emptiers,4,10860000.0,4452600.0,True,2192.32,1322.21,523.32,348.85,0,...,7,1,35,30,16,28.64,27.18,1.45,30000.0,1
3,Savings Galore,4,1980000.0,673200.0,True,1974.73,1243.14,440.92,293.95,1,...,13,7,19,17,9,14.22,11.29,2.92,30000.0,1
4,Sale Winners,0,11560000.0,4970800.0,False,2862.3,1872.19,593.93,395.95,0,...,8,8,54,44,17,12.62,9.71,2.91,50000.0,5


In [20]:
# Separate the label from the features and hold out 20% of the labelled rows
# for validation (fixed seed for reproducibility).
target = "Promotion Name"
y = pro_train[target]
X = pro_train.drop(columns=[target])

X_pro_train, X_pro_val, y_pro_train, y_pro_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Column groups for a ColumnTransformer-based preprocessing step:
# float columns are standardised, object columns are ordinal-encoded.
numeric_columns = X.select_dtypes(include="float").columns
ord_categorical_columns = X.select_dtypes(include="object").columns

# NOTE(review): this list is not referenced by the transformer built below.
hot_categorical_columns = ['Marriage', 'Gender', 'Is Recyclable?']

ordinal_step = ('cat', OrdinalEncoder(), ord_categorical_columns)
scaling_step = ('num', StandardScaler(), numeric_columns)
preprocessor = ColumnTransformer(transformers=[ordinal_step, scaling_step])

In [24]:
# Random Forest baseline for predicting "Promotion Name".
pro_clf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
)
pro_clf.fit(X=X_pro_train, y=y_pro_train)

RandomForestClassifier(n_estimators=400, random_state=42)

In [25]:
# Mean accuracy of the Random Forest on the validation split.
pro_clf.score(X=X_pro_val, y=y_pro_val)

0.20962615407900712

In [28]:
# Extra-Trees alternative with the same size and seed as the Random Forest.
pro_ext = ExtraTreesClassifier(
    n_estimators=400,
    random_state=42,
)
pro_ext.fit(X=X_pro_train, y=y_pro_train)

ExtraTreesClassifier(n_estimators=400, random_state=42)

In [29]:
# Mean accuracy of the Extra-Trees model on the validation split.
pro_ext.score(X=X_pro_val, y=y_pro_val)

0.26835174814590584

In [36]:
# Predict a promotion for every row whose "Promotion Name" was missing.
y_pro_pred = pro_ext.predict(X=pro_test)
y_pro_pred

array(['One Day Sale', 'You Save Days', 'Dimes Off', ...,
       'Lottery Cash Registerion', 'Wallet Savers', 'Weekend Discount'],
      dtype=object)

In [37]:
# Class frequencies before imputation (the "missing" placeholder is still present).
df["Promotion Name"].value_counts(sort=True, ascending=False)

missing                     5828
Save Price                  1288
Weekend Discount            1279
Two Day Sale                1258
Price Winners               1157
Super Savers                1086
Save It (Sale)              1074
Super Duper Savers          1066
One Day Sale                1010
Roller Savings (High)        992
Sale : Double Down           970
GLD                          966
Price Slashers               941
Shelf Clearing Days          909
Full Free                    892
Lottery Cash Registerion     828
Go For It                    812
Two for One                  803
Money Savers                 790
Big Time Discounts           771
Price Destroyers             739
Saving Days                  721
Budget Bargains              720
Discount Frenzy              667
Best Price Savers            652
Dimes Off                    636
Promo Big                    607
You Save Days                605
Savings Galore               596
Price Cutters                589
Sales Days

In [38]:
# Overwrite the "missing" placeholders with the model's predictions; the mask selects
# the same rows, in the same order, that pro_test was built from.
rows_to_fill = df['Promotion Name'] == "missing"
df.loc[rows_to_fill, 'Promotion Name'] = y_pro_pred

In [39]:
# Class frequencies after imputation.
df["Promotion Name"].value_counts(sort=True, ascending=False)

Weekend Discount            1663
Save Price                  1481
Two Day Sale                1474
Price Winners               1381
Super Duper Savers          1308
Save It (Sale)              1277
Super Savers                1275
Roller Savings (High)       1179
One Day Sale                1176
GLD                         1164
Lottery Cash Registerion    1161
Sale : Double Down          1112
Shelf Clearing Days         1076
Price Slashers              1068
Full Free                   1050
Go For It                    944
Two for One                  932
Money Savers                 920
Big Time Discounts           879
Price Destroyers             849
Saving Days                  836
Budget Bargains              829
Best Price Savers            792
Dimes Off                    776
Discount Frenzy              768
You Save Days                708
Promo Big                    708
Price Cutters                707
Sales Days                   704
Savings Galore               668
Bag Stuffe

In [40]:
# Remaining null count per column.
df.isnull().sum(axis=0)

Promotion Name               0
Store Kind                   0
Store Sales                  0
Store Cost                   0
Is Recyclable?               0
Store Area                   0
Grocery Area                 0
Frozen Area                  0
Meat Area                    0
Cost                         0
Marriage                     0
Gender                       0
Children                     0
Degree                       0
Work                         0
Store Code                   0
Country ISO2                 0
Order Brand                  2
Order                        2
Department                   2
Gross Weight                 0
Net Weight                   0
Package Weight               0
Min. Person Yearly Income    0
Amenities Score              0
dtype: int64

In [None]:
# Benchmark a broad set of off-the-shelf classifiers on the promotion task.
clf = LazyClassifier(
    verbose=1,
    ignore_warnings=True,
    custom_metric=None,
)
lazy_results = clf.fit(X_pro_train, X_pro_val, y_pro_train, y_pro_val)
models, predictions = lazy_results

print(models)

  3%|██▊                                                                                | 1/29 [00:10<05:04, 10.89s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.06553655214166793, 'Balanced Accuracy': 0.05549800099100037, 'ROC AUC': None, 'F1 Score': 0.02309763285709612, 'Time taken': 10.882581233978271}


  7%|█████▋                                                                             | 2/29 [00:16<03:25,  7.60s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.16936582412592704, 'Balanced Accuracy': 0.1693295875472231, 'ROC AUC': None, 'F1 Score': 0.16513341778943344, 'Time taken': 5.293038606643677}


 10%|████████▌                                                                          | 3/29 [00:16<01:53,  4.38s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.11079158468291206, 'Balanced Accuracy': 0.13869676790651464, 'ROC AUC': None, 'F1 Score': 0.05066174699567309, 'Time taken': 0.542926549911499}


In [None]:
# CatBoost multi-class model; any object-dtype feature is passed as a categorical feature.
object_feature_names = X_pro_train.select_dtypes("object").columns.tolist()

cat = CatBoostClassifier(loss_function='MultiClass', iterations=500, random_state=42)
cat.fit(
    X_pro_train,
    y_pro_train,
    cat_features=object_feature_names,
    eval_set=(X_pro_val, y_pro_val),
    plot=True,
)

## Store Kind as Target

In [7]:
# Encode categorical features using Label Encoding.
#
# One encoder per column is fitted on the union of the train and test values and
# then applied to both frames. Fitting a separate encoder on each frame would give
# the same category different integer codes in kind_train and kind_test, so a model
# trained on kind_train would misread kind_test at prediction time.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back with `inverse_transform` later.
label_encoders = {}
categorical_columns = ['Marriage', 'Gender', 'Degree', 'Work', 'Store Code', 'Country ISO2', 'Order Brand', 'Order', 'Department']
for column in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([kind_train[column], kind_test[column]], ignore_index=True))
    kind_train[column] = encoder.transform(kind_train[column])
    kind_test[column] = encoder.transform(kind_test[column])
    label_encoders[column] = encoder

In [8]:
# Separate the label from the features and hold out 20% of the labelled rows
# for validation (fixed seed for reproducibility).
target = "Store Kind"
y = kind_train[target]
X = kind_train.drop(columns=[target])

X_kind_train, X_kind_val, y_kind_train, y_kind_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Random Forest (library-default size) for predicting "Store Kind".
kind_clf = RandomForestClassifier(
    random_state=42,
)
kind_clf.fit(X=X_kind_train, y=y_kind_train)

RandomForestClassifier(random_state=42)

In [10]:
# Mean accuracy of the Store Kind model on the validation split.
kind_clf.score(X=X_kind_val, y=y_kind_val)

0.9998487369535622

In [11]:
# Training-set accuracy, shown next to the validation score above to check for over-fitting.
# `accuracy_score` is imported in the notebook's import cell rather than mid-notebook.
accuracy_score(y_kind_train, kind_clf.predict(X_kind_train))

1.0

In [12]:
# Predict a store kind for every row whose "Store Kind" was missing.
y_kind_pred = kind_clf.predict(X=kind_test)
y_kind_pred

array(['Supermarket', 'Supermarket', 'Supermarket', ..., 'Small Grocery',
       'Deluxe', 'Gourmet'], dtype=object)

In [13]:
# Class frequencies before imputation (the "missing" placeholder is still present).
df["Store Kind"].value_counts(sort=True, ascending=False)

Supermarket      14315
Deluxe           12602
missing           5809
Gourmet           3554
Mid-Size          1497
Small Grocery     1085
Name: Store Kind, dtype: int64

In [14]:
# Overwrite the "missing" placeholders with the model's predictions; the mask selects
# the same rows, in the same order, that kind_test was built from.
rows_to_fill = df['Store Kind'] == "missing"
df.loc[rows_to_fill, 'Store Kind'] = y_kind_pred

In [15]:
# Class frequencies after imputation.
df["Store Kind"].value_counts(sort=True, ascending=False)

Supermarket      16855
Deluxe           14779
Gourmet           4190
Mid-Size          1785
Small Grocery     1253
Name: Store Kind, dtype: int64

In [16]:
# Remaining null count per column.
df.isnull().sum(axis=0)

Promotion Name               5828
Store Kind                      0
Store Sales                     0
Store Cost                      0
Is Recyclable?                  0
Store Area                      0
Grocery Area                    0
Frozen Area                     0
Meat Area                       0
Cost                            0
Marriage                        0
Gender                          0
Children                        0
Degree                          0
Work                            0
Store Code                      0
Country ISO2                    0
Order Brand                     2
Order                           2
Department                      2
Gross Weight                    0
Net Weight                      0
Package Weight                  0
Min. Person Yearly Income       0
Amenities Score                 0
dtype: int64

In [41]:
# Persist the imputed frame (row index not written).
df.to_csv(path_or_buf="data/half_test.csv", index=False)

In [None]:


# Impute missing "Promotion Name" and "Store Kind" labels with one Random Forest per target.
#
# For each target the model is trained only on rows whose label is known and is asked
# to predict only the rows whose label is missing, so the predictions always line up
# with the rows they are written back to.


def _fit_and_impute(features, target, missing_mask, random_state=42):
    """Train a Random Forest on the labelled rows and predict the unlabelled ones.

    Parameters
    ----------
    features : pd.DataFrame
        Encoded feature matrix, row-aligned with ``target``.
    target : pd.Series
        Label column; rows flagged in ``missing_mask`` are not used for training.
    missing_mask : pd.Series of bool
        True where the label is unknown and has to be predicted.
    random_state : int
        Seed used for both the hold-out split and the forest.

    Returns
    -------
    tuple
        ``(classifier, validation_accuracy, predictions)``. ``predictions`` holds one
        label per True entry of ``missing_mask`` and is empty when nothing is missing.
    """
    known_mask = ~missing_mask
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features[known_mask], target[known_mask], test_size=0.2, random_state=random_state
    )
    classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
    classifier.fit(X_fit, y_fit)
    validation_accuracy = classifier.score(X_holdout, y_holdout)

    # predict() rejects an empty feature matrix, so skip it when nothing needs imputing.
    if missing_mask.any():
        predictions = classifier.predict(features[missing_mask])
    else:
        predictions = target[missing_mask].to_numpy()
    return classifier, validation_accuracy, predictions


# Split the data into features (X) and target variables (y).
# Missing targets may appear as NaN or as the placeholder written by earlier cells.
MISSING_PLACEHOLDERS = ['missing', 'Missing']
X = df.drop(['Promotion Name', 'Store Kind'], axis=1)
y_promotion = df['Promotion Name'].fillna('Missing')
y_store_kind = df['Store Kind'].fillna('Missing')
promotion_missing_mask = y_promotion.isin(MISSING_PLACEHOLDERS)
store_kind_missing_mask = y_store_kind.isin(MISSING_PLACEHOLDERS)

# Encode categorical features using Label Encoding (one encoder per column, fitted on
# all rows so training and prediction rows share the same codes).
label_encoders = {}
categorical_columns = ['Marriage', 'Gender', 'Degree', 'Work', 'Store Code', 'Country ISO2', 'Order Brand', 'Order', 'Department']
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

# Train one classifier per target and predict the missing labels.
promotion_classifier, promotion_val_accuracy, y_promotion_pred = _fit_and_impute(
    X, y_promotion, promotion_missing_mask
)
store_kind_classifier, store_kind_val_accuracy, y_store_kind_pred = _fit_and_impute(
    X, y_store_kind, store_kind_missing_mask
)

# Fill missing values in the original dataframe (only where something is missing).
if promotion_missing_mask.any():
    df.loc[promotion_missing_mask, 'Promotion Name'] = y_promotion_pred
if store_kind_missing_mask.any():
    df.loc[store_kind_missing_mask, 'Store Kind'] = y_store_kind_pred

# Save the filled dataframe to a new CSV file.
df.to_csv('filled_dataset.csv', index=False)
