In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
from glob import glob

In [2]:
files = glob("data/Train_Batch_*.csv")
files

['data\\Train_Batch_1.csv',
 'data\\Train_Batch_2.csv',
 'data\\Train_Batch_3.csv']

In [3]:
dfs = [pd.read_csv(file) for file in files]
print(dfs[0].shape)
print(dfs[1].shape)
print(dfs[2].shape)

(19440, 17)
(12956, 18)
(6466, 19)


There is a problem we can't concat our data because files have different columns we need to pre-process every single file alone then concat them!

In [4]:
df1 = dfs[0].copy()
df2 = dfs[1].copy()
df3 = dfs[2].copy()

In [5]:
print(df1.shape)
print(df1.info())
df1.head(2)

(19440, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19440 entries, 0 to 19439
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     19440 non-null  object 
 1   Person Description             19440 non-null  object 
 2   Place Code                     19440 non-null  object 
 3   Customer Order                 19440 non-null  object 
 4   Additional Features in market  15140 non-null  object 
 5   Promotion Name                 16530 non-null  object 
 6   Store Kind                     16530 non-null  object 
 7   Store Sales                    19440 non-null  object 
 8   Store Cost                     19440 non-null  object 
 9   Product Weights Data in (KG)   19440 non-null  object 
 10  Is Recyclable?                 19440 non-null  object 
 11  Min. Yearly Income             19437 non-null  object 
 12  Store Area                     174

Unnamed: 0.1,Unnamed: 0,Person Description,Place Code,Customer Order,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,Frozen Area,Meat Area,Cost
0,mc_ID_0,"Single Female with four children, education: b...",H11go_ZA,"Cleaning Supplies from Household department, O...","['Video Store', 'Florist', 'Ready Food', 'Coff...",Dimes Off,Deluxe,8.76 Millions,4.2924 Millions,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,2037.64,481.98,323.0,602.7575
1,mc_ID_1,"Single Female with three children, education: ...",S04ne_WA,"Snack Foods from Snack Foods department, Order...",,Budget Bargains,Supermarket,6.36 Millions,1.9716 Millions,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,2049.72,457.36,,708.665


change id table name and set index to id

In [6]:
def set_index(df):
    df = (
        df
        .rename(columns={"Unnamed: 0": "id"})
        .set_index("id")
    )
    return df

In [7]:
df1 = set_index(df1)
df1.head(2)

Unnamed: 0_level_0,Person Description,Place Code,Customer Order,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,Frozen Area,Meat Area,Cost
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
mc_ID_0,"Single Female with four children, education: b...",H11go_ZA,"Cleaning Supplies from Household department, O...","['Video Store', 'Florist', 'Ready Food', 'Coff...",Dimes Off,Deluxe,8.76 Millions,4.2924 Millions,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,2037.64,481.98,323.0,602.7575
mc_ID_1,"Single Female with three children, education: ...",S04ne_WA,"Snack Foods from Snack Foods department, Order...",,Budget Bargains,Supermarket,6.36 Millions,1.9716 Millions,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,2049.72,457.36,,708.665


In [8]:
def split_person_description(df):
    
    df[["personal", "deg_work"]] = (
        df["Person Description"]
        .str
        .split(", education: ", expand=True)
    )
    
    df[["Marriage", "Gender", "with", "Children", "tc"]] = (
        df["personal"]
        .str
        .split(expand=True)
    )
    
    df[["Degree", "Work"]] = (
        df["deg_work"]
        .str
        .split("working as", expand=True)
    )
    
    df = df.drop(columns=["Person Description", "personal", "with", "tc", "deg_work"])
    return df

In [9]:
df1 = split_person_description(df1)
df1.head(2)

Unnamed: 0_level_0,Place Code,Customer Order,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,Frozen Area,Meat Area,Cost,Marriage,Gender,Children,Degree,Work
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
mc_ID_0,H11go_ZA,"Cleaning Supplies from Household department, O...","['Video Store', 'Florist', 'Ready Food', 'Coff...",Dimes Off,Deluxe,8.76 Millions,4.2924 Millions,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,2037.64,481.98,323.0,602.7575,Single,Female,four,bachelors degree,professional
mc_ID_1,S04ne_WA,"Snack Foods from Snack Foods department, Order...",,Budget Bargains,Supermarket,6.36 Millions,1.9716 Millions,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,2049.72,457.36,,708.665,Single,Female,three,bachelors degree,management


In [10]:
df1["Place Code"].value_counts()

T02ma_WA    1802
S01em_OR    1771
P07nd_OR    1668
S03le_WA    1626
H11go_ZA    1508
M10da_YU    1452
S04ne_WA    1411
B06ls_CA    1355
L05es_CA    1322
B14on_WA    1109
V13er_BC    1073
O09ba_VE     861
C15ho_ZA     769
A17co_GU     503
M12ty_DF     429
S18co_CA     253
B08am_WA     228
V19ia_BC     177
G16ra_JA     123
Name: Place Code, dtype: int64

In [11]:
def split_customer_order(df):
    df[["ord_dep", "Oreder Brand"]] = (
        df["Customer Order"]
        .str
        .split(", Ordered Brand : ", expand=True)
    )
    
    df[["Product", "Department", "blank"]] = (
        df["ord_dep"]
        .str
        .split("from | department", expand=True)
    )
        
    df = df.drop(columns=["Customer Order", "ord_dep", "blank"])
    return df

In [12]:
df1 = split_customer_order(df1)
df1.head(3)

Unnamed: 0_level_0,Place Code,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,...,Meat Area,Cost,Marriage,Gender,Children,Degree,Work,Oreder Brand,Product,Department
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,"['Video Store', 'Florist', 'Ready Food', 'Coff...",Dimes Off,Deluxe,8.76 Millions,4.2924 Millions,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,...,323.0,602.7575,Single,Female,four,bachelors degree,professional,Red Wing,Cleaning Supplies,Household
mc_ID_1,S04ne_WA,,Budget Bargains,Supermarket,6.36 Millions,1.9716 Millions,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,...,,708.665,Single,Female,three,bachelors degree,management,Nationeel,Snack Foods,Snack Foods
mc_ID_2,L05es_CA,['Florist'],Shelf Emptiers,Supermarket,10.86 Millions,4.4526 Millions,"{'Gross Weight': 28.6358, 'Net Weight': 27.182...",recyclable,30K+,2192.32,...,348.85,564.2647,Married,Male,two,high school degree,skilled manual,Excel,Magazines,Periodicals


In [13]:
def encode_market_features(df):
    unique_feat = set()
    
    for feat_list in df["Additional Features in market"] :
        if pd.notna(feat_list):
            string_data = feat_list.strip("[]")
            elements = string_data.split(', ')
            elements = [element.strip("'") for element in elements]
            unique_feat.update(elements)

    for feat in unique_feat:
        df[feat] = (
            df["Additional Features in market"]
            .apply(lambda x: 1 if pd.notna(x) and feat in x else 0)
        )
        
    df = df.drop(columns="Additional Features in market")
    return df

In [14]:
df1 = encode_market_features(df1)
df1.head(3)

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,...,Degree,Work,Oreder Brand,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8.76 Millions,4.2924 Millions,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,2037.64,...,bachelors degree,professional,Red Wing,Cleaning Supplies,Household,1,1,1,1,1
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6.36 Millions,1.9716 Millions,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,2049.72,...,bachelors degree,management,Nationeel,Snack Foods,Snack Foods,0,0,0,0,0
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10.86 Millions,4.4526 Millions,"{'Gross Weight': 28.6358, 'Net Weight': 27.182...",recyclable,30K+,2192.32,1322.21,...,high school degree,skilled manual,Excel,Magazines,Periodicals,0,0,0,0,1


In [15]:
df1["Promotion Name"].value_counts()

Save Price                  676
Weekend Discount            660
Two Day Sale                633
Price Winners               579
Super Savers                568
Save It (Sale)              541
One Day Sale                519
Super Duper Savers          508
Roller Savings (High)       497
GLD                         478
Price Slashers              476
Full Free                   463
Shelf Clearing Days         459
Sale : Double Down          457
Lottery Cash Registerion    418
Two for One                 407
Big Time Discounts          400
Go For It                   394
Money Savers                379
Price Destroyers            369
Budget Bargains             360
Saving Days                 351
Discount Frenzy             332
Best Price Savers           327
Price Cutters               323
Dimes Off                   301
Savings Galore              298
Bag Stuffer                 287
Promo Big                   285
You Save Days               283
Sales Days                  279
I Cant B

In [16]:
df1["Store Kind"].value_counts()

Supermarket      7209
Deluxe           6220
Gourmet          1821
Mid-Size          761
Small Grocery     519
Name: Store Kind, dtype: int64

In [17]:
def transform_cost_sales(df):
    df["Store Sales"] = df["Store Sales"].str.split(expand=True)[0].astype(float) * 1e6
    df["Store Cost"] = df["Store Cost"].str.split(expand=True)[0].astype(float) * 1e6
    return df

In [18]:
df1 = transform_cost_sales(df1)
df1.head(3)

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Product Weights Data in (KG),Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,...,Degree,Work,Oreder Brand,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,"{'Gross Weight': 28.1997, 'Net Weight': 26.600...",recyclable,10K+,2842.23,2037.64,...,bachelors degree,professional,Red Wing,Cleaning Supplies,Household,1,1,1,1,1
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,"{'Gross Weight': 16.571, 'Net Weight': 14.972,...",non recyclable,50K+,2814.95,2049.72,...,bachelors degree,management,Nationeel,Snack Foods,Snack Foods,0,0,0,0,0
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,"{'Gross Weight': 28.6358, 'Net Weight': 27.182...",recyclable,30K+,2192.32,1322.21,...,high school degree,skilled manual,Excel,Magazines,Periodicals,0,0,0,0,1


In [19]:
def extract_product_weights(df):
    if "Product Weights Data in (KG)" in df.columns:
        df[["b1", "Gross Weight", "Net Weight", "Package Weight", "b2"]] = (
            df["Product Weights Data in (KG)"]
            .str
            .split("{'Gross Weight': |, 'Net Weight': |, 'Package Weight': |}", expand=True)
        )
        df = df.drop(columns=["b1", "b2", "Product Weights Data in (KG)"])
    elif "Weights Data" in df.columns:
        df[["b1", "Gross Weight", "Net Weight", "Package Weight", "b2"]] = (
            df["Weights Data"]
            .str
            .split("{'Gross Weight': |, 'Net Weight': |, 'Package Weight': |}", expand=True)
        )
        df = df.drop(columns=["b1", "b2", "Weights Data"])
    return df

In [20]:
df1 = extract_product_weights(df1)
df1.head(3)

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,Frozen Area,...,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,recyclable,10K+,2842.23,2037.64,481.98,...,Cleaning Supplies,Household,1,1,1,1,1,28.1997,26.6008,1.599
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,non recyclable,50K+,2814.95,2049.72,457.36,...,Snack Foods,Snack Foods,0,0,0,0,0,16.571,14.972,1.599
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,recyclable,30K+,2192.32,1322.21,523.32,...,Magazines,Periodicals,0,0,0,0,1,28.6358,27.1822,1.4536


In [21]:
def transform_recyclable(df):
    mapping = {'recyclable': 'yes', 'non recyclable': 'no'}
    df["Is Recyclable?"] = df["Is Recyclable?"].map(mapping)
    return df

In [22]:
df1 = transform_recyclable(df1)
df1.head(3)

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Min. Yearly Income,Store Area,Grocery Area,Frozen Area,...,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,10K+,2842.23,2037.64,481.98,...,Cleaning Supplies,Household,1,1,1,1,1,28.1997,26.6008,1.599
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,50K+,2814.95,2049.72,457.36,...,Snack Foods,Snack Foods,0,0,0,0,0,16.571,14.972,1.599
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,30K+,2192.32,1322.21,523.32,...,Magazines,Periodicals,0,0,0,0,1,28.6358,27.1822,1.4536


In [23]:
def transform_income(df):
    if "Min. Yearly Income" in df.columns:
        df["Min. Person Yearly Income"] = (
            df["Min. Yearly Income"]
            .str
            .split("K+", expand=True)[0]
            .astype(float) * 1000
        )
        df = df.drop(columns="Min. Yearly Income")
    elif "Min. Person Yearly Income" in df.columns:
        df["Min. Person Yearly Income"] = (
            df["Min. Person Yearly Income"]
            .str
            .split("K+", expand=True)[0]
            .astype(float) * 1000
        )
    elif "Yearly Income" in df.columns:
        df["Min. Person Yearly Income"] = (
            df["Yearly Income"]
            .str
            .split("K+", expand=True)[0]
            .astype(float) * 1000
        )
        df = df.drop(columns="Yearly Income")
    return df

In [24]:
df1 = transform_income(df1)
df1.head()

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,...,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,323.0,...,Household,1,1,1,1,1,28.1997,26.6008,1.599,10000.0
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,,...,Snack Foods,0,0,0,0,0,16.571,14.972,1.599,50000.0
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,348.85,...,Periodicals,0,0,0,0,1,28.6358,27.1822,1.4536,30000.0
mc_ID_3,S03le_WA,Savings Galore,,1980000.0,673200.0,yes,1974.73,,440.92,293.95,...,Dairy,0,1,0,0,0,14.2161,11.2944,2.9217,30000.0
mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,395.95,...,Produce,1,1,1,1,1,12.6172,9.71,2.9072,50000.0


In [25]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19440 entries, mc_ID_0 to mc_ID_19354
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Place Code                 19440 non-null  object 
 1   Promotion Name             16530 non-null  object 
 2   Store Kind                 16530 non-null  object 
 3   Store Sales                19440 non-null  float64
 4   Store Cost                 19440 non-null  float64
 5   Is Recyclable?             19440 non-null  object 
 6   Store Area                 17483 non-null  object 
 7   Grocery Area               17460 non-null  object 
 8   Frozen Area                17506 non-null  float64
 9   Meat Area                  17492 non-null  object 
 10  Cost                       19416 non-null  float64
 11  Marriage                   19440 non-null  object 
 12  Gender                     19440 non-null  object 
 13  Children                   19440 non-nu

In [26]:
def transform_columns_type(df):
    df["Store Area"] = (
        df["Store Area"]
        .replace('missing', float('nan'))
        .astype(float)
    )
    
    df["Grocery Area"] = (
        df["Grocery Area"]
        .str
        .strip('"')
        .replace('missing', float('nan'))
        .astype(float)
    )
    
    df["Meat Area"] = (
        df["Meat Area"]
        .str
        .strip('"')
        .astype(float)
    )
    
    trans = ["Gross Weight", "Net Weight", "Package Weight"]
    
    df[trans] = df[trans].astype(float)
    
    return df

In [27]:
df1 = transform_columns_type(df1)
df1.head()

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,...,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,323.0,...,Household,1,1,1,1,1,28.1997,26.6008,1.599,10000.0
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,,...,Snack Foods,0,0,0,0,0,16.571,14.972,1.599,50000.0
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,348.85,...,Periodicals,0,0,0,0,1,28.6358,27.1822,1.4536,30000.0
mc_ID_3,S03le_WA,Savings Galore,,1980000.0,673200.0,yes,1974.73,,440.92,293.95,...,Dairy,0,1,0,0,0,14.2161,11.2944,2.9217,30000.0
mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,395.95,...,Produce,1,1,1,1,1,12.6172,9.71,2.9072,50000.0


In [28]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19440 entries, mc_ID_0 to mc_ID_19354
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Place Code                 19440 non-null  object 
 1   Promotion Name             16530 non-null  object 
 2   Store Kind                 16530 non-null  object 
 3   Store Sales                19440 non-null  float64
 4   Store Cost                 19440 non-null  float64
 5   Is Recyclable?             19440 non-null  object 
 6   Store Area                 17482 non-null  float64
 7   Grocery Area               17459 non-null  float64
 8   Frozen Area                17506 non-null  float64
 9   Meat Area                  17492 non-null  float64
 10  Cost                       19416 non-null  float64
 11  Marriage                   19440 non-null  object 
 12  Gender                     19440 non-null  object 
 13  Children                   19440 non-nu

In [29]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Person Description,Place Code,Customer Order,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Gross Weight,Net Weight,Is Recyclable?,Min. Person Yearly Income,Store Area,Grocery Area,Frozen Area,Meat Area,Cost
0,mc_ID_0,"Single Male with two children, education: high...",S01em_OR,"Baking Goods from Baking Goods department, Ord...","['Video Store', 'Florist', 'Ready Food', 'Bar ...",Tip Top Savings,Deluxe,3.28 M,0.984 M,21.9493,19.0421,yes,30K+,2577.5,1734.5,503.07,335.38,287.127
1,mc_ID_1,"Married Male with four children, education: hi...",T02ma_WA,"Canned Sardines from Canned Foods department, ...","['Florist', 'Bar For Salad', 'Ready Food', 'Co...",Discount Frenzy,Deluxe,9.35 M,4.2075 M,19.1874,16.1349,yes,30K+,3145.51,2057.74,654.13,,779.6884
2,mc_ID_2,"Married Female with three children, education:...",S04ne_WA,"Frozen Desserts from Frozen Foods department, ...",[],Tip Top Savings,Supermarket,8.4 M,3.192 M,16.7163,12.3555,no,30K+,2811.99,2049.72,457.36,304.91,557.8318
3,mc_ID_3,"Single Male with four children, education: par...",V13er_BC,"Frozen Desserts from Frozen Foods department, ...","['Coffee Bar', 'Ready Food', 'Bar For Salad', ...",Price Cutters,,4.6 M,1.518 M,24.1296,,yes,10K+,2147.17,,373.1,248.79,642.0871
4,mc_ID_4,"Married Female with four children, education: ...",B14on_WA,"Meat from Deli department, Ordered Brand : Moms","['Ready Food', 'Bar For Salad']",Money Savers,Supermarket,6.81 M,3.2007 M,17.8792,16.4256,no,50K+,3685.3,2265.9,853.22,568.75,432.0503


In [30]:
def calculate_package_weight(df):
    df["Package Weight"] = df["Gross Weight"] - df["Net Weight"]
    return df

In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12956 entries, 0 to 12955
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     12956 non-null  object 
 1   Person Description             12956 non-null  object 
 2   Place Code                     12956 non-null  object 
 3   Customer Order                 12954 non-null  object 
 4   Additional Features in market  12956 non-null  object 
 5   Promotion Name                 11665 non-null  object 
 6   Store Kind                     11683 non-null  object 
 7   Store Sales                    12955 non-null  object 
 8   Store Cost                     12956 non-null  object 
 9   Gross Weight                   11014 non-null  float64
 10  Net Weight                     11018 non-null  float64
 11  Is Recyclable?                 12956 non-null  object 
 12  Min. Person Yearly Income      12952 non-null 

In [32]:
df2 = set_index(df2)
df2 = split_person_description(df2)
df2 = split_customer_order(df2)
df2 = encode_market_features(df2)
df2 = transform_cost_sales(df2)
df2 = transform_income(df2)
df2 = calculate_package_weight(df2)
#df2 = transform_columns_type(df2)
df2["Frozen Area"] = df2["Frozen Area"][(df2["Frozen Area"] != '""')].str.rstrip('.').astype(float)
df2 = df2.drop(columns="")
df2.head()

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Gross Weight,Net Weight,Is Recyclable?,Min. Person Yearly Income,Store Area,...,Work,Oreder Brand,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Package Weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,S01em_OR,Tip Top Savings,Deluxe,3280000.0,984000.0,21.9493,19.0421,yes,30000.0,2577.5,...,manual,Landslide,Baking Goods,Baking Goods,1,1,1,1,1,2.9072
mc_ID_1,T02ma_WA,Discount Frenzy,Deluxe,9350000.0,4207500.0,19.1874,16.1349,yes,30000.0,3145.51,...,manual,Pleasant,Canned Sardines,Canned Foods,1,1,1,0,1,3.0525
mc_ID_2,S04ne_WA,Tip Top Savings,Supermarket,8400000.0,3192000.0,16.7163,12.3555,no,30000.0,2811.99,...,professional,Carrington,Frozen Desserts,Frozen Foods,0,0,0,0,0,4.3608
mc_ID_3,V13er_BC,Price Cutters,,4600000.0,1518000.0,24.1296,,yes,10000.0,2147.17,...,manual,PigTail,Frozen Desserts,Frozen Foods,1,1,1,1,1,
mc_ID_4,B14on_WA,Money Savers,Supermarket,6810000.0,3200700.0,17.8792,16.4256,no,50000.0,3685.3,...,skilled manual,Moms,Meat,Deli,1,0,1,0,0,1.4536


In [33]:
df3.head()

Unnamed: 0.1,Unnamed: 0,Person%20Description,Place%20Code,Customer%20Order,Additional%20Features%20in%20market,Promotion%20Name,Store%20Kind,Store%20Sales,Store%20Cost,Gross%20Weight,Net%20Weight,Weights%20Data,Is%20Recyclable?,Yearly%20Income,Store%20Area,Grocery%20Area,Frozen%20Area,Meat%20Area,Cost
0,mc_ID_0,"Single Female with two children, education: pa...",T02ma_WA,"Meat from Deli department, Ordered Brand : Red...","['Coffee Bar', 'Florist', 'Ready Food', 'Bar F...",Sale : Double Down,Deluxe,7.12 Millions,2.5632 Millions,23.2575,20.3503,"{'Gross Weight': 23.2575, 'Net Weight': 20.350...",yes,90K+,3145.51,2056.79,654.13,436.09,500.7202
1,mc_ID_1,"Single Female with five children, education: p...",M10da_YU,"Specialty from Produce department, Ordered Bra...","['Coffee Bar', 'Florist', 'Bar For Salad', 'Vi...",GLD,Deluxe,14.72 Millions,7.0656 Millions,16.7163,12.3555,"{'Gross Weight': 16.7163, 'Net Weight': 12.355...",yes,30K+,2856.68,1871.16,595.93,395.51,484.1411
2,mc_ID_2,"Married Male with three children, education: h...",T02ma_WA,"Paper Products from Household department, Orde...","['Coffee Bar', 'Bar For Salad', 'Ready Food', ...",Promo Big,Deluxe,4.98 Millions,2.4402 Millions,10.5531,7.6459,"{'Gross Weight': 10.5531, 'Net Weight': 7.6459...",no,30K+,3140.99,2055.29,654.13,436.09,267.3576
3,mc_ID_3,"Married Female with five children, education: ...",H11go_ZA,"Snack Foods from Snack Foods department, Order...","['Video Store', 'Coffee Bar', 'Ready Food', 'F...",Price Destroyers,,5.24 Millions,2.0436 Millions,23.6936,,"{'Gross Weight': 23.6936, 'Net Weight': 20.641...",no,10K+,2841.35,2038.11,481.98,322.22,777.2826
4,mc_ID_4,"Married Female with No children, education: pa...",O09ba_VE,"Canned Soup from Canned Foods department, Orde...","['Ready Food', 'Bar For Salad']",Save Price,Supermarket,8.84 Millions,3.4476 Millions,13.6056,10.6839,"{'Gross Weight': 13.6056, 'Net Weight': 10.683...",no,110K+,3236.33,2448.37,472.27,313.87,406.4756


In [34]:
df3.columns = df3.columns.str.replace('%20', ' ')
df3.head()

Unnamed: 0.1,Unnamed: 0,Person Description,Place Code,Customer Order,Additional Features in market,Promotion Name,Store Kind,Store Sales,Store Cost,Gross Weight,Net Weight,Weights Data,Is Recyclable?,Yearly Income,Store Area,Grocery Area,Frozen Area,Meat Area,Cost
0,mc_ID_0,"Single Female with two children, education: pa...",T02ma_WA,"Meat from Deli department, Ordered Brand : Red...","['Coffee Bar', 'Florist', 'Ready Food', 'Bar F...",Sale : Double Down,Deluxe,7.12 Millions,2.5632 Millions,23.2575,20.3503,"{'Gross Weight': 23.2575, 'Net Weight': 20.350...",yes,90K+,3145.51,2056.79,654.13,436.09,500.7202
1,mc_ID_1,"Single Female with five children, education: p...",M10da_YU,"Specialty from Produce department, Ordered Bra...","['Coffee Bar', 'Florist', 'Bar For Salad', 'Vi...",GLD,Deluxe,14.72 Millions,7.0656 Millions,16.7163,12.3555,"{'Gross Weight': 16.7163, 'Net Weight': 12.355...",yes,30K+,2856.68,1871.16,595.93,395.51,484.1411
2,mc_ID_2,"Married Male with three children, education: h...",T02ma_WA,"Paper Products from Household department, Orde...","['Coffee Bar', 'Bar For Salad', 'Ready Food', ...",Promo Big,Deluxe,4.98 Millions,2.4402 Millions,10.5531,7.6459,"{'Gross Weight': 10.5531, 'Net Weight': 7.6459...",no,30K+,3140.99,2055.29,654.13,436.09,267.3576
3,mc_ID_3,"Married Female with five children, education: ...",H11go_ZA,"Snack Foods from Snack Foods department, Order...","['Video Store', 'Coffee Bar', 'Ready Food', 'F...",Price Destroyers,,5.24 Millions,2.0436 Millions,23.6936,,"{'Gross Weight': 23.6936, 'Net Weight': 20.641...",no,10K+,2841.35,2038.11,481.98,322.22,777.2826
4,mc_ID_4,"Married Female with No children, education: pa...",O09ba_VE,"Canned Soup from Canned Foods department, Orde...","['Ready Food', 'Bar For Salad']",Save Price,Supermarket,8.84 Millions,3.4476 Millions,13.6056,10.6839,"{'Gross Weight': 13.6056, 'Net Weight': 10.683...",no,110K+,3236.33,2448.37,472.27,313.87,406.4756


In [35]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6466 entries, 0 to 6465
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     6466 non-null   object 
 1   Person Description             6466 non-null   object 
 2   Place Code                     6466 non-null   object 
 3   Customer Order                 6466 non-null   object 
 4   Additional Features in market  6466 non-null   object 
 5   Promotion Name                 4839 non-null   object 
 6   Store Kind                     4840 non-null   object 
 7   Store Sales                    6452 non-null   object 
 8   Store Cost                     6459 non-null   object 
 9   Gross Weight                   5491 non-null   float64
 10  Net Weight                     5511 non-null   float64
 11  Weights Data                   6466 non-null   object 
 12  Is Recyclable?                 6464 non-null   o

In [36]:
df3 = set_index(df3)
df3 = split_person_description(df3)
df3 = split_customer_order(df3)
df3 = encode_market_features(df3)
df3 = transform_cost_sales(df3)
df3 = transform_income(df3)
df3 = calculate_package_weight(df3)
#df3 = transform_columns_type(df3)
df3 = df3.drop(columns=["", "Weights Data"])
df3.head()

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Gross Weight,Net Weight,Is Recyclable?,Store Area,Grocery Area,...,Oreder Brand,Product,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Min. Person Yearly Income,Package Weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,T02ma_WA,Sale : Double Down,Deluxe,7120000.0,2563200.0,23.2575,20.3503,yes,3145.51,2056.79,...,Red Spade,Meat,Deli,1,1,1,0,1,90000.0,2.9072
mc_ID_1,M10da_YU,GLD,Deluxe,14720000.0,7065600.0,16.7163,12.3555,yes,2856.68,1871.16,...,Tell Tale,Specialty,Produce,1,1,1,1,1,30000.0,4.3608
mc_ID_2,T02ma_WA,Promo Big,Deluxe,4980000.0,2440200.0,10.5531,7.6459,no,3140.99,2055.29,...,High Quality,Paper Products,Household,1,1,1,0,1,30000.0,2.9072
mc_ID_3,H11go_ZA,Price Destroyers,,5240000.0,2043600.0,23.6936,,no,2841.35,2038.11,...,Nationeel,Snack Foods,Snack Foods,1,1,1,1,1,10000.0,
mc_ID_4,O09ba_VE,Save Price,Supermarket,8840000.0,3447600.0,13.6056,10.6839,no,3236.33,2448.37,...,Better,Canned Soup,Canned Foods,1,0,1,0,0,110000.0,2.9217


In [37]:
all_df = pd.concat([df1, df2, df3])
all_df.head()

Unnamed: 0_level_0,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,...,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,323.0,...,Household,1,1,1,1,1,28.1997,26.6008,1.599,10000.0
mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,,...,Snack Foods,0,0,0,0,0,16.571,14.972,1.599,50000.0
mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,348.85,...,Periodicals,0,0,0,0,1,28.6358,27.1822,1.4536,30000.0
mc_ID_3,S03le_WA,Savings Galore,,1980000.0,673200.0,yes,1974.73,,440.92,293.95,...,Dairy,0,1,0,0,0,14.2161,11.2944,2.9217,30000.0
mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,395.95,...,Produce,1,1,1,1,1,12.6172,9.71,2.9072,50000.0


In [38]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38862 entries, mc_ID_0 to mc_ID_6465
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Place Code                 38862 non-null  object 
 1   Promotion Name             33034 non-null  object 
 2   Store Kind                 33053 non-null  object 
 3   Store Sales                38847 non-null  float64
 4   Store Cost                 38855 non-null  float64
 5   Is Recyclable?             38860 non-null  object 
 6   Store Area                 35296 non-null  float64
 7   Grocery Area               35255 non-null  float64
 8   Frozen Area                35309 non-null  float64
 9   Meat Area                  35313 non-null  float64
 10  Cost                       38831 non-null  float64
 11  Marriage                   38862 non-null  object 
 12  Gender                     38862 non-null  object 
 13  Children                   38862 non-nul

In [39]:
def fill_nulls(df):
    number_cols = all_df.select_dtypes([float, int]).columns
    cat_cols = all_df.select_dtypes("object").columns
    
    df[number_cols] = (
        df[number_cols]
        .fillna(df[number_cols].mean())
    )
    """
    for col in cat_cols:
        df[col] = (
            df[col]
            .fillna(df[col].mode()[0])
        )
"""
    return df

In [40]:
all_df = fill_nulls(all_df)

In [41]:
all_df.isna().sum()

Place Code                      0
Promotion Name               5828
Store Kind                   5809
Store Sales                     0
Store Cost                      0
Is Recyclable?                  2
Store Area                      0
Grocery Area                    0
Frozen Area                     0
Meat Area                       0
Cost                            0
Marriage                        0
Gender                          0
Children                        0
Degree                          0
Work                            0
Oreder Brand                    2
Product                         2
Department                      2
Bar For Salad                   0
Coffee Bar                      0
Ready Food                      0
Video Store                     0
Florist                         0
Gross Weight                    0
Net Weight                      0
Package Weight                  0
Min. Person Yearly Income       0
dtype: int64

In [42]:
all_df.dropna(inplace=True)

In [39]:
# all_df.dropna(inplace=True)

In [43]:
# all_df.to_csv("data/half_cleaned.csv")

In [43]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28225 entries, mc_ID_0 to mc_ID_6465
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Place Code                 28225 non-null  object 
 1   Promotion Name             28225 non-null  object 
 2   Store Kind                 28225 non-null  object 
 3   Store Sales                28225 non-null  float64
 4   Store Cost                 28225 non-null  float64
 5   Is Recyclable?             28225 non-null  object 
 6   Store Area                 28225 non-null  float64
 7   Grocery Area               28225 non-null  float64
 8   Frozen Area                28225 non-null  float64
 9   Meat Area                  28225 non-null  float64
 10  Cost                       28225 non-null  float64
 11  Marriage                   28225 non-null  object 
 12  Gender                     28225 non-null  object 
 13  Children                   28225 non-nul

In [44]:
all_df.to_csv("data/cleaned_data.csv")

هبد من هنا لبكرا هنقله في نوتبوك لوحده