In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [4]:
# Read the store table from its CSV file (path is relative to the notebook) into a DataFrame
dfStore = pd.read_csv(filepath_or_buffer='../data/store.csv')

In [5]:
# Unpack the frame's dimensions: .shape is (row count, column count)
num_rows, num_cols = dfStore.shape

# Report both dimensions
print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_cols}")

Number of Rows: 1115
Number of Columns: 10


In [6]:
# Preview the first five rows of the freshly loaded table
dfStore.head(n=5)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [7]:
# Define the cleaning pipeline: an ordered list of (step name, function) pairs.
# Each function takes a DataFrame and returns a filtered DataFrame.
pipeline = [
    # how='all' removes only rows in which every column is missing.
    # how='any' would also discard every row containing a single NaN, e.g.
    # all stores with Promo2 == 0, whose Promo2Since*/PromoInterval fields
    # are empty by construction.
    ('drop_empty_rows', lambda df: df.dropna(how='all')),
    # Keep only the first row seen for each Store id
    ('drop_duplicate_store', lambda df: df.drop_duplicates(subset='Store')),
    # Keep only rows whose StoreType is one of a, b, c, d
    ('remove_invalid_storetype', lambda df: df[df['StoreType'].isin(['a', 'b', 'c', 'd'])]),
    # Keep only rows whose Assortment is one of a, b, c
    ('remove_invalid_assortment', lambda df: df[df['Assortment'].isin(['a', 'b', 'c'])]),
]

In [9]:
# Run the cleaning steps in list order; each step receives the frame
# produced by the previous one and its result replaces dfStore
for step, func in pipeline:
    dfStore = dfStore.pipe(func)


In [10]:
# Check whether the Promo2 column holds any value other than 0 or 1
promo2_values = dfStore['Promo2'].unique()
if any(value not in [0, 1] for value in promo2_values):
    # Build the validity mask once and reuse it for reporting and filtering
    valid_promo2 = dfStore['Promo2'].isin([0, 1])

    # Show the different values and their counts. dropna=False keeps missing
    # values in the report, because those rows are dropped below as well.
    different_values = dfStore.loc[~valid_promo2, 'Promo2']
    different_counts = different_values.value_counts(dropna=False)
    print("Different Promo2 values and their counts:")
    print(different_counts)

    # Drop rows with different Promo2 values
    dfStore = dfStore.loc[valid_promo2]


In [13]:
# Print the dimensions of the cleaned DataFrame.
# Both counts are re-read from dfStore here: the cleaning steps remove rows,
# so a row count taken right after loading would be stale.
print("Updated DataFrame:")
num_rows, num_cols = dfStore.shape
print("Number of Rows:", num_rows)
print("Number of Columns:", num_cols)


Updated DataFrame:
Number of Rows: 1115
Number of Columns: 10
