In [42]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [39]:
# Read the store table and the test table from the shared data directory,
# in that order, and bind them to the names the later cells use.
dfStore, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/store.csv', '../data/test.csv')
)


In [52]:
# DataFrame.shape is a (rows, columns) pair; unpack both counts at once
num_rows, num_cols = dfStore.shape

# Report the dimensions of the store table under its own heading
print("Store Data")
print("Number of Rows:", num_rows)
print("Number of Columns:", num_cols)

Store Data
Number of Rows: 365
Number of Columns: 10


In [61]:
# Count the records (index entries) and the fields (column labels) of the test table
num_rows = len(test_df.index)
num_cols = len(test_df.columns)

# Report the dimensions of the test table under its own heading
print("Test Data")
print("Number of Rows:", num_rows)
print("Number of Columns:", num_cols)

Test Data
Number of Rows: 41088
Number of Columns: 8


StoreCSV

In [43]:
# Discard every store record that has at least one missing field
dfStore = dfStore.dropna(axis=0, how='any')

# NOTE(review): once the rows containing NaN are dropped above, the imputers
# defined below have nothing left to fill when they are applied to dfStore.
# Confirm whether dropping or imputing is the intended cleaning strategy.

# Numeric columns: missing entries are replaced by the column median
numeric_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

# Non-numeric columns: missing entries are replaced by the most frequent value
non_numeric_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent'))]
)

In [30]:
# Split the store columns by dtype: numeric ones versus everything else
numeric_cols = dfStore.select_dtypes(include='number').columns
non_numeric_cols = dfStore.select_dtypes(exclude='number').columns

# Fit each imputer on its own column group and write the result back into
# dfStore; the numeric group is processed first, then the non-numeric group.
for cols, imputer in (
    (numeric_cols, numeric_pipeline),
    (non_numeric_cols, non_numeric_pipeline),
):
    dfStore[cols] = imputer.fit_transform(dfStore[cols])

In [45]:
# Re-check the store table's dimensions after the cleaning cells above
num_rows, num_cols = dfStore.shape

print("Store Data")
print("Number of Rows:", num_rows)
print("Number of Columns:", num_cols)

Store Data
Number of Rows: 365
Number of Columns: 10


TestCSV

In [62]:
# Custom transformer to filter rows based on Store column
class FilterStore(BaseEstimator, TransformerMixin):
    """Keep only the rows whose ``Store`` value belongs to a set of known stores.

    Parameters
    ----------
    valid_stores : list-like, optional
        Store identifiers to keep. When omitted (the default), the ``Store``
        column of the notebook-level ``dfStore`` frame is looked up at
        transform time, so ``FilterStore()`` behaves as it always did.
        Passing the identifiers explicitly removes the dependency on that
        global and makes the transformer usable outside this notebook.
    """

    def __init__(self, valid_stores=None):
        # Stored under the same name as the argument so that the estimator's
        # parameter introspection (get_params / clone) can see it.
        self.valid_stores = valid_stores

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` whose ``Store`` entry is a known store."""
        valid_stores = self.valid_stores
        if valid_stores is None:
            # Backward-compatible fallback: use the store table loaded earlier
            # in the notebook.
            valid_stores = dfStore['Store']
        return X[X['Store'].isin(valid_stores)]

# Transformer that keeps only the rows carrying a valid Open flag
class FilterOpen(BaseEstimator, TransformerMixin):
    """Row filter retaining the records whose ``Open`` value is 0 or 1."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` whose ``Open`` entry is 0 or 1."""
        has_valid_flag = X['Open'].isin([0, 1])
        return X[has_valid_flag]

# Custom transformer to filter rows based on StateHoliday column
class FilterStateHoliday(BaseEstimator, TransformerMixin):
    """Keep only the rows whose ``StateHoliday`` value is a recognised code.

    Recognised codes are the letters ``'a'``, ``'b'``, ``'c'`` and the
    no-holiday marker zero, accepted both as the integer ``0`` and as the
    string ``'0'``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` whose ``StateHoliday`` entry is recognised."""
        # A CSV column that mixes letter codes with zeros is parsed as text,
        # so the no-holiday marker may arrive as the string '0' rather than
        # the integer 0. Matching only the integer would drop those rows,
        # so both spellings are accepted.
        allowed_codes = ['a', 'b', 'c', 0, '0']
        return X[X['StateHoliday'].isin(allowed_codes)]

# Transformer that discards incomplete rows
class RemoveNaN(BaseEstimator, TransformerMixin):
    """Row filter that drops every record containing a missing value."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows that have at least one NaN."""
        complete_rows = X.dropna(axis=0, how='any')
        return complete_rows

In [63]:
# Assemble the test-data cleaning pipeline; the steps run in the order listed
pipeline = Pipeline(
    steps=[
        ('filter_store', FilterStore()),                # filter on the Store column
        ('filter_open', FilterOpen()),                  # filter on the Open column
        ('filter_state_holiday', FilterStateHoliday()), # filter on the StateHoliday column
        ('remove_nan', RemoveNaN()),                    # drop rows that still contain NaN
    ]
)

In [64]:
# Apply the pipeline to the test_data
# fit_transform runs the steps of `pipeline` on test_df in their listed order.
# The result is bound to a new name, so test_df is not reassigned here.
filtered_data = pipeline.fit_transform(test_df)