# Cleaning Example - Cleaner

In [1]:
# Uncomment to install the pinned dependency into this kernel's environment:
# %pip install feature_engineering==2.1.3

In [2]:
from feature_engineering import Cleaner

In [3]:
# Sample Data
import pandas as pd
import numpy as np

# Toy frame for the Cleaner demos: one numeric and one categorical column,
# each containing two missing entries
data = pd.DataFrame(
    dict(
        NumericalFeature=[1, 2, np.nan, 4, 5, np.nan],
        CategoricalFeature=['Red', 'Blue', 'Red', np.nan, 'Green', np.nan],
    )
)

print("Original Dataset:")
data


Original Dataset:


Unnamed: 0,NumericalFeature,CategoricalFeature
0,1.0,Red
1,2.0,Blue
2,,Red
3,4.0,
4,5.0,Green
5,,


In [4]:
# Fill the missing entries of 'NumericalFeature' with 0 (the categorical column
# is left as-is, see the output below). A copy of `data` is passed so the
# original frame stays available, unmodified by this call, for the next cells.
cleaned_data = Cleaner(data.copy(), 'NumericalFeature', method=['numerical', 'zero'])
print("\nNumerical Feature After Filling Missing Values with 0:")
cleaned_data



Numerical Feature After Filling Missing Values with 0:


Unnamed: 0,NumericalFeature,CategoricalFeature
0,1.0,Red
1,2.0,Blue
2,0.0,Red
3,4.0,
4,5.0,Green
5,0.0,


In [5]:
# Replace the missing entries of 'CategoricalFeature' with the label 'Missing'
# (the numeric column keeps its NaNs, see the output below).
# Note: this rebinds `cleaned_data`, so the result of the previous cell is no
# longer accessible under that name.
cleaned_data = Cleaner(data.copy(), 'CategoricalFeature', method=['categorical', 'missing'])
print("\nCategorical Feature After Filling Missing Values with 'Missing':")
cleaned_data



Categorical Feature After Filling Missing Values with 'Missing':


Unnamed: 0,NumericalFeature,CategoricalFeature
0,1.0,Red
1,2.0,Blue
2,,Red
3,4.0,Missing
4,5.0,Green
5,,Missing


In [6]:
# One-hot encode 'CategoricalFeature'. In the output below the original column
# is replaced by one boolean column per category, and missing values get their
# own 'CategoricalFeature_nan' indicator column.
encoded_data = Cleaner(data.copy(), 'CategoricalFeature', method=['categorical', 'encode'])
print("\nCategorical Feature After One-Hot Encoding:")
encoded_data



Categorical Feature After One-Hot Encoding:


Unnamed: 0,NumericalFeature,CategoricalFeature_Blue,CategoricalFeature_Green,CategoricalFeature_Red,CategoricalFeature_nan
0,1.0,False,False,True,False
1,2.0,True,False,False,False
2,,False,False,True,False
3,4.0,False,False,False,True
4,5.0,False,True,False,False
5,,False,False,False,True


# Feature Engineering Example - FeatureEngineering

In [7]:
from feature_engineering import FeatureEngineering

In [8]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random call below is reproducible
np.random.seed(42)

# Create a DataFrame with synthetic customer data.
# NOTE: the generated values depend on the order of the np.random calls,
# so keep the columns in this order.
df = pd.DataFrame({
    'CustomerID': range(1, 101),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    # Stored as float from the start: NaN is written into this column below,
    # and assigning NaN into an integer column relies on implicit upcasting
    # that pandas has deprecated.
    'Age': np.random.randint(18, 70, size=100).astype('float64'),
    'Tenure': np.random.randint(1, 60, size=100),
    'ServiceTier': np.random.choice(['Basic', 'Standard', 'Premium'], 100),
    'MonthlyCharges': np.random.uniform(29.99, 120.99, size=100),
})

# Compute TotalCharges before any value is blanked out, so it stays populated
# even for the rows whose MonthlyCharges is set to NaN below
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Manually introduce some missing values (every 10th Age, every 15th MonthlyCharges)
df.loc[::10, 'Age'] = np.nan
df.loc[::15, 'MonthlyCharges'] = np.nan

# Add the binary 'Churn' column (random 0/1 labels)
df['Churn'] = np.random.choice([0, 1], 100)

# Add a 'SignUpDate' column: 100 evenly spaced timestamps across 2020,
# sampled with replacement
start_date = pd.to_datetime('2020-01-01')
end_date = pd.to_datetime('2020-12-31')
date_range = pd.date_range(start_date, end_date, periods=100)
df['SignUpDate'] = np.random.choice(date_range, size=100)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,CustomerID,Gender,Age,Tenure,ServiceTier,MonthlyCharges,TotalCharges,Churn,SignUpDate
0,1,Male,,7,Basic,,792.698617,0,2020-10-25 15:16:21.818181820
1,2,Female,43.0,59,Basic,63.674442,3756.792062,1,2020-12-16 06:03:38.181818184
2,3,Male,61.0,51,Premium,31.396552,1601.224157,1,2020-03-25 19:09:05.454545455
3,4,Male,51.0,22,Standard,114.466989,2518.273762,0,2020-03-22 02:40:00.000000000
4,5,Male,27.0,28,Basic,68.954757,1930.73321,0,2020-08-12 21:34:32.727272728


In [9]:
# Assuming the FeatureEngineering class and the df DataFrame are already defined

# Define the columns based on the dataset structure
numeric_columns = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']  # Numeric features in the dataset
categorical_columns = ['Gender', 'ServiceTier']  # Categorical features in the dataset
date_columns = ['SignUpDate']  # Date features in the dataset
target = 'Churn'  # The target variable we want to predict

# Feature Engineering parameters
handle_outliers=None # Strategy for dealing with outliers in the dataset; None means no strategy is specified here
imputation_strategy = 'IterativeImputer'  # Impute missing numeric values with the 'IterativeImputer' strategy (not a plain mean); presumably scikit-learn's IterativeImputer - confirm in the library docs
task = 'classification'  # The task is a classification problem (predicting churn)
n_features_to_select = None  # Not specifying a limit on the number of features to select
threshold_correlation = 0.8  # Set a threshold for removing highly correlated features
filter_method = True  # Enable filtering based on variance and correlation
selection_strategy = 'embedded'  # Use an embedded method for feature selection
sampling_strategies = None  # No specific sampling strategy for handling class imbalance
include_holiday = None  # Not incorporating holiday information into the date processing
custom_transformers = {}  # No custom transformers in this example
hyperparameter_tuning = None  # No hyperparameter tuning in this example
evaluation_metric = 'accuracy'  # Use accuracy as the evaluation metric for model performance
verbose = True  # Enable verbose output to track progress
min_class_size = None  # Not specifying a minimum class size for cross-validation splits
remainder = 'passthrough'  # Include columns not explicitly selected in transformations

# Instantiate the FeatureEngineering class with the specified arguments
feature_engineering = FeatureEngineering(
    df=df,
    target=target,
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
    date_columns=date_columns,
    handle_outliers=handle_outliers,
    imputation_strategy=imputation_strategy,
    task=task,
    n_features_to_select=n_features_to_select,
    threshold_correlation=threshold_correlation,
    filter_method=filter_method,
    selection_strategy=selection_strategy,
    sampling_strategies=sampling_strategies,
    include_holiday=include_holiday,
    custom_transformers=custom_transformers,
    hyperparameter_tuning=hyperparameter_tuning,
    evaluation_metric=evaluation_metric,
    verbose=verbose,
    min_class_size=min_class_size,
    remainder=remainder
)

# Running the feature engineering process; run() returns the transformed
# frame and the index of selected feature names
df_selected, selected_features = feature_engineering.run()


FloatProgress(value=0.0, description='Initializing...', style=ProgressStyle(bar_color='#00AEFF', description_w…

In [10]:
df_selected

Unnamed: 0,num__TotalCharges,num__SignUpDate_weekofyear,num__MonthlyCharges,num__SignUpDate_day,num__Age,remainder__CustomerID,Churn
0,-1.005319,1.070998,0.248503,0.871873,0.018839,1.0,0
1,0.782159,1.586521,-0.476860,-0.164700,-0.074927,2.0,1
2,-0.517743,-0.862211,-1.720016,0.871873,1.270133,3.0,1
3,0.035278,-0.926652,1.479373,0.526349,0.522877,4.0,0
4,-0.319035,0.426595,-0.273493,-0.625399,-1.270537,5.0,0
...,...,...,...,...,...,...,...
95,-1.143152,0.813237,0.691984,0.987048,1.195408,96.0,0
96,-0.513008,1.393200,-0.112309,0.756698,0.149249,97.0,0
97,-1.225356,-1.055532,-1.431300,-1.201273,0.672328,98.0,0
98,0.679705,-0.346689,-0.051178,0.295999,-1.046360,99.0,1


In [11]:
selected_features

Index(['num__TotalCharges', 'num__SignUpDate_weekofyear',
       'num__MonthlyCharges', 'num__SignUpDate_day', 'num__Age',
       'remainder__CustomerID'],
      dtype='object')

In [12]:
## Scenario: SquareTransformer (defined in a cell below) used as a custom transformer for 'Age', with 'median' as the imputation strategy

In [13]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random call below is reproducible
np.random.seed(42)

# Re-create the synthetic customer DataFrame so this scenario starts from
# fresh, unmodified input.
# NOTE: the generated values depend on the order of the np.random calls,
# so keep the columns in this order.
df = pd.DataFrame({
    'CustomerID': range(1, 101),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    # Stored as float from the start: NaN is written into this column below,
    # and assigning NaN into an integer column relies on implicit upcasting
    # that pandas has deprecated.
    'Age': np.random.randint(18, 70, size=100).astype('float64'),
    'Tenure': np.random.randint(1, 60, size=100),
    'ServiceTier': np.random.choice(['Basic', 'Standard', 'Premium'], 100),
    'MonthlyCharges': np.random.uniform(29.99, 120.99, size=100),
})

# Compute TotalCharges before any value is blanked out, so it stays populated
# even for the rows whose MonthlyCharges is set to NaN below
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Manually introduce some missing values (every 10th Age, every 15th MonthlyCharges)
df.loc[::10, 'Age'] = np.nan
df.loc[::15, 'MonthlyCharges'] = np.nan

# Add the binary 'Churn' column (random 0/1 labels)
df['Churn'] = np.random.choice([0, 1], 100)

# Add a 'SignUpDate' column: 100 evenly spaced timestamps across 2020,
# sampled with replacement
start_date = pd.to_datetime('2020-01-01')
end_date = pd.to_datetime('2020-12-31')
date_range = pd.date_range(start_date, end_date, periods=100)
df['SignUpDate'] = np.random.choice(date_range, size=100)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,CustomerID,Gender,Age,Tenure,ServiceTier,MonthlyCharges,TotalCharges,Churn,SignUpDate
0,1,Male,,7,Basic,,792.698617,0,2020-10-25 15:16:21.818181820
1,2,Female,43.0,59,Basic,63.674442,3756.792062,1,2020-12-16 06:03:38.181818184
2,3,Male,61.0,51,Premium,31.396552,1601.224157,1,2020-03-25 19:09:05.454545455
3,4,Male,51.0,22,Standard,114.466989,2518.273762,0,2020-03-22 02:40:00.000000000
4,5,Male,27.0,28,Basic,68.954757,1930.73321,0,2020-08-12 21:34:32.727272728


In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class SquareTransformer(BaseEstimator, TransformerMixin):
    """Stateless custom transformer that returns its input raised to the power 2.

    Used in the cells below as a `custom_transformers` entry for the 'Age'
    column.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself.

        X and y are accepted but never read.
        """
        return self

    def transform(self, X):
        """Return ``X ** 2``.

        For NumPy arrays and pandas objects this squares element-wise; NaN
        entries stay NaN.
        """
        return X ** 2


In [15]:
# Assuming the FeatureEngineering class, the SquareTransformer class and the df DataFrame are already defined

# Define the columns based on the dataset structure
numeric_columns = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']  # Numeric features in the dataset
categorical_columns = ['Gender', 'ServiceTier']  # Categorical features in the dataset
date_columns = ['SignUpDate']  # Date features in the dataset
target = 'Churn'  # The target variable we want to predict

# Feature Engineering parameters
handle_outliers=None # Strategy for dealing with outliers in the dataset; None means no strategy is specified here
imputation_strategy = 'median'  # Use the median value to impute missing values in numeric columns
task = 'classification'  # The task is a classification problem (predicting churn)
n_features_to_select = None  # Not specifying a limit on the number of features to select
threshold_correlation = 0.8  # Set a threshold for removing highly correlated features
filter_method = True  # Enable filtering based on variance and correlation
selection_strategy = 'embedded'  # Use an embedded method for feature selection
sampling_strategies = None  # No specific sampling strategy for handling class imbalance
include_holiday = None  # Not incorporating holiday information into the date processing
custom_transformers = {'Age': SquareTransformer()}  # Map the 'Age' column to the custom SquareTransformer
hyperparameter_tuning = None  # No hyperparameter tuning in this example
evaluation_metric = 'accuracy'  # Use accuracy as the evaluation metric for model performance
verbose = True  # Enable verbose output to track progress
min_class_size = None  # Not specifying a minimum class size for cross-validation splits
remainder = 'passthrough'  # Include columns not explicitly selected in transformations

# Instantiate the FeatureEngineering class with the specified arguments
feature_engineering = FeatureEngineering(
    df=df,
    target=target,
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
    date_columns=date_columns,
    handle_outliers=handle_outliers,
    imputation_strategy=imputation_strategy,
    task=task,
    n_features_to_select=n_features_to_select,
    threshold_correlation=threshold_correlation,
    filter_method=filter_method,
    selection_strategy=selection_strategy,
    sampling_strategies=sampling_strategies,
    include_holiday=include_holiday,
    custom_transformers=custom_transformers,
    hyperparameter_tuning=hyperparameter_tuning,
    evaluation_metric=evaluation_metric,
    verbose=verbose,
    min_class_size=min_class_size,
    remainder=remainder
)

# Running the feature engineering process; run() returns the transformed
# frame and the index of selected feature names.
# Note: this rebinds df_selected / selected_features from the previous example.
df_selected, selected_features = feature_engineering.run()


FloatProgress(value=0.0, description='Initializing...', style=ProgressStyle(bar_color='#00AEFF', description_w…

In [16]:
df_selected

Unnamed: 0,TotalCharges__TotalCharges,SignUpDate_weekofyear__SignUpDate_weekofyear,MonthlyCharges__MonthlyCharges,SignUpDate_day__SignUpDate_day,Age__Age,remainder__CustomerID,Churn
0,-1.005319,1.070998,0.036029,0.871873,-0.152486,1.0,0
1,0.782159,1.586521,-0.495814,-0.164700,-0.226567,2.0,1
2,-0.517743,-0.862211,-1.768977,0.871873,1.367438,3.0,1
3,0.035278,-0.926652,1.507637,0.526349,0.413760,4.0,0
4,-0.319035,0.426595,-0.287538,-0.625399,-1.180245,5.0,0
...,...,...,...,...,...,...,...
95,-1.143152,0.813237,0.701243,0.987048,1.264407,96.0,0
96,-0.513008,1.393200,-0.122464,0.756698,0.000783,97.0,0
97,-1.225356,-1.055532,-1.473292,-1.201273,0.590872,98.0,0
98,0.679705,-0.346689,-0.059857,0.295999,-1.034639,99.0,1


In [17]:
selected_features

Index(['TotalCharges__TotalCharges',
       'SignUpDate_weekofyear__SignUpDate_weekofyear',
       'MonthlyCharges__MonthlyCharges', 'SignUpDate_day__SignUpDate_day',
       'Age__Age', 'remainder__CustomerID'],
      dtype='object')

In [18]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random call below is reproducible
np.random.seed(42)

# Re-create the synthetic customer DataFrame so this scenario starts from
# fresh, unmodified input.
# NOTE: the generated values depend on the order of the np.random calls,
# so keep the columns in this order.
df = pd.DataFrame({
    'CustomerID': range(1, 101),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    # Stored as float from the start: NaN is written into this column below,
    # and assigning NaN into an integer column relies on implicit upcasting
    # that pandas has deprecated.
    'Age': np.random.randint(18, 70, size=100).astype('float64'),
    'Tenure': np.random.randint(1, 60, size=100),
    'ServiceTier': np.random.choice(['Basic', 'Standard', 'Premium'], 100),
    'MonthlyCharges': np.random.uniform(29.99, 120.99, size=100),
})

# Compute TotalCharges before any value is blanked out, so it stays populated
# even for the rows whose MonthlyCharges is set to NaN below
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Manually introduce some missing values (every 10th Age, every 15th MonthlyCharges)
df.loc[::10, 'Age'] = np.nan
df.loc[::15, 'MonthlyCharges'] = np.nan

# Add the binary 'Churn' column (random 0/1 labels)
df['Churn'] = np.random.choice([0, 1], 100)

# Add a 'SignUpDate' column: 100 evenly spaced timestamps across 2020,
# sampled with replacement
start_date = pd.to_datetime('2020-01-01')
end_date = pd.to_datetime('2020-12-31')
date_range = pd.date_range(start_date, end_date, periods=100)
df['SignUpDate'] = np.random.choice(date_range, size=100)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,CustomerID,Gender,Age,Tenure,ServiceTier,MonthlyCharges,TotalCharges,Churn,SignUpDate
0,1,Male,,7,Basic,,792.698617,0,2020-10-25 15:16:21.818181820
1,2,Female,43.0,59,Basic,63.674442,3756.792062,1,2020-12-16 06:03:38.181818184
2,3,Male,61.0,51,Premium,31.396552,1601.224157,1,2020-03-25 19:09:05.454545455
3,4,Male,51.0,22,Standard,114.466989,2518.273762,0,2020-03-22 02:40:00.000000000
4,5,Male,27.0,28,Basic,68.954757,1930.73321,0,2020-08-12 21:34:32.727272728


In [19]:
## Scenario: SMOTE used as the sampling strategy, with 'mean' as the imputation strategy

In [20]:
# Assuming the FeatureEngineering class, the SquareTransformer class and the df DataFrame are already defined

# Define the columns based on the dataset structure
numeric_columns = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']  # Numeric features in the dataset
categorical_columns = ['Gender', 'ServiceTier']  # Categorical features in the dataset
date_columns = ['SignUpDate']  # Date features in the dataset
target = 'Churn'  # The target variable we want to predict

# Feature Engineering parameters
handle_outliers=None # Strategy for dealing with outliers in the dataset; None means no strategy is specified here
imputation_strategy = 'mean'  # Use mean value to impute missing values in numeric columns
task = 'classification'  # The task is a classification problem (predicting churn)
n_features_to_select = None  # Not specifying a limit on the number of features to select
threshold_correlation = 0.8  # Set a threshold for removing highly correlated features
filter_method = True  # Enable filtering based on variance and correlation
selection_strategy = 'embedded'  # Use an embedded method for feature selection
sampling_strategies = [
    {'name': 'SMOTE', 'random_state': 42, 'k_neighbors': 2}  # Example specifying custom parameters for SMOTE
]                                                            # SMOTE is the only sampling strategy configured for handling class imbalance
include_holiday = None  # Not incorporating holiday information into the date processing
custom_transformers = {'Age': SquareTransformer()}  # Map the 'Age' column to the custom SquareTransformer
hyperparameter_tuning = None  # No hyperparameter tuning in this example
evaluation_metric = 'accuracy'  # Use accuracy as the evaluation metric for model performance
verbose = True  # Enable verbose output to track progress
min_class_size = None  # Not specifying a minimum class size for cross-validation splits
remainder = 'passthrough'  # Include columns not explicitly selected in transformations

# Instantiate the FeatureEngineering class with the specified arguments
feature_engineering = FeatureEngineering(
    df=df,
    target=target,
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
    date_columns=date_columns,
    handle_outliers=handle_outliers,
    imputation_strategy=imputation_strategy,
    task=task,
    n_features_to_select=n_features_to_select,
    threshold_correlation=threshold_correlation,
    filter_method=filter_method,
    selection_strategy=selection_strategy,
    sampling_strategies=sampling_strategies,
    include_holiday=include_holiday,
    custom_transformers=custom_transformers,
    hyperparameter_tuning=hyperparameter_tuning,
    evaluation_metric=evaluation_metric,
    verbose=verbose,
    min_class_size=min_class_size,
    remainder=remainder
)

# Running the feature engineering process; run() returns the transformed
# frame and the index of selected feature names.
# NOTE(review): in the displayed result the rows beyond index 99 have NaN in
# 'Churn' - looks like the resampled rows are not given a target value;
# confirm against the library before relying on df_selected for training.
df_selected, selected_features = feature_engineering.run()


FloatProgress(value=0.0, description='Initializing...', style=ProgressStyle(bar_color='#00AEFF', description_w…

In [21]:
df_selected

Unnamed: 0,TotalCharges__TotalCharges,SignUpDate_weekofyear__SignUpDate_weekofyear,MonthlyCharges__MonthlyCharges,Age__Age,Churn
0,-1.005319,1.070998,0.000000,0.000000,0.0
1,0.782159,1.586521,-0.493126,-0.243825,1.0
2,-0.517743,-0.862211,-1.766351,1.352243,1.0
3,0.035278,-0.926652,1.510423,0.397331,0.0
4,-0.319035,0.426595,-0.284840,-1.198737,0.0
...,...,...,...,...,...
99,-0.778264,0.619916,-0.122278,0.226811,1.0
100,0.392520,1.032853,0.166183,-0.155344,
101,-0.611322,1.302715,0.008762,0.181224,
102,-0.692828,0.086615,-1.107194,-0.049340,


In [22]:
selected_features

Index(['TotalCharges__TotalCharges',
       'SignUpDate_weekofyear__SignUpDate_weekofyear',
       'MonthlyCharges__MonthlyCharges', 'Age__Age'],
      dtype='object')

In [23]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so every np.random call below is reproducible
np.random.seed(42)

# Re-create the synthetic customer DataFrame so this scenario starts from
# fresh, unmodified input.
# NOTE: the generated values depend on the order of the np.random calls,
# so keep the columns in this order.
df = pd.DataFrame({
    'CustomerID': range(1, 101),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    # Stored as float from the start: NaN is written into this column below,
    # and assigning NaN into an integer column relies on implicit upcasting
    # that pandas has deprecated.
    'Age': np.random.randint(18, 70, size=100).astype('float64'),
    'Tenure': np.random.randint(1, 60, size=100),
    'ServiceTier': np.random.choice(['Basic', 'Standard', 'Premium'], 100),
    'MonthlyCharges': np.random.uniform(29.99, 120.99, size=100),
})

# Compute TotalCharges before any value is blanked out, so it stays populated
# even for the rows whose MonthlyCharges is set to NaN below
df['TotalCharges'] = df['Tenure'] * df['MonthlyCharges']

# Manually introduce some missing values (every 10th Age, every 15th MonthlyCharges)
df.loc[::10, 'Age'] = np.nan
df.loc[::15, 'MonthlyCharges'] = np.nan

# Add the binary 'Churn' column (random 0/1 labels)
df['Churn'] = np.random.choice([0, 1], 100)

# Add a 'SignUpDate' column: 100 evenly spaced timestamps across 2020,
# sampled with replacement
start_date = pd.to_datetime('2020-01-01')
end_date = pd.to_datetime('2020-12-31')
date_range = pd.date_range(start_date, end_date, periods=100)
df['SignUpDate'] = np.random.choice(date_range, size=100)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,CustomerID,Gender,Age,Tenure,ServiceTier,MonthlyCharges,TotalCharges,Churn,SignUpDate
0,1,Male,,7,Basic,,792.698617,0,2020-10-25 15:16:21.818181820
1,2,Female,43.0,59,Basic,63.674442,3756.792062,1,2020-12-16 06:03:38.181818184
2,3,Male,61.0,51,Premium,31.396552,1601.224157,1,2020-03-25 19:09:05.454545455
3,4,Male,51.0,22,Standard,114.466989,2518.273762,0,2020-03-22 02:40:00.000000000
4,5,Male,27.0,28,Basic,68.954757,1930.73321,0,2020-08-12 21:34:32.727272728


In [24]:
## Scenario: IQR-based outlier handling (handle_outliers=['IQR', 'remove']), task set to 'regression', and remainder set to 'drop'

In [25]:
# Assuming the FeatureEngineering class, the SquareTransformer class and the df DataFrame are already defined

# Define the columns based on the dataset structure
numeric_columns = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']  # Numeric features in the dataset
categorical_columns = ['Gender', 'ServiceTier']  # Categorical features in the dataset
date_columns = ['SignUpDate']  # Date features in the dataset
target = 'Churn'  # The target variable we want to predict

# Feature Engineering parameters
handle_outliers=['IQR','remove'] # Outlier strategy; presumably IQR-based detection with removal of outliers - confirm the exact semantics in the library docs
imputation_strategy = 'mean'  # Use mean value to impute missing values in numeric columns
# NOTE(review): 'Churn' is a binary 0/1 column, while 'accuracy' and SMOTE
# below are classification-oriented; confirm this combination with
# task='regression' is intended and supported by the library.
task = 'regression'  # Treat the task as a regression problem in this example
n_features_to_select = None  # Not specifying a limit on the number of features to select
threshold_correlation = 0.8  # Set a threshold for removing highly correlated features
filter_method = True  # Enable filtering based on variance and correlation
selection_strategy = 'embedded'  # Use an embedded method for feature selection
sampling_strategies = [
    {'name': 'SMOTE', 'random_state': 42, 'k_neighbors': 2}  # Example specifying custom parameters for SMOTE
]                                                            # SMOTE is the only sampling strategy configured here
include_holiday = None  # Not incorporating holiday information into the date processing
custom_transformers = {'Age': SquareTransformer()}  # Map the 'Age' column to the custom SquareTransformer
hyperparameter_tuning = None  # No hyperparameter tuning in this example
evaluation_metric = 'accuracy'  # Use accuracy as the evaluation metric for model performance
verbose = True  # Enable verbose output to track progress
min_class_size = None  # Not specifying a minimum class size for cross-validation splits
remainder = 'drop'  # Drop columns not explicitly selected in transformations (e.g. CustomerID no longer appears in the output)

# Instantiate the FeatureEngineering class with the specified arguments
feature_engineering = FeatureEngineering(
    df=df,
    target=target,
    numeric_columns=numeric_columns,
    categorical_columns=categorical_columns,
    date_columns=date_columns,
    handle_outliers=handle_outliers,
    imputation_strategy=imputation_strategy,
    task=task,
    n_features_to_select=n_features_to_select,
    threshold_correlation=threshold_correlation,
    filter_method=filter_method,
    selection_strategy=selection_strategy,
    sampling_strategies=sampling_strategies,
    include_holiday=include_holiday,
    custom_transformers=custom_transformers,
    hyperparameter_tuning=hyperparameter_tuning,
    evaluation_metric=evaluation_metric,
    verbose=verbose,
    min_class_size=min_class_size,
    remainder=remainder
)

# Running the feature engineering process; run() returns the transformed
# frame and the index of selected feature names.
# NOTE(review): in the displayed result the rows beyond index 99 have NaN in
# 'Churn' and a fractional value in a one-hot column (0.156019) - looks like
# synthetic resampled rows; confirm this is the intended output.
df_selected, selected_features = feature_engineering.run()


FloatProgress(value=0.0, description='Initializing...', style=ProgressStyle(bar_color='#00AEFF', description_w…

In [26]:
df_selected

Unnamed: 0,SignUpDate_dayofweek__SignUpDate_dayofweek,SignUpDate_weekofyear__SignUpDate_weekofyear,MonthlyCharges__MonthlyCharges,Age__Age,Gender__Gender_Female,ServiceTier__ServiceTier_Premium,Churn
0,1.388982,1.070998,0.000000,0.000000,0.0,0.000000,0.0
1,-0.616768,1.586521,-0.493126,-0.243825,1.0,0.000000,1.0
2,-0.616768,-0.862211,-1.766351,1.352243,0.0,1.000000,1.0
3,1.388982,-0.926652,1.510423,0.397331,0.0,0.000000,0.0
4,-0.616768,0.426595,-0.284840,-1.198737,0.0,0.000000,0.0
...,...,...,...,...,...,...,...
99,-1.619643,0.619916,-0.122278,0.226811,0.0,0.000000,1.0
100,0.085917,0.839964,0.571121,-0.754562,1.0,0.000000,
101,-0.961738,1.443469,-0.061454,0.034900,1.0,0.156019,
102,0.965766,0.307767,-1.332439,-0.071153,1.0,0.000000,


In [27]:
selected_features

Index(['SignUpDate_dayofweek__SignUpDate_dayofweek',
       'SignUpDate_weekofyear__SignUpDate_weekofyear',
       'MonthlyCharges__MonthlyCharges', 'Age__Age', 'Gender__Gender_Female',
       'ServiceTier__ServiceTier_Premium'],
      dtype='object')