# **Feature Engineering and Model Evaluation**

## Step 1: Load the Dataset

In [None]:

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Location of the cleaned dataset read by this notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists — consider a path relative to the
# notebook or a configurable data directory.
DATA_PATH = 'C:/Users/jckat/Downloads/Portfolio/Feature Enginerring/clean_data_after_eda.csv'

# Read the CSV into the working frame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()


## Step 2: Remove Irrelevant Columns

In [None]:

# Columns treated as irrelevant for modelling; they are removed before any
# feature engineering takes place.
irrelevant_columns = ['id', 'date_end', 'date_modif_prod', 'date_renewal']

# Rebind `df` to the reduced frame rather than mutating it in place.
df = df.drop(columns=irrelevant_columns)
df.head()


## Step 3: Expand the Dataset with New Features

In [None]:

# Parse the activation date a single time and reuse the result; converting
# the same column inside each assignment would repeat the string -> datetime
# parsing three times for no benefit.
activation_date = pd.to_datetime(df['date_activ'])

# Add the calendar components as new columns (appended in this order) and
# drop the raw date column, which is no longer needed once it has been split.
# `df` is rebound to the new frame instead of being mutated in place.
df = df.assign(
    year_activ=activation_date.dt.year,
    month_activ=activation_date.dt.month,
    day_activ=activation_date.dt.day,
).drop(columns=['date_activ'])
df.head()


## Step 4: Combine Columns to Create Better Features

In [None]:

# Total consumption: element-wise sum of the three consumption columns.
# A missing value in any of the three propagates to the total.
df['total_consumption'] = (
    df['cons_12m']
    .add(df['cons_gas_12m'])
    .add(df['cons_last_month'])
)

# Placeholder scaling factors used to fabricate December / January prices.
# NOTE(review): both prices are plain rescalings of 'var_6m_price_off_peak',
# so 'price_diff_dec_jan' is a constant multiple of that column rather than
# an observed price difference — replace with real monthly prices if available.
DEC_PRICE_FACTOR = 1.02
JAN_PRICE_FACTOR = 0.98

off_peak_variation = df['var_6m_price_off_peak']
df['price_off_peak_dec'] = off_peak_variation * DEC_PRICE_FACTOR
df['price_off_peak_jan'] = off_peak_variation * JAN_PRICE_FACTOR

# The new feature: December price minus January price.
df['price_diff_dec_jan'] = df['price_off_peak_dec'] - df['price_off_peak_jan']

# Preview the three derived price columns.
price_feature_cols = ['price_off_peak_dec', 'price_off_peak_jan', 'price_diff_dec_jan']
df[price_feature_cols].head()


## Step 5: Prepare the Final Dataset

In [None]:

# Separate the label ('churn') from the predictor columns.
target = df['churn']
features = df.drop(columns=['churn'])

# Predictors stored with object dtype are the ones treated as categorical.
categorical_cols = features.select_dtypes(include=['object']).columns

# Categorical handling: fill missing entries with the literal 'missing',
# then one-hot encode (handle_unknown='ignore' so categories not seen during
# fit do not raise at transform time).
fill_missing = SimpleImputer(strategy='constant', fill_value='missing')
one_hot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('imputer', fill_missing), ('onehot', one_hot)])

# Only the categorical columns are transformed; every other column is passed
# through unchanged (no imputation or scaling is applied to them).
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_cols)],
    remainder='passthrough',
)

# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)


## Step 6: Train and Evaluate the Model Without the New Feature

In [None]:

# Baseline feature set: everything except the engineered 'price_diff_dec_jan'.
# NOTE(review): 'price_off_peak_dec' and 'price_off_peak_jan', from which the
# new feature is derived, remain in this feature set — confirm that is the
# intended baseline.
X_train_without_new = X_train.drop(columns=['price_diff_dec_jan'])
X_test_without_new = X_test.drop(columns=['price_diff_dec_jan'])

# Define the model. The pipeline below fits a clone, so this object itself is
# not fitted by this cell.
model = RandomForestClassifier(random_state=42)

# Pipeline.fit fits the step objects it was given (it does not copy them).
# Cloning gives this pipeline its own unfitted preprocessor and classifier
# with identical parameters, so fitting another pipeline built from the same
# `preprocessor` / `model` objects cannot overwrite this one's fitted state.
pipeline_without_new = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', clone(model))
])

# Train without the new feature and score on the held-out rows.
pipeline_without_new.fit(X_train_without_new, y_train)
predictions_without_new = pipeline_without_new.predict(X_test_without_new)
accuracy_without_new = accuracy_score(y_test, predictions_without_new)

# ROC AUC measures how well the scores rank the positive class, so it needs
# continuous scores; hard 0/1 predictions collapse the curve to a single
# operating point. Column 1 of predict_proba is the probability of the larger
# class label (presumably churn == 1 — confirm against the target encoding).
churn_scores_without_new = pipeline_without_new.predict_proba(X_test_without_new)[:, 1]
roc_auc_without_new = roc_auc_score(y_test, churn_scores_without_new)

accuracy_without_new, roc_auc_without_new


## Step 7: Train and Evaluate the Model With the New Feature

In [None]:

# Pipeline.fit fits the step objects it was given (it does not copy them).
# Cloning `preprocessor` and `model` gives this pipeline fresh, unfitted
# estimators with identical parameters, so fitting here does not overwrite
# the fitted state of any other pipeline built from the same objects.
pipeline_with_new = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', clone(model))
])

# Train on the full feature set (including 'price_diff_dec_jan') and score on
# the held-out rows.
pipeline_with_new.fit(X_train, y_train)
predictions_with_new = pipeline_with_new.predict(X_test)
accuracy_with_new = accuracy_score(y_test, predictions_with_new)

# ROC AUC measures how well the scores rank the positive class, so it needs
# continuous scores; hard 0/1 predictions collapse the curve to a single
# operating point. Column 1 of predict_proba is the probability of the larger
# class label (presumably churn == 1 — confirm against the target encoding).
churn_scores_with_new = pipeline_with_new.predict_proba(X_test)[:, 1]
roc_auc_with_new = roc_auc_score(y_test, churn_scores_with_new)

accuracy_with_new, roc_auc_with_new


## Step 8: Compare the Performance of Both Models

In [None]:

# Report both metrics for each model, one line per metric.
print(
    f'Accuracy without new feature: {accuracy_without_new}',
    f'ROC AUC without new feature: {roc_auc_without_new}',
    f'Accuracy with new feature: {accuracy_with_new}',
    f'ROC AUC with new feature: {roc_auc_with_new}',
    sep='\n',
)
