# Feature Selection and Unique Variable Analysis

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Preprocessing & Clustering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.ensemble import RandomForestClassifier


In [12]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("online_shoppers_intention.csv")


In [13]:
# Column groupings carried over from the EDA.

# Continuous-valued columns
cont_features = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
]

# Count columns
count_features = ['Administrative', 'Informational', 'ProductRelated']

# Categorical columns that are already stored as numbers
categorical_numeric = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']

# Categorical columns that are one-hot encoded below
categorical_to_encode = ['Month', 'VisitorType']

# Boolean columns that are label encoded below
boolean_features = ['Weekend', 'Revenue']

# Encode on a copy so the raw frame `df` is left untouched.
df_processed = df.copy()

# A single encoder is refit for every boolean column, so after the loop `le`
# only remembers the classes of the last column it saw.
le = LabelEncoder()
for flag_col in boolean_features:
    encoded_flags = le.fit_transform(df_processed[flag_col])
    df_processed[flag_col] = encoded_flags

# One-hot encode the nominal columns; drop_first removes one level per column.
df_processed = pd.get_dummies(
    df_processed,
    columns=categorical_to_encode,
    drop_first=True,
)

print(f"Original shape: {df.shape}")
print(f"Processed shape after OHE: {df_processed.shape}")
df_processed.head()


# From the earlier analysis, k=3 and k=5 both looked like good choices for
# clustering, so K-Means is re-run here on the scaled continuous and count
# features and the cluster labels are attached to df_processed as new columns.
features_for_clustering = cont_features + count_features

# Standardize the clustering inputs so every feature is on the same scale.
scaler = StandardScaler()
df_scaled_cluster = scaler.fit_transform(df_processed[features_for_clustering])

# k=3
kmeans_3 = KMeans(n_clusters=3, random_state=42, n_init=10)
df_processed['c3'] = kmeans_3.fit_predict(df_scaled_cluster)

# k=5
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=10)
df_processed['c5'] = kmeans_5.fit_predict(df_scaled_cluster)

# Register the cluster labels as numeric-coded categorical features.
# Only names that are not already listed are appended, so re-running this cell
# does not grow the list with duplicate 'c3'/'c5' entries (an unconditional
# extend() would, and those duplicates would turn into duplicated columns
# wherever the list is used to select from df_processed).
new_cluster_cols = [name for name in ('c3', 'c5') if name not in categorical_numeric]
categorical_numeric.extend(new_cluster_cols)

print("Cluster 5 vs Revenue distribution:\n")
print(df_processed[['c5', 'Revenue']].groupby(['c5', 'Revenue']).size().unstack(fill_value=0))


# Correlation with Revenue and univariate ANOVA F-test for the numeric features.

# Pairwise correlations between the continuous/count features and the target.
corr_features = cont_features + count_features + ['Revenue']
corr_matrix = df_processed[corr_features].corr()

# Heatmap is written to disk and the figure is closed, so nothing is shown inline.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features and Revenue')
fig.savefig('correlation_heatmap.png')
plt.close(fig)

# Correlation of every numeric feature with Revenue, strongest positive first.
print(" Correlation on Revenue\n")
corr_revenue = corr_matrix['Revenue'].sort_values(ascending=False)
print(corr_revenue)

# Inputs for the F-test: numeric features as X, encoded Revenue as y.
y = df_processed['Revenue']
X_numerical = df_processed[cont_features + count_features]

# Standardize before scoring.
scaler_anova = StandardScaler()
X_num_scaled = scaler_anova.fit_transform(X_numerical)

# k='all' keeps every feature; the selector is used only for its scores.
fs_f_classif = SelectKBest(score_func=f_classif, k='all').fit(X_num_scaled, y)

f_scores_unsorted = pd.Series(fs_f_classif.scores_, index=X_numerical.columns)
f_scores = f_scores_unsorted.sort_values(ascending=False)
print("\n ANOVA F-test Scores for Numerical...")
print(f_scores)


# Chi-square screening of the categorical features against Revenue (`y`).

# Indicator columns created earlier by one-hot encoding Month and VisitorType.
ohe_cols = [col for col in df_processed.columns if col.startswith(('Month_', 'VisitorType_'))]

# Numeric-coded nominal features, de-duplicated (order preserved) in case the
# list was extended more than once by re-running an earlier cell.
nominal_code_cols = list(dict.fromkeys(categorical_numeric))

# Names of all categorical inputs considered here.
all_categorical_features = nominal_code_cols + ['Weekend'] + ohe_cols

# sklearn's chi2 interprets every column as a non-negative count or indicator.
# Feeding it raw integer codes (OperatingSystems, Browser, Region, TrafficType,
# cluster ids) makes the score depend on the arbitrary numbering of the levels,
# so these columns are expanded to one indicator per level first. All levels
# are kept (no drop_first) so every level receives its own score.
X_nominal = pd.get_dummies(
    df_processed[nominal_code_cols],
    columns=nominal_code_cols,
    dtype=int,
)

# Weekend and the Month_/VisitorType_ columns are already 0/1 indicators.
X_indicator = df_processed[['Weekend'] + ohe_cols].astype(int)

X_categorical = pd.concat([X_nominal, X_indicator], axis=1)

# k='all' keeps every column; the selector is used only for its scores.
fs_chi2 = SelectKBest(score_func=chi2, k='all')
fs_chi2.fit(X_categorical, y)

# One score per indicator column, e.g. 'TrafficType_2' rather than 'TrafficType'.
chi2_scores = pd.Series(fs_chi2.scores_, index=X_categorical.columns).sort_values(ascending=False)

print("\nChi Square Scores for Categorical...\n")
print(chi2_scores)


Original shape: (12330, 18)
Processed shape after OHE: (12330, 27)
Cluster 5 vs Revenue distribution:

Revenue     0    1
c5                
0        1607  678
1         408  175
2         881   58
3        6648  992
4         878    5
 Correlation on Revenue

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Administrative             0.138917
Informational              0.095200
Administrative_Duration    0.093587
Informational_Duration     0.070345
SpecialDay                -0.082305
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64

 ANOVA F-test Scores for Numerical...
PageValues                 3949.262960
ExitRates                   552.286502
ProductRelated              317.844350
ProductRelated_Duration     293.027603
BounceRates                 286.375674
Administrative              242.586667
Informational               112.751843
A

In [14]:

# Random Forest feature importances (impurity-based), used to rank features in
# a way that can also pick up interactions between them.

# Target and design matrix: every processed column except Revenue.
y = df_processed['Revenue']
X = df_processed.drop(columns='Revenue')

# Column labels, kept for labelling the importances later.
feature_names = X.columns

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Only the continuous and count columns are standardized; the scaler is fitted
# on the training rows and then applied to both splits.
features_to_scale = cont_features + count_features

scaler_rf = StandardScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler_rf.transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler_rf.transform(X_test[features_to_scale])

# Fit the forest on the training split with balanced class weights.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Importances labelled by feature name, largest first.
importances = rf.feature_importances_
importance_series = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Horizontal bar chart of every feature's importance, written to disk.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x=importance_series, y=importance_series.index, ax=ax)
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
fig.tight_layout()
fig.savefig('random_forest_importance.png')
print("\nRandom Forest importance plot saved as 'random_forest_importance.png'")
plt.close(fig)

print("Most Important Features (Random Forest)")
print(importance_series.head(15))

print("\nFeature Selection Analysis Complete")


Random Forest importance plot saved as 'random_forest_importance.png'
Most Important Features (Random Forest)
PageValues                 0.373323
ExitRates                  0.099799
ProductRelated_Duration    0.086202
ProductRelated             0.067038
BounceRates                0.053765
Administrative_Duration    0.045828
Administrative             0.033730
TrafficType                0.027453
Region                     0.025359
Month_Nov                  0.025231
c5                         0.023942
Informational_Duration     0.020871
Browser                    0.015838
OperatingSystems           0.015208
Informational              0.015161
dtype: float64

Feature Selection Analysis Complete
