In [None]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [9]:
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Compare two feature-selection strategies (SelectFromModel vs. RFE), each
# wrapped around a Random Forest, on the same train/test split.
#
# NOTE(review): the original cell re-split the names `X` / `y`, which are never
# defined in this notebook, and referenced `np` without importing it, so it
# only ran thanks to leftover kernel state. This version uses the
# X_train / X_test / y_train / y_test imported from `cleaning` in the first
# cell -- confirm that split is the intended one (the original asked for
# test_size=0.2, random_state=42). Assumes X_train is a pandas DataFrame
# (it must support `select_dtypes`) -- TODO confirm.

SEED = 42             # shared random_state for every Random Forest below
N_RFE_FEATURES = 10   # Adjust as needed

# Column groups are derived from the training frame only.
categorical_cols = X_train.select_dtypes(include='object').columns
# 'number' is the pandas string alias for np.number, so numpy is not needed here.
numerical_cols = X_train.select_dtypes(include='number').columns


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer.

    Categorical columns are one-hot encoded (unknown categories ignored at
    transform time); numerical columns are standardised.
    """
    categorical_transformer = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numerical_transformer = Pipeline([
        ('scaler', StandardScaler())
    ])
    return ColumnTransformer([
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ])


def make_random_forest():
    """Return a new, unfitted RandomForestClassifier seeded with SEED."""
    return RandomForestClassifier(random_state=SEED)


def fit_and_report(title, selector):
    """Fit preprocess -> selector -> Random Forest and print test metrics.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    selector : estimator
        Unfitted feature selector placed between preprocessing and the model.

    Returns
    -------
    tuple
        (fitted pipeline, predictions on X_test, accuracy on the test split).
    """
    # Every pipeline gets its own preprocessor and model instances. Pipeline
    # fits its steps in place, so sharing one instance between two pipelines
    # would let the second fit silently overwrite the first pipeline's state.
    pipeline = Pipeline([
        ('preprocessor', make_preprocessor()),
        ('feature_selector', selector),
        ('model', make_random_forest())
    ])
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print(title)
    print("Accuracy:", score)
    print(classification_report(y_test, predictions))
    return pipeline, predictions, score


# --- Feature selection using SelectFromModel ---
# Keeps features whose importance is at least the mean importance.
select_model = SelectFromModel(make_random_forest(), threshold='mean')
model_pipeline, y_pred, accuracy = fit_and_report(
    "Random Forest Model (SelectFromModel Feature Selection)",
    select_model,
)

# --- Feature selection using RFE ---
# Recursive Feature Elimination down to N_RFE_FEATURES features.
rfe = RFE(estimator=make_random_forest(), n_features_to_select=N_RFE_FEATURES)
model_pipeline_rfe, y_pred_rfe, accuracy_rfe = fit_and_report(
    "Random Forest Model (RFE Feature Selection)",
    rfe,
)

# Aliases kept for later cells that may still use the original names. They
# point at the RFE pipeline's fitted steps, which is the state the shared
# objects were left in after the original cell finished.
preprocessor = model_pipeline_rfe.named_steps['preprocessor']
rf_model = model_pipeline_rfe.named_steps['model']


Random Forest Model (SelectFromModel Feature Selection)
Accuracy: 0.7272727272727273
              precision    recall  f1-score   support

           0       0.73      0.81      0.77       136
           2       0.72      0.62      0.67       106

    accuracy                           0.73       242
   macro avg       0.73      0.72      0.72       242
weighted avg       0.73      0.73      0.72       242

Random Forest Model (RFE Feature Selection)
Accuracy: 0.6239669421487604
              precision    recall  f1-score   support

           0       0.65      0.71      0.68       136
           2       0.58      0.51      0.54       106

    accuracy                           0.62       242
   macro avg       0.62      0.61      0.61       242
weighted avg       0.62      0.62      0.62       242

