In [1]:
from cleaning import data_new, X_train, X_test, y_train, y_test,X,y,np,pd

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Define your training data (X_train) and target variable (y_train)
# Define your testing data (X_test) and target variable (y_test)
# Make sure these variables are defined before running this code

# Define the preprocessor
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define the Random Forest Classifier model
rf_model = RandomForestClassifier()

# Define the feature selection using the model
select_model = SelectFromModel(estimator=rf_model)

# Create a pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', select_model),
    ('model', rf_model)
])

# Check the format of the target variable
if not pd.api.types.is_categorical_dtype(y_train):
    # If the target variable is not categorical, convert it to categorical
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

# Train the model pipeline
model_pipeline.fit(X_train, y_train)

# Predict on test data
y_pred = model_pipeline.predict(X_test)

# Evaluate model performance (example using accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of model:", accuracy)


Accuracy of model: 0.7479338842975206


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Assuming X_train, y_train, X_test, and y_test are defined

# Define your feature selection model
random_classifier = RandomForestClassifier(random_state=42)
select_model = SelectFromModel(estimator=random_classifier)

# Define the preprocessor
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create a pipeline
# Remove the feature selection step
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', random_classifier)
])


# Train the model pipeline
model_pipeline.fit(X_train, y_train)

# Predict on test data
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of model:", accuracy)

Accuracy of model: 0.743801652892562
