In [None]:



# Helper function to print the top-N features of a fitted model
def print_top_features(model, feature_names, top_n=30):
    """Print the highest-ranked features of a fitted model, best first.

    Models exposing ``coef_`` (e.g. logistic regression) are ranked by the
    absolute value of the coefficients in ``coef_[0]``; the signed coefficient
    is printed. Only the first row of ``coef_`` is used, so if the model stores
    one row per class the ranking reflects that first row only.

    Models exposing ``feature_importances_`` (e.g. decision trees) are ranked
    by importance. Models with neither attribute print nothing.

    Args:
        model: Fitted estimator with ``coef_`` or ``feature_importances_``.
        feature_names: Sequence of names indexable by feature position.
        top_n: Maximum number of features to print.
    """
    if hasattr(model, 'coef_'):
        # For Logistic Regression: rank by magnitude, report the signed value
        scores = model.coef_[0]
        ranking_key = np.abs(scores)
    elif hasattr(model, 'feature_importances_'):
        # For Decision Trees: importances are ranked as-is
        scores = model.feature_importances_
        ranking_key = scores
    else:
        return

    # Reverse first, then truncate: unlike [-top_n:], this yields an empty
    # selection (rather than every feature) when top_n is 0.
    top_features = np.argsort(ranking_key)[::-1][:top_n]

    # Report the number actually printed instead of a hard-coded "30".
    print(f"Top {len(top_features)} Features:")
    for i in top_features:
        print(f"{feature_names[i]}: {scores[i]}")


# Fit a model, score it on the held-out split, and report its top features
def train_and_evaluate(model, X_train, X_test, y_train, y_test, feature_names):
    """Fit ``model`` on the training split and print its evaluation.

    Prints, in order: the accuracy on the test split, a classification
    report for the test split, and the model's top features (via
    ``print_top_features`` with its default ``top_n``).

    Args:
        model: Estimator providing ``fit`` and ``predict``; fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        feature_names: Names passed through to ``print_top_features``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, predictions))

    print("Classification Report:")
    print(classification_report(y_test, predictions))

    print_top_features(model, feature_names)


# Datasets and Models
datasets = ["mnist_784", "Spambase", "20newsgroups"]
# NOTE(review): mnist_784 and 20newsgroups are multi-class targets, while the
# liblinear solver is a binary solver at heart. Recent scikit-learn releases
# reportedly deprecate liblinear for multi-class problems — confirm against the
# installed version before relying on this configuration.
models = [
    ("Logistic Regression", LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier())
]

# Run classification for each dataset and model
for dataset_name in datasets:
    print(f"\n### Dataset: {dataset_name} ###")
    if dataset_name == "mnist_784":
        data = fetch_openml("mnist_784", version=1)
        X, y = data.data, data.target.astype(int)
        # Synthetic zero-based names, one per input column
        feature_names = [f"pixel_{i}" for i in range(X.shape[1])]
    elif dataset_name == "Spambase":
        data = fetch_openml("spambase", version=1)
        X, y = data.data, data.target.astype(int)
        feature_names = data.feature_names
    elif dataset_name == "20newsgroups":
        from sklearn.datasets import fetch_20newsgroups
        newsgroups = fetch_20newsgroups(subset='all')
        # Raw text is turned into TF-IDF features capped at 5000 terms
        vectorizer = TfidfVectorizer(max_features=5000)
        X = vectorizer.fit_transform(newsgroups.data)
        y = newsgroups.target
        feature_names = vectorizer.get_feature_names_out()
    else:
        # Fail loudly instead of reusing X/y left over from a previous
        # iteration (or hitting a NameError on the first one).
        raise ValueError(f"Unknown dataset: {dataset_name!r}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    for model_name, model in models:
        print(f"\nRunning {model_name} on {dataset_name}...")
        train_and_evaluate(model, X_train, X_test, y_train, y_test, feature_names)

        # Extra for Decision Tree on 20NG: Different Tree Sizes
        if dataset_name == "20newsgroups" and model_name == "Decision Tree":
            for max_depth in [5, 10]:
                print(f"\nDecision Tree with max_depth={max_depth} on {dataset_name}...")
                # Separate name so the outer loop variable `model` is not rebound
                depth_limited_tree = DecisionTreeClassifier(max_depth=max_depth)
                train_and_evaluate(depth_limited_tree, X_train, X_test, y_train, y_test, feature_names)



### Dataset: mnist_784 ###

Running Logistic Regression on mnist_784...
