In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
df = pd.read_csv("dataset_MLproject.csv")  # Change path if needed

# Step 1: Preprocess the data
# "Pathway text" is the prediction target; every other column is used as a feature.
X = df.drop(columns="Pathway text")
y = df["Pathway text"]

# Encode labels as integer class ids (le.classes_ keeps the original values)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Report how many classes the target has before any model is fitted.
# NOTE(review): the MemoryError recorded in this cell's output is for a
# (42730, 42730) array, and 42730 is the size of an 80% split of the 53413 rows
# reported in the later cell's output. That looks like one class per training
# row -- confirm with the counts printed here; if so, the target has to be
# grouped into far fewer classes before these classifiers can be useful.
n_samples, n_classes = len(y_encoded), len(le.classes_)
print(f"Samples: {n_samples}, distinct classes: {n_classes}")
if 2 * n_classes > n_samples:
    print("WARNING: fewer than two samples per class on average; "
          "expect very slow, memory-hungry and uninformative classifiers.")

# Train-test split
# Row positions are split first so the scaler below can be fitted on the
# training rows only; fitting it on every row would leak test-set statistics
# into training.
row_ids = np.arange(n_samples)
train_rows, test_rows = train_test_split(row_ids, test_size=0.2, random_state=42)

# Normalize features (statistics learned from the training rows, applied to all rows)
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

X_train, X_test = X_scaled[train_rows], X_scaled[test_rows]
y_train, y_test = y_encoded[train_rows], y_encoded[test_rows]

# Step 2: Define models
# Maps a display name to an unfitted classifier. The tree-based models get a
# fixed seed (the same 42 used for the train-test split) so that their metrics
# are repeatable from one run of the notebook to the next.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Step 3: Train and evaluate models
# Each model is fitted on the training split and scored on the test split.
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix, drawn on an explicit Axes so the figure can be closed afterwards
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{name} - Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    # Keep the axis labels inside the figure, as the later cell's copy of this loop does.
    fig.tight_layout()
    plt.show()
    # One figure is created per model; close it so figures do not pile up in memory.
    plt.close(fig)

# Step 4: Hyperparameter tuning (example on Random Forest)
# 3-fold grid search over the number of trees and the maximum tree depth.
print("\n=== Hyperparameter Tuning: Random Forest ===")
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20]
}
# The forest is seeded so that the selected parameters and the reported score
# do not change from one run of the notebook to the next.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, verbose=1)
grid_rf.fit(X_train, y_train)
print("Best Params (RF):", grid_rf.best_params_)
print("Best Score (RF):", grid_rf.best_score_)

# Step 5: Hyperparameter tuning (example on SVM)
# 3-fold grid search over C and the kernel type; prints the best combination
# found and its mean cross-validated score.
print("\n=== Hyperparameter Tuning: SVM ===")
param_grid_svm = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])
grid_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svm,
    cv=3,
    verbose=1,
).fit(X_train, y_train)
print("Best Params (SVM):", grid_svm.best_params_)
print("Best Score (SVM):", grid_svm.best_score_)



=== Logistic Regression ===


MemoryError: Unable to allocate 13.6 GiB for an array with shape (42730, 42730) and data type float64

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
df = pd.read_csv("dataset_MLproject.csv")  # Update path if needed

# Step 1: Preprocess the data
# "Pathway text" is the prediction target; every other column is used as a feature.
X = df.drop(columns="Pathway text")
y = df["Pathway text"]

# Encode target labels as integer class ids (le.classes_ keeps the original values)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Report how many classes the target has before any model is fitted.
# NOTE(review): the MemoryError recorded for the previous cell is for a
# (42730, 42730) array, and 42730 is the size of an 80% split of the 53413 rows
# this cell reports. That looks like one class per training row -- confirm with
# the counts printed here. PCA below only shrinks the feature columns; it does
# not change the number of classes.
n_samples, n_classes = len(y_encoded), len(le.classes_)
print(f"Samples: {n_samples}, distinct classes: {n_classes}")
if 2 * n_classes > n_samples:
    print("WARNING: fewer than two samples per class on average; "
          "expect very slow, memory-hungry and uninformative classifiers.")

# Train-test split
# Row positions are split first so the scaler and PCA below can be fitted on
# the training rows only; fitting them on every row would leak test-set
# statistics into training.
row_ids = np.arange(n_samples)
train_rows, test_rows = train_test_split(row_ids, test_size=0.2, random_state=42)

# Normalize features (statistics learned from the training rows, applied to all rows)
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

# Dimensionality Reduction with PCA (retain 95% variance of the training rows)
pca = PCA(n_components=0.95, random_state=42)
pca.fit(X_scaled[train_rows])
X_reduced = pca.transform(X_scaled)

print(f"Original shape: {X_scaled.shape}, Reduced shape: {X_reduced.shape}")

X_train, X_test = X_reduced[train_rows], X_reduced[test_rows]
y_train, y_test = y_encoded[train_rows], y_encoded[test_rows]

# Step 2: Define models
# Maps a display name to an unfitted classifier. The estimators whose fit is
# randomised (saga logistic regression, decision tree, random forest) get a
# fixed seed -- the same 42 used for PCA and the train-test split -- so that
# their metrics are repeatable from one run of the notebook to the next.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='saga', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Step 3: Train and evaluate models
# Fit every entry of `models` on the training split, then print its accuracy
# and per-class report on the test split and plot its confusion matrix.
for name, model in models.items():
    print(f"\n=== {name} ===")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix, drawn through the explicit figure/axes interface
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{name} - Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()

# Step 4: Hyperparameter tuning (example on Random Forest)
# 3-fold grid search over the number of trees and the maximum tree depth.
print("\n=== Hyperparameter Tuning: Random Forest ===")
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20]
}
# The forest is seeded so that the selected parameters and the reported score
# do not change from one run of the notebook to the next.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, verbose=1)
grid_rf.fit(X_train, y_train)
print("Best Params (RF):", grid_rf.best_params_)
print("Best Score (RF):", grid_rf.best_score_)

# Step 5: Hyperparameter tuning (example on SVM)
# 3-fold grid search over C and the kernel type; prints the best combination
# found and its mean cross-validated score.
print("\n=== Hyperparameter Tuning: SVM ===")
param_grid_svm = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])
grid_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svm,
    cv=3,
    verbose=1,
).fit(X_train, y_train)
print("Best Params (SVM):", grid_svm.best_params_)
print("Best Score (SVM):", grid_svm.best_score_)


Original shape: (53413, 19), Reduced shape: (53413, 8)

=== Logistic Regression ===
