# Import Required Libraries
Import the necessary libraries, including pandas, matplotlib, seaborn, and scikit-learn.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load and Explore Dataset
Load the dataset and perform initial exploration, including checking for missing values and basic statistics.

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('credit_data.csv')

# Display the first few rows of the dataset.
# A bare `df.head()` is only rendered when it is the last expression in a
# cell; in the middle of a cell its result is discarded, so print it.
print(df.head())

# Check for missing values (count of nulls per column)
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Display basic statistics of the dataset
basic_stats = df.describe()
print("Basic statistics of the dataset:\n", basic_stats)

# Visualize the distribution of the target variable
sns.countplot(x='creditability', data=df)
plt.title('Distribution of Creditability')
plt.show()

# Preprocess Data
Preprocess the data by handling missing values, encoding categorical variables, splitting into training and testing sets, and scaling the features.

In [None]:
# Handle missing values by filling them with the median of the column.
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when the frame holds non-numeric columns.
# Non-numeric columns are therefore not imputed by this step.
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode categorical variables using one-hot encoding.
# drop_first=True drops the first level of each encoded column.
# NOTE(review): assumes 'creditability' is already numeric; a non-numeric
# target would be encoded (and renamed) here too -- confirm against the data.
df = pd.get_dummies(df, drop_first=True)

# Separate features and target variable
X = df.drop('creditability', axis=1)
y = df['creditability']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: the scaler is fitted on the training set only and the
# same transformation is then applied to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Split Data into Training and Testing Sets
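Split the dataset into training and testing sets using train_test_split from scikit-learn, then standardize the features with StandardScaler. This repeats the split and scaling already performed in the preprocessing cell.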

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train Multiple Classification Models
Train multiple classification models such as Logistic Regression, Decision Tree, Random Forest, and Support Vector Machine.

In [None]:
# SVC is imported at the top of the cell rather than between statements
from sklearn.svm import SVC

# Train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")

# Train Decision Tree model.
# Seeded with the same value as the train/test split so the reported accuracy
# (and the model comparison below) is reproducible from run to run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
y_pred_decision_tree = decision_tree.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, y_pred_decision_tree)
print(f"Decision Tree Accuracy: {decision_tree_accuracy}")

# Train Random Forest model (seeded for the same reproducibility reason)
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred_random_forest = random_forest.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, y_pred_random_forest)
print(f"Random Forest Accuracy: {random_forest_accuracy}")

# Train Support Vector Machine model
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy}")

# Compare model accuracies
model_accuracies = {
    "Logistic Regression": log_reg_accuracy,
    "Decision Tree": decision_tree_accuracy,
    "Random Forest": random_forest_accuracy,
    "SVM": svm_accuracy,
}

# max() over the dict iterates its keys; on a tie the first-inserted model wins
best_model = max(model_accuracies, key=model_accuracies.get)
print(f"Best model: {best_model} with accuracy {model_accuracies[best_model]}")

# Evaluate Model Performance
Evaluate the performance of each model using metrics such as accuracy, precision, recall, and F1-score.

In [None]:
# Evaluate Model Performance


def _report_and_matrix(y_true, y_pred):
    """Return the text classification report and confusion matrix for one model."""
    return classification_report(y_true, y_pred), confusion_matrix(y_true, y_pred)


# Logistic Regression
log_reg_report, log_reg_conf_matrix = _report_and_matrix(y_test, y_pred_log_reg)
print("Logistic Regression Classification Report:\n", log_reg_report)
print("Logistic Regression Confusion Matrix:\n", log_reg_conf_matrix)

# Decision Tree
decision_tree_report, decision_tree_conf_matrix = _report_and_matrix(y_test, y_pred_decision_tree)
print("Decision Tree Classification Report:\n", decision_tree_report)
print("Decision Tree Confusion Matrix:\n", decision_tree_conf_matrix)

# Random Forest
random_forest_report, random_forest_conf_matrix = _report_and_matrix(y_test, y_pred_random_forest)
print("Random Forest Classification Report:\n", random_forest_report)
print("Random Forest Confusion Matrix:\n", random_forest_conf_matrix)

# Support Vector Machine
svm_report, svm_conf_matrix = _report_and_matrix(y_test, y_pred_svm)
print("SVM Classification Report:\n", svm_report)
print("SVM Confusion Matrix:\n", svm_conf_matrix)

# Select the Best Model
Compare the performance metrics of the models (accuracy and weighted-average precision, recall, and F1-score) and select the model with the highest weighted-average F1-score.

In [None]:
# Select the Best Model

# Compare the performance metrics of the models and select the best one based on the evaluation

# Build each model's classification report once and keep only its
# 'weighted avg' row; precision, recall and F1-score are all read from it,
# so there is no need to recompute the report for every metric.
model_predictions = {
    "Logistic Regression": y_pred_log_reg,
    "Decision Tree": y_pred_decision_tree,
    "Random Forest": y_pred_random_forest,
    "SVM": y_pred_svm,
}
weighted_scores = {
    name: classification_report(y_test, predictions, output_dict=True)['weighted avg']
    for name, predictions in model_predictions.items()
}

# Create a DataFrame to store the evaluation metrics (one row per model, in
# the same order as the accuracy list)
evaluation_metrics = pd.DataFrame({
    "Model": list(weighted_scores),
    "Accuracy": [log_reg_accuracy, decision_tree_accuracy, random_forest_accuracy, svm_accuracy],
    "Precision": [scores['precision'] for scores in weighted_scores.values()],
    "Recall": [scores['recall'] for scores in weighted_scores.values()],
    "F1-Score": [scores['f1-score'] for scores in weighted_scores.values()],
})

# Display the evaluation metrics
print("Evaluation Metrics for Different Models:\n", evaluation_metrics)

# Select the best model based on F1-Score (the row with the highest
# weighted-average F1; best_model is that row as a Series)
best_model = evaluation_metrics.loc[evaluation_metrics['F1-Score'].idxmax()]
print(f"Best Model: {best_model['Model']} with F1-Score: {best_model['F1-Score']}")

# Plot the evaluation metrics for comparison; indexing by model name puts the
# model names on the x-axis with one bar per metric
evaluation_metrics.set_index('Model', inplace=True)
evaluation_metrics.plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Model Performance Metrics')
plt.ylabel('Score')
plt.xlabel('Model')
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.show()