In [None]:
import pandas as pd
import numpy as np
from decisionTree import DecisionTree
from randomForestt import RandomForest

In [None]:
# Load the credit-card dataset; the zipped CSV path is handed straight to
# read_csv (pandas infers the compression from the file extension by default).
df_credit = pd.read_csv("dataset/creditcard.zip")

In [None]:
# Work on a 10,000-row random subset of the full frame; random_state pins
# which rows are drawn so the subset is the same on every run.
sample_df = df_credit.sample(n=10000, random_state=42)

In [None]:
# Split the sampled frame into the target vector (the "Class" column) and the
# feature matrix (every other column), both as NumPy arrays.
y = sample_df["Class"].values
X = sample_df.drop(columns=["Class"]).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on y — if the positive class is
# rare, consider passing stratify=y so both splits contain positives.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Tally how often each label occurs in the train and test targets.
unique_values_train, counts_train = np.unique(y_train, return_counts=True)
unique_values_test, counts_test = np.unique(y_test, return_counts=True)

# Report the distinct labels and their frequencies, train split first.
for _split, _labels, _freqs in (
    ("y_train", unique_values_train, counts_train),
    ("y_test", unique_values_test, counts_test),
):
    print(f"Unique values in {_split}:", _labels)
    print(f"Counts of unique values in {_split}:", _freqs)

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE over-sampler so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data only (the test split is left untouched).
# NOTE(review): X_resampled / y_resampled are not referenced by any later cell
# visible in this notebook — the models below are fitted on X_train / y_train.
# Confirm whether that is intentional.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_curve, auc
 
# Unfitted custom tree that serves as the base estimator for the grid search.
custom_tree = DecisionTree()

# Candidate hyper-parameter values; the grid has 4 x 3 x 3 = 36 combinations.
param_grid = {
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

def aupr_score(y_true, y_score):
    """Return the area under the precision-recall curve.

    The curve points are produced by ``precision_recall_curve(y_true, y_score)``
    and the area is computed by ``auc`` with recall on the x-axis and
    precision on the y-axis.
    """
    curve = precision_recall_curve(y_true, y_score)
    precisions, recalls = curve[0], curve[1]
    return auc(recalls, precisions)

# Wrap aupr_score as a scikit-learn scorer (higher is better), then run a
# 5-fold grid search over param_grid with the custom tree as the estimator.
scorer = make_scorer(aupr_score, greater_is_better=True)

grid_search = GridSearchCV(
    estimator=custom_tree,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination the grid search ranked best under the scorer.
best_tree_params = grid_search.best_params_
print("Best Decision Tree Parameters:", best_tree_params)

In [None]:
# Baseline tree with hand-picked hyper-parameters (set independently of the
# grid-search result); this is the model later evaluated via tree.predict.
tree = DecisionTree(max_depth=None, min_samples_split=10, min_samples_leaf=2)
tree.fit(X_train, y_train)

In [None]:
# Train a single decision tree with the best parameters found by the grid
# search; the dict is unpacked into DecisionTree's keyword arguments.
best_tree = DecisionTree(**best_tree_params)
best_tree.fit(X_train, y_train)

In [None]:
# Score the tuned tree on the held-out test split.
# The metric is imported locally because no callable named `accuracy` exists
# when this cell runs on a fresh kernel (the original call raised NameError,
# and `accuracy` is later bound to a float, which is not callable either).
from sklearn.metrics import accuracy_score

y_pred_single_tree = best_tree.predict(X_test)
accuracy_single_tree = accuracy_score(y_test, y_pred_single_tree)
print("Accuracy of Single Decision Tree:", accuracy_single_tree)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, auc

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

In [None]:

# Make predictions on the test data with the hand-configured `tree`
# (not the grid-searched `best_tree`).
y_pred = tree.predict(X_test)

# Evaluate the classifier's performance: fraction of test labels predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:

# Compute precision/recall pairs from the current y_pred. When the notebook is
# run in order, y_pred holds the output of tree.predict(X_test), so the curve
# is built from those predictions rather than from separate probability scores.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

# Plot the precision-recall curve (recall on x, precision on y)

plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')

# Compute AUPR: area under the plotted curve via auc(x=recall, y=precision)
aupr = auc(recall, precision)
print("Area Under Precision-Recall Curve (AUPR):", aupr)

In [None]:

# F1 score of the current predictions: y_test holds the true labels and
# y_pred the predicted labels.
f1 = f1_score(y_test, y_pred)

print("F1 Score:", f1)

In [None]:
# Custom random forest of 50 trees, using the same depth / split / leaf
# settings as the hand-configured single tree.
rf_classifier = RandomForest(n_estimators=50,
                                        max_depth=None,
                                        min_samples_split=10,
                                        min_samples_leaf=2)
rf_classifier.fit(X_train, y_train)

# Make predictions (this rebinds y_pred, replacing the single tree's predictions)
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier's performance: fraction of test labels predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:

# Compute precision/recall pairs from the current y_pred. When the notebook is
# run in order, y_pred holds the output of rf_classifier.predict(X_test).
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

# Plot the precision-recall curve (recall on x, precision on y)

plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')

# Compute AUPR: area under the plotted curve via auc(x=recall, y=precision)
aupr = auc(recall, precision)
print("Area Under Precision-Recall Curve (AUPR):", aupr)

In [None]:
# F1 score of the current predictions: y_test holds the true labels and
# y_pred the predicted labels.
f1 = f1_score(y_test, y_pred)

print("F1 Score:", f1)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import f1_score

# Draw another 10,000-row sample; no random_state is given, so the rows differ
# from run to run.
# NOTE(review): df1-df4 are not referenced by any later code visible in this
# notebook — confirm whether they are still needed.
df1 = df_credit.sample(10000)

# Create copies of df1 for different experiments
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.drop(columns=['Class']).copy()  # copy without the 'Class' label column

# Function to classify based on threshold
def classify_anomaly_score(score, threshold):
    """Map an anomaly score to a class label.

    Returns 1 (fraud) when ``score`` is less than or equal to ``threshold``,
    otherwise 0 (normal).
    """
    return 1 if score <= threshold else 0

# Hyper-parameter values swept for the random forest
no_of_trees_list = [50, 100, 150]
sample_splits = [2, 5, 10]
sample_leafs = [1, 2, 5]


# For each forest size, evaluate every (min_samples_split, min_samples_leaf)
# pair and draw the resulting AUPR and F1 grids as side-by-side heatmaps.
for no_of_trees in no_of_trees_list:
    aupr_scores = []
    f1_scores = []
    for sample_split in sample_splits:
        for sample_leaf in sample_leafs:
            rf_classifier = RandomForest(n_estimators=no_of_trees,
                                         max_depth=None,
                                         min_samples_split=sample_split,
                                         min_samples_leaf=sample_leaf)
            rf_classifier.fit(X_train, y_train)

            # Make predictions
            y_pred = rf_classifier.predict(X_test)

            # Recompute the PR curve for THIS model's predictions. Reusing the
            # precision/recall arrays left over from an earlier cell would give
            # every configuration the same AUPR.
            precision_pts, recall_pts, _ = precision_recall_curve(y_test, y_pred)
            aupr_scores.append(auc(recall_pts, precision_pts))

            f1_scores.append(f1_score(y_test, y_pred))

    # Scores were appended split-major (outer loop) then leaf (inner loop), so
    # rows correspond to min_samples_split and columns to min_samples_leaf.
    grid_shape = (len(sample_splits), len(sample_leafs))
    aupr_scores = np.array(aupr_scores).reshape(grid_shape)
    f1_scores = np.array(f1_scores).reshape(grid_shape)

    # One figure per forest size, with both metrics shown side by side
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for ax, scores, metric in zip(axes, (aupr_scores, f1_scores), ('AUPR', 'F1')):
        image = ax.imshow(scores, cmap='viridis', interpolation='nearest')
        ax.set_title(f'{metric} Scores ({no_of_trees} trees)')
        ax.set_xlabel('min_samples_leaf')
        ax.set_ylabel('min_samples_split')
        ax.set_xticks(np.arange(len(sample_leafs)))
        ax.set_xticklabels(sample_leafs)
        ax.set_yticks(np.arange(len(sample_splits)))
        ax.set_yticklabels(sample_splits)
        fig.colorbar(image, ax=ax, label=f'{metric} Score')
    fig.tight_layout()

    plt.show()