<a href="https://colab.research.google.com/github/irentala/fake-profile-detection-transformer/blob/master/RF_Cat_DT_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install necessary packages in Google Colab
!pip install catboost

# Import required libraries
import io

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
# Upload the dataset
# files.upload() opens Colab's file picker; its result maps each uploaded file
# name to the raw bytes of that file.
uploaded = files.upload()

if uploaded:
    # Parse the bytes that were just uploaded instead of re-opening a
    # hard-coded path: the chosen file may have a different name, and a file
    # uploaded again on re-run may be saved under a new name, in which case the
    # fixed path would silently load a stale copy.
    # If several files were selected, only the first one is used.
    file_path, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes), sep=',')
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a copy
    # of the dataset that is already present in the working directory.
    file_path = 'cleansed_50.csv'
    df = pd.read_csv(file_path, sep=',')

Saving cleansed_50.csv to cleansed_50.csv


In [5]:
# Feature extraction: derive per-keystroke timing features used for modeling.
# Every feature below compares a row with its direct neighbour (shift/diff), so
# rows are assumed to be ordered by press time within each session.
# NOTE(review): nothing in this notebook sorts the rows - confirm for the input CSV.
required_columns = {'press_time', 'release_time', 'session_id', 'user_ids'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {sorted(missing_columns)}")

# A pair of neighbouring keystrokes is only meaningful inside one session of one
# user. The boundary test therefore looks at the user as well as the session id;
# checking session_id alone would let timings leak between two different users
# whose adjacent sessions happen to carry the same session_id.
session_keys = df[['user_ids', 'session_id']]
is_session_start = (session_keys != session_keys.shift()).any(axis=1)
is_session_end = (session_keys != session_keys.shift(-1)).any(axis=1)

# Hold Time: how long the key stayed down (release_time - press_time)
df['hold_time'] = df['release_time'] - df['press_time']

# Flight Time: press-to-press latency between consecutive keystrokes;
# forced to 0 on the first keystroke of a session (no predecessor).
df['flight_time'] = df['press_time'].diff().fillna(0).mask(is_session_start, 0)

# Preceding Flight Time: release of the previous key -> press of this key;
# forced to 0 on the first keystroke of a session.
df['preceding_flight_time'] = (
    df['press_time'] - df['release_time'].shift().fillna(0)
).mask(is_session_start, 0)

# Following Flight Time: release of this key -> press of the next key;
# forced to 0 on the last keystroke of a session (no successor).
df['following_flight_time'] = (
    (df['press_time'].shift(-1) - df['release_time']).fillna(0).mask(is_session_end, 0)
)

# Feature selection: the four timing features are the model inputs and the
# user id is the classification target.
features = ['hold_time', 'flight_time', 'preceding_flight_time', 'following_flight_time']
x = df[features]
y = df['user_ids']

In [6]:
# Build three differently scaled copies of the feature matrix `x`:
#   - StandardScaler with its default settings
#   - MinMaxScaler with its default feature range
#   - "Extended" MinMaxScaler, configured to map each feature into [-1, 1]
# NOTE(review): each scaler is fitted on all rows before any train/test split
# happens, so the held-out rows influence the scaling parameters - confirm
# this is acceptable for the comparison.
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
extended_minmax_scaler = MinMaxScaler(feature_range=(-1, 1))

# Fit and transform in the same order as the scalers were created.
x_standard, x_minmax, x_extended_minmax = (
    fitted_scaler.fit_transform(x)
    for fitted_scaler in (standard_scaler, minmax_scaler, extended_minmax_scaler)
)

In [7]:
# Define a function to train and evaluate classifiers with different scalers
def _summarize_predictions(y_true, y_pred):
    """Return accuracy and weighted-average precision/recall/F1 for one model."""
    # Flatten so that predictions delivered as an (n, 1) column (CatBoost may
    # return this shape for multiclass targets) are scored as a 1-D label vector.
    y_pred = np.ravel(y_pred)
    report = classification_report(y_true, y_pred, zero_division=1, output_dict=True)
    weighted_avg = report['weighted avg']
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted_avg['precision'],
        "recall": weighted_avg['recall'],
        "f1-score": weighted_avg['f1-score'],
    }


def evaluate_model(x_scaled, scaler_name, target=None, test_size=0.2, random_state=42):
    """Train four classifiers on one scaled feature matrix and score each of them.

    Parameters
    ----------
    x_scaled : array-like
        Feature matrix, already scaled by the caller.
    scaler_name : str
        Label of the scaler that produced ``x_scaled``; only used in the
        printed header.
    target : array-like, optional
        Class labels aligned with the rows of ``x_scaled``. When omitted, the
        notebook-level variable ``y`` is used, which is what the original
        two-argument call relied on.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2).
    random_state : int, optional
        Seed for the train/test split and for the Random Forest and Decision
        Tree classifiers (default 42).

    Returns
    -------
    dict
        Maps "Random Forest", "CatBoost", "Decision Tree" and "Naive Bayes" to
        a dict with the keys "accuracy", "precision", "recall" and "f1-score".
        Precision, recall and F1 are the weighted averages taken from
        ``classification_report``.
    """
    labels = y if target is None else target

    print(f"\nResults for {scaler_name}:\n")

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x_scaled, labels, test_size=test_size, random_state=random_state
    )

    # One entry per model; insertion order fixes the key order of the result.
    classifiers = {
        "Random Forest": RandomForestClassifier(
            n_estimators=50, max_depth=10, min_samples_split=5, min_samples_leaf=3,
            random_state=random_state,
        ),
        "CatBoost": CatBoostClassifier(
            iterations=50, learning_rate=0.05, depth=4, l2_leaf_reg=3, verbose=0
        ),
        "Decision Tree": DecisionTreeClassifier(
            max_depth=10, min_samples_split=5, random_state=random_state
        ),
        "Naive Bayes": GaussianNB(),
    }

    results = {}
    for classifier_name, classifier in classifiers.items():
        classifier.fit(x_train, y_train)
        results[classifier_name] = _summarize_predictions(y_test, classifier.predict(x_test))

    return results

In [None]:
# Evaluate models for each scaler
# Each entry pairs a display label with the feature matrix produced by that
# scaler; all three scaled matrices built earlier in the notebook are covered.
scaled_feature_sets = [
    ("StandardScaler", x_standard),
    ("MinMaxScaler", x_minmax),
    ("Extended MinMaxScaler", x_extended_minmax),
]

# results_list holds (scaler label, per-classifier metrics dict) tuples in the
# same order as scaled_feature_sets.
results_list = [
    (scaler_label, evaluate_model(x_scaled, scaler_label))
    for scaler_label, x_scaled in scaled_feature_sets
]


Results for StandardScaler:


Results for MinMaxScaler:



In [None]:
# Prepare results for Excel format
# One row per scaler, one column per classifier; each cell is a single string
# that packs the four metrics of that scaler/classifier combination.
classifier_columns = ["Random Forest", "Decision Tree", "CatBoost", "Naive Bayes"]

excel_data = {"Scaler": []}
excel_data.update({column: [] for column in classifier_columns})

for scaler_label, scaler_results in results_list:
    excel_data["Scaler"].append(scaler_label)
    for column in classifier_columns:
        metrics = scaler_results[column]
        excel_data[column].append(
            f"Accuracy: {metrics['accuracy']:.2f}, Precision: {metrics['precision']:.2f}, Recall: {metrics['recall']:.2f}, F1-score: {metrics['f1-score']:.2f}"
        )

# Create DataFrame and print in Excel format
excel_df = pd.DataFrame(excel_data)
print("\nResults in Excel Format:\n")
print(excel_df.to_string(index=False))