In [None]:
import pandas as pd

def transform_age_classes_one_hot(csv_path, output_csv):
    """
    Bin the AGE column into five age classes and one-hot encode them.

    Bins are left-closed / right-open (``right=False``):
    [10, 20)  -> AGE_CLASS_0
    [20, 30)  -> AGE_CLASS_1
    [30, 40)  -> AGE_CLASS_2
    [40, 50)  -> AGE_CLASS_3
    [50, inf) -> AGE_CLASS_4

    Rows whose AGE is missing or below 10 fall outside every bin and are dropped.
    All five AGE_CLASS_* columns are always written as 0/1 integers, even when a
    class has no rows, so the output schema does not depend on the data.
    The original AGE column is kept in the output.

    Parameters:
    - csv_path (str): The path to the input CSV file; it must contain an 'AGE' column.
    - output_csv (str): The path to save the transformed CSV file.

    Raises:
    - KeyError: if the input CSV has no 'AGE' column.
    """
    df = pd.read_csv(csv_path)
    if 'AGE' not in df.columns:
        raise KeyError(f"'AGE' column not found in {csv_path}")

    # Bin edges and the class label assigned to each bin;
    # float('inf') leaves the last bin open-ended (50 and above).
    bins = [10, 20, 30, 40, 50, float('inf')]
    labels = [0, 1, 2, 3, 4]

    # Keep AGE_CLASS categorical (do not cast to int): get_dummies then emits a
    # column for every category, including classes that have no rows.
    # assign() + dropna() build new frames, so nothing is written into a slice.
    df = df.assign(
        AGE_CLASS=pd.cut(df['AGE'], bins=bins, labels=labels, right=False)
    ).dropna(subset=['AGE_CLASS'])

    # dtype=int pins the encoding to 0/1 regardless of the pandas version
    # (newer versions default to booleans).
    one_hot_encoded_df = pd.get_dummies(
        df, columns=['AGE_CLASS'], prefix='AGE_CLASS', dtype=int
    )

    # Save the one-hot encoded DataFrame to a new CSV file
    one_hot_encoded_df.to_csv(output_csv, index=False)

    print(f"Transformed data with one-hot encoded age classes saved to {output_csv}")

# Example usage: run the age-class transformation on the input CSV.
csv_path = 'demographics_csv/demo_keystroke.csv'  # CSV to read
output_csv = 'demographics_csv/age_one_hot_all.csv'  # CSV to write
transform_age_classes_one_hot(csv_path=csv_path, output_csv=output_csv)


In [None]:
import pandas as pd

# Read the one-hot encoded CSV into a DataFrame
csv_path = 'demographics_csv/age_one_hot_all.csv'  # Replace with your actual CSV file path
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first 5 rows as plain text
print(df.head(n=5))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

def balance_dataset(csv_path, output_csv):
    """
    Balances the dataset by undersampling every age class to the size of the smallest one.

    The class of each row is taken from the one-hot encoded ``AGE_CLASS_<k>`` column
    holding the row's maximum value. Each class present in the data is then sampled
    down to the smallest class size with a fixed seed (42), so repeated runs select
    the same rows. The output contains exactly the input columns, with rows ordered
    by class.

    Parameters:
    - csv_path (str): The path to the input CSV file with one-hot encoded age classes.
    - output_csv (str): The path to save the balanced CSV file.

    Raises:
    - ValueError: if the input has no column starting with 'AGE_CLASS_'.
    """
    df = pd.read_csv(csv_path)

    # The one-hot columns are identified purely by their name prefix
    age_classes_columns = [col for col in df.columns if col.startswith('AGE_CLASS_')]
    if not age_classes_columns:
        raise ValueError(f"No 'AGE_CLASS_*' columns found in {csv_path}")

    if df.empty:
        # Nothing to balance; still write the header-only file
        balanced_df = df
    else:
        # Recover the integer class from the name of the hot column ('AGE_CLASS_3' -> 3)
        class_ids = df[age_classes_columns].idxmax(axis=1).apply(lambda x: int(x.split('_')[-1]))

        # Size of the rarest class present: every class is cut down to this many rows
        min_samples = int(class_ids.value_counts().min())

        # Sample each class separately instead of groupby(...).apply(...): apply() acting
        # on the grouping column is deprecated since pandas 2.2, and grouping by an
        # external Series means no temporary column has to be added and dropped again.
        # Each group is seeded with 42 independently.
        balanced_df = pd.concat(
            [
                group.sample(n=min_samples, random_state=42)
                for _, group in df.groupby(class_ids)
            ],
            ignore_index=True,
        )

    # Save the balanced DataFrame to a new CSV file
    balanced_df.to_csv(output_csv, index=False)

    print(f"Balanced dataset saved to {output_csv}")

# Step 1: build the balanced dataset from the one-hot encoded CSV
input_csv = 'demographics_csv/age_one_hot_all.csv'  # CSV to read
output_csv = 'demographics_csv/balanced_age.csv'  # CSV to write
balance_dataset(csv_path=input_csv, output_csv=output_csv)

In [None]:
# KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the balanced dataset
df = pd.read_csv('demographics_csv/balanced_age.csv')

# Separate features and target
age_class_columns = [col for col in df.columns if col.startswith('AGE_CLASS_')]

# AGE is the column the age classes were binned from, and it is still present in the
# CSV; leaving it in X would leak the label, so it is removed together with the
# one-hot target columns. errors='ignore' keeps this working if AGE is absent.
X = df.drop(columns=age_class_columns + ['AGE'], errors='ignore')

# Integer class label recovered from the name of the hot column ('AGE_CLASS_3' -> 3)
y = df[age_class_columns].idxmax(axis=1).apply(lambda x: int(x.split('_')[-1]))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (standardization): fit on the training split only, then apply
# the same transform to the test split so no test statistics reach the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# K-nearest-neighbours classifier (5 neighbours is scikit-learn's default)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred_knn = knn.predict(X_test)

# Evaluate the model
print("KNN Classifier Accuracy: ", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


In [None]:
# RF
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 100-tree random forest on the training split (fixed seed for repeatability);
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the test split
y_pred_rf = rf.predict(X_test)

# Report overall accuracy, then the per-class metrics
print("Random Forest Classifier Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print(classification_report(y_true=y_test, y_pred=y_pred_rf))


In [None]:
# ANN
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer class labels for the softmax / categorical cross-entropy head.
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so the default (sparse) output is used and densified explicitly; this
# works on old and new scikit-learn versions alike.
encoder = OneHotEncoder()
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1)).toarray()
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).toarray()

# Build the ANN model: three ReLU hidden layers and one softmax unit per encoded class
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(y_train_encoded.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as the validation set here, so the reported
# test accuracy is not from data unseen during training-time monitoring.
history = model.fit(X_train, y_train_encoded, epochs=50, batch_size=32, validation_data=(X_test, y_test_encoded))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test_encoded)
print("ANN Accuracy: ", test_acc)

# Plotting training & validation accuracy
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='test accuracy')
plt.title('ANN Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()
