# 1.0 Read local folder, make subfolders
### Read all txt files from local folder on pc and divide into folders containing 100 files each

In [1]:
# from a local folder read 10k files and put them in project here
# can adjust file amount 

import os
import shutil


def organize_samples(source_dir, project_dir, total_files=10000, files_per_folder=100):
    """
    Organizes .txt files from the source directory into multiple folders
    in the 'samples' folder in the project directory, each containing a specified number of files.

    Parameters:
    - source_dir (str): The path to the directory containing the original .txt files.
    - project_dir (str): The root path to your project directory (which contains the 'samples' folder).
    - total_files (int): The total number of files to process (default is 10,000).
    - files_per_folder (int): The number of files per folder (default is 100).
    """
    # Path to the 'samples' folder in the project directory
    samples_dir = os.path.join(project_dir, 'samples')

    # Ensure the 'samples' directory exists in the project directory
    os.makedirs(samples_dir, exist_ok=True)

    # Get a list of all .txt files in the source directory
    all_files = [f for f in os.listdir(source_dir) if f.endswith('.txt')]

    # Sort the files (optional, depending on whether you want them in a specific order)
    all_files.sort()

    # Limit to the total number of files specified
    files_to_copy = all_files[:total_files]

    folder_number = 1
    file_count = 0
    total_copied = 0

    # Create the first folder (samples_01)
    current_folder_name = f'samples_{folder_number:02d}'
    current_folder_path = os.path.join(samples_dir, current_folder_name)
    os.makedirs(current_folder_path, exist_ok=True)

    for file_name in files_to_copy:
        source_file = os.path.join(source_dir, file_name)
        target_file = os.path.join(current_folder_path, file_name)

        # Copy the file
        shutil.copy2(source_file, target_file)
        file_count += 1
        total_copied += 1

        # Check if the current folder has reached the desired number of files
        if file_count >= files_per_folder:
            folder_number += 1
            if total_copied >= total_files:
                break  # Stop if we've copied the total desired number of files
            # Reset file count and create a new folder
            file_count = 0
            current_folder_name = f'samples_{folder_number:02d}'
            current_folder_path = os.path.join(samples_dir, current_folder_name)
            os.makedirs(current_folder_path, exist_ok=True)

    print(f'Total files copied: {total_copied}')
    print(f'Files organized into {folder_number} folders.')


# Example usage:
source_directory = r"insert path here"  # Replace with the path to your local folder containing .txt files
project_directory = 'samples'  # Replace with the root path to your PyCharm project

organize_samples(source_directory, project_directory)


Total files copied: 10000
Files organized into 101 folders.


# 2.0 Read txt files, make csv file

In [9]:
# generate a csv file with all data from the files

import os
import pandas as pd


def try_multiple_delimiters(file_path):
    """
    Attempts to read a file using different delimiters.
    """
    delimiters = ['\t', ',', ';']  # Common delimiters to try
    for delimiter in delimiters:
        try:
            df = pd.read_csv(file_path, sep=delimiter, encoding='utf-8', on_bad_lines='skip',
                             dtype={'PARTICIPANT_ID': str})
            # If we get more than one column, we assume we have the right delimiter
            if df.shape[1] > 1:
                return df
        except Exception as e:
            pass  # Try the next delimiter
    raise ValueError("Could not determine delimiter")


def read_keystroke_data(samples_dir, output_csv='all_samples_combined1.csv'):
    """
    Reads keystroke data from multiple subfolders, handles errors, and saves valid data to CSV.

    Parameters:
    - samples_dir (str): The directory where the 'samples' folder resides, which contains multiple subfolders with .txt files.
    - output_csv (str): The output path for the CSV file to save valid data.

    Returns:
    - A pandas DataFrame containing valid keystroke data.
    """
    all_data = []  # To hold data from all valid files
    total_files_processed = 0  # Counter to keep track of the total number of files processed
    total_rows_processed = 0  # Counter for the total number of rows across all files
    skipped_files = 0  # Counter for the number of skipped files
    malformed_rows = 0  # Counter for the number of malformed rows

    # Expected columns in the file
    required_columns = ['PARTICIPANT_ID', 'TEST_SECTION_ID', 'SENTENCE', 'USER_INPUT',
                        'KEYSTROKE_ID', 'PRESS_TIME', 'RELEASE_TIME', 'LETTER', 'KEYCODE']

    # Traverse each subfolder (e.g., samples_01, samples_02, ..., samples_100)
    for subdir in os.listdir(samples_dir):
        subdir_path = os.path.join(samples_dir, subdir)

        if os.path.isdir(subdir_path):
            # Traverse each .txt file in the subfolder
            for file_name in os.listdir(subdir_path):
                if file_name.endswith('.txt'):
                    file_path = os.path.join(subdir_path, file_name)
                    total_files_processed += 1  # Increment the counter for each file

                    print(f"Processing file: {file_path}")  # Debugging statement

                    try:
                        # Try reading the file using multiple delimiters
                        df = try_multiple_delimiters(file_path)

                        # Log the number of rows in the current file
                        print(f"File {file_name} has {df.shape[0]} rows")

                        # Ensure required columns exist
                        if not all(col in df.columns for col in required_columns):
                            print(f"Missing columns in {file_path}")
                            skipped_files += 1
                            continue  # Skip this file if it doesn't have the required columns

                        # If the number of columns is 1, it means that the file might be incorrectly formatted
                        if len(df.columns) == 1:
                            print(f"Malformed data in {file_path}, skipping.")
                            skipped_files += 1
                            continue  # Skip files with misformatted rows

                        # Filter out rows where all values are under one column
                        malformed_df = df[df.apply(lambda row: row.count() == 1, axis=1)]
                        malformed_rows += len(malformed_df)
                        df = df[df.apply(lambda row: row.count() > 1, axis=1)]

                        # Append the number of rows to the total rows processed
                        total_rows_processed += df.shape[0]

                        # Append the valid DataFrame to the list
                        all_data.append(df)

                    except Exception as e:
                        print(f"Error processing file {file_path}: {e}")
                        skipped_files += 1
                        continue  # Skip files with errors like encoding issues or missing data

    # Concatenate all valid data into one DataFrame
    if not all_data:
        print("No valid data found.")
        return pd.DataFrame()  # Return an empty DataFrame if no valid data was found

    full_data = pd.concat(all_data, ignore_index=True)

    # Verify the number of rows before saving to CSV
    print(f"Total rows in concatenated DataFrame: {full_data.shape[0]}")
    print(f"Total malformed rows removed: {malformed_rows}")

    # Save the data to CSV
    full_data.to_csv(output_csv, index=False)
    print(f"Valid data saved to {output_csv}")

    # Print the total number of files processed and total rows processed
    print(f"Total number of .txt files processed: {total_files_processed}")
    print(f"Total number of rows processed: {total_rows_processed}")
    print(f"Total number of skipped files: {skipped_files}")

    return full_data


# Example usage:
samples_directory = "samples"  # Replace with your actual path
read_keystroke_data(samples_directory, output_csv='demographics_csv/uncleaned_all.csv')

Processing file: samples\samples_01\100001_keystrokes.txt
File 100001_keystrokes.txt has 658 rows
Processing file: samples\samples_01\100003_keystrokes.txt
File 100003_keystrokes.txt has 806 rows
Processing file: samples\samples_01\100007_keystrokes.txt
File 100007_keystrokes.txt has 801 rows
Processing file: samples\samples_01\100008_keystrokes.txt
File 100008_keystrokes.txt has 687 rows
Processing file: samples\samples_01\100013_keystrokes.txt
File 100013_keystrokes.txt has 744 rows
Processing file: samples\samples_01\100016_keystrokes.txt
File 100016_keystrokes.txt has 776 rows
Processing file: samples\samples_01\10001_keystrokes.txt
File 10001_keystrokes.txt has 763 rows
Processing file: samples\samples_01\100020_keystrokes.txt
File 100020_keystrokes.txt has 649 rows
Processing file: samples\samples_01\100030_keystrokes.txt
File 100030_keystrokes.txt has 654 rows
Processing file: samples\samples_01\100031_keystrokes.txt
File 100031_keystrokes.txt has 545 rows
Processing file: sampl

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE
0,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891207.0,1.473275e+12,1473275372663,SHIFT,16
1,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891214.0,1.473275e+12,1473275372703,W,87
2,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891219.0,1.473275e+12,1473275372903,a,65
3,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891226.0,1.473275e+12,1473275372975,s,83
4,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891231.0,1.473275e+12,1473275373079,,32
...,...,...,...,...,...,...,...,...,...
140436,100550,1097261,You must therefore take full responsibility fo...,You must therefore take full responsibility fo...,52190490.0,1.473280e+12,1473280428505,i,73
140437,100550,1097261,You must therefore take full responsibility fo...,You must therefore take full responsibility fo...,52190496.0,1.473280e+12,1473280428527,n,78
140438,100550,1097261,You must therefore take full responsibility fo...,You must therefore take full responsibility fo...,52190503.0,1.473280e+12,1473280428607,g,71
140439,100550,1097261,You must therefore take full responsibility fo...,You must therefore take full responsibility fo...,52190720.0,1.473280e+12,1473280428786,.,190


In [10]:
# clean the csv file by removing rows with more than one column with NaN or <unset> values

import pandas as pd


def clean_csv(input_csv, output_csv):
    """
    Cleans a CSV file by removing rows where more than one column has NaN or <unset> values.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save cleaned data.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv, sep=',')

    # Define a function to check for NaN or <unset> values
    def is_unset_or_nan(value):
        return pd.isna(value) or value == '<unset>'

    # Identify and filter out rows where more than one column has NaN or <unset> values
    malformed_rows = df[df.apply(lambda row: sum(is_unset_or_nan(val) for val in row) > 1, axis=1)]
    cleaned_df = df[df.apply(lambda row: sum(is_unset_or_nan(val) for val in row) <= 1, axis=1)]

    # Log the number of malformed rows removed
    print(f"Total malformed rows removed: {len(malformed_rows)}")
    print("Malformed rows:")
    print(malformed_rows)

    # Save the cleaned DataFrame to a new CSV file
    cleaned_df.to_csv(output_csv, index=False)
    print(f"Cleaned data saved to {output_csv}")


# Example usage:
input_csv_path = 'demographics_csv/uncleaned_all.csv'  # Replace with your actual input CSV path
output_csv_path = 'demographics_csv/cleaned_samples_combined.csv'  # Replace with your desired output CSV path
clean_csv(input_csv_path, output_csv_path)

Total malformed rows removed: 0
Malformed rows:
Empty DataFrame
Columns: [PARTICIPANT_ID, TEST_SECTION_ID, SENTENCE, USER_INPUT, KEYSTROKE_ID, PRESS_TIME, RELEASE_TIME, LETTER, KEYCODE]
Index: []
Cleaned data saved to demographics_csv/cleaned_samples_combined.csv


# 2.1 adding keystroke values
### generate D1U2, D1U3, D1D2, D1D3, D1U1_MEAN, D1U2_MEAN, D1U3_MEAN, D1D2_MEAN, D1D3_MEAN, U1D2, U1D2_MEAN, Z_SCORE

In [3]:
import pandas as pd
import numpy as np

def add_new_columns(input_csv, output_csv):
    """
    Adds new columns to a CSV file for each participant and saves the updated data to a new CSV file.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save updated data.
    """
    # Read the cleaned CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Define a function to calculate new columns for each participant
    def calculate_new_columns(group):
        group['D1U1'] = group['RELEASE_TIME'] - group['PRESS_TIME']
        group['D1U2'] = group['RELEASE_TIME'].shift(-1) - group['PRESS_TIME']
        group['D1U3'] = group['RELEASE_TIME'].shift(-2) - group['PRESS_TIME']
        group['D1D2'] = group['PRESS_TIME'].shift(-1) - group['PRESS_TIME']
        group['D1D3'] = group['PRESS_TIME'].shift(-2) - group['PRESS_TIME']
        group['U1D2'] = group['PRESS_TIME'].shift(-1) - group['RELEASE_TIME']
        
        group['D1U1_MEAN'] = group['D1U1'].mean()
        group['D1U2_MEAN'] = group['D1U2'].mean()
        group['D1U3_MEAN'] = group['D1U3'].mean()
        group['D1D2_MEAN'] = group['D1D2'].mean()
        group['D1D3_MEAN'] = group['D1D3'].mean()
        group['U1D2_MEAN'] = group['U1D2'].mean()
        
        group['Z_SCORE'] = (group['PRESS_TIME'] - group['PRESS_TIME'].mean()) / group['PRESS_TIME'].std()
        return group

    # Apply the function to each participant group
    df = df.groupby('PARTICIPANT_ID').apply(calculate_new_columns)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Updated data saved to {output_csv}")

# Example usage:
input_csv_path = 'demographics_csv/part1_cleaned_samples_combined.csv'  # Replace with your actual input CSV path
output_csv_path = 'demographics_csv/added_key_values.csv'  # Replace with your desired output CSV path
add_new_columns(input_csv_path, output_csv_path)

Updated data saved to demographics_csv/added_key_values.csv


In [5]:
import pandas as pd
import logging

def reduce_participant_ids(input_csv, output_csv, num_ids=800):
    """
    Reduces the number of unique participant IDs in the CSV file to the specified number.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file.
    - num_ids (int): The number of unique participant IDs to retain.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    logging.info('Starting to read the input CSV file.')
    # Read the CSV file
    df = pd.read_csv(input_csv)
    logging.info('Finished reading the input CSV file.')

    # Get the unique participant IDs
    unique_ids = df['PARTICIPANT_ID'].unique()
    logging.info(f'Found {len(unique_ids)} unique participant IDs.')

    # Select the first num_ids unique participant IDs
    selected_ids = unique_ids[:num_ids]
    logging.info(f'Selected the first {num_ids} unique participant IDs.')

    # Filter the DataFrame to include only the selected participant IDs
    filtered_df = df[df['PARTICIPANT_ID'].isin(selected_ids)]
    logging.info('Filtered the DataFrame to include only the selected participant IDs.')

    # Save the filtered DataFrame to a new CSV file
    filtered_df.to_csv(output_csv, index=False)
    logging.info(f'Filtered data saved to {output_csv}')

# Example usage:
input_csv_path = 'samples/part14_cleaned_samples_combined.csv'  # Replace with the path to your input CSV file
output_csv_path = 'demographics_csv/reduced800_14_participants.csv'  # Replace with the desired output CSV path
reduce_participant_ids(input_csv_path, output_csv_path)

2024-11-13 22:39:05,951 - INFO - Starting to read the input CSV file.
2024-11-13 22:39:15,929 - INFO - Finished reading the input CSV file.
2024-11-13 22:39:15,980 - INFO - Found 9677 unique participant IDs.
2024-11-13 22:39:15,981 - INFO - Selected the first 800 unique participant IDs.
2024-11-13 22:39:16,062 - INFO - Filtered the DataFrame to include only the selected participant IDs.
2024-11-13 22:39:18,948 - INFO - Filtered data saved to demographics_csv/reduced800_14_participants.csv


In [10]:
import pandas as pd
import logging
import os

def reduce_participant_ids(input_csv, output_folder, num_ids_per_file=25):
    """
    Reduces the number of unique participant IDs in the CSV file to the specified number per file and saves them into multiple files.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_folder (str): The folder to save the output CSV files.
    - num_ids_per_file (int): The number of unique participant IDs to retain per file.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
    logging.info('Starting to read the input CSV file.')
    # Read the CSV file
    df = pd.read_csv(input_csv)
    logging.info('Finished reading the input CSV file.')

    # Get the unique participant IDs
    unique_ids = df['PARTICIPANT_ID'].unique()
    logging.info(f'Found {len(unique_ids)} unique participant IDs.')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Process the unique IDs in chunks
    for i in range(0, len(unique_ids), num_ids_per_file):
        part_number = (i // num_ids_per_file) + 1
        selected_ids = unique_ids[i:i + num_ids_per_file]
        logging.info(f'Selected {len(selected_ids)} unique participant IDs for part {part_number}.')

        # Filter the DataFrame to include only the selected participant IDs
        filtered_df = df[df['PARTICIPANT_ID'].isin(selected_ids)]
        logging.info(f'Filtered the DataFrame to include only the selected participant IDs for part {part_number}.')

        # Save the filtered DataFrame to a new CSV file
        output_csv = os.path.join(output_folder, f'sub_25_file14_part{part_number}.csv')
        filtered_df.to_csv(output_csv, index=False)
        logging.info(f'Filtered data saved to {output_csv}')

# Example usage:
input_csv_path = 'samples/part14_cleaned_samples_combined.csv'  # Replace with the path to your input CSV file
output_folder_path = 'sub_25_key_features'  # Replace with the desired output folder path
reduce_participant_ids(input_csv_path, output_folder_path)

2024-11-14 21:26:34,339 - INFO - Starting to read the input CSV file.
2024-11-14 21:26:59,121 - INFO - Finished reading the input CSV file.
2024-11-14 21:26:59,235 - INFO - Found 9677 unique participant IDs.
2024-11-14 21:26:59,237 - INFO - Selected 25 unique participant IDs for part 1.
2024-11-14 21:26:59,320 - INFO - Filtered the DataFrame to include only the selected participant IDs for part 1.
2024-11-14 21:26:59,564 - INFO - Filtered data saved to sub_25_key_features\sub_25_file14_part1.csv
2024-11-14 21:26:59,565 - INFO - Selected 25 unique participant IDs for part 2.
2024-11-14 21:26:59,624 - INFO - Filtered the DataFrame to include only the selected participant IDs for part 2.
2024-11-14 21:26:59,848 - INFO - Filtered data saved to sub_25_key_features\sub_25_file14_part2.csv
2024-11-14 21:26:59,849 - INFO - Selected 25 unique participant IDs for part 3.
2024-11-14 21:26:59,900 - INFO - Filtered the DataFrame to include only the selected participant IDs for part 3.
2024-11-14 21

In [1]:
import pandas as pd
import numpy as np
import logging
import os

def add_new_columns_to_folder(input_folder, output_folder):
    """
    Adds new columns to each CSV file in the input folder and saves the updated data to new CSV files in the output folder.

    Parameters:
    - input_folder (str): The path to the folder containing the input CSV files.
    - output_folder (str): The path to the folder to save the output CSV files.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Process each CSV file in the input folder
    for file_name in os.listdir(input_folder):
        if file_name.endswith('.csv'):
            input_csv = os.path.join(input_folder, file_name)
            output_csv = os.path.join(output_folder, file_name.replace('.csv', '_processed.csv'))

            logging.info(f'Starting to process {input_csv}')
            
            # Read the cleaned CSV file into a DataFrame
            df = pd.read_csv(input_csv)
            logging.info('Finished reading the input CSV file.')

            # Initialize new DataFrame with the required columns
            new_df_columns = ['PARTICIPANT_ID', 'TEST_SECTION_ID', 'SENTENCE', 'USER_INPUT', 'KEYSTROKE_ID', 'PRESS_TIME', 'RELEASE_TIME', 'LETTER', 'KEYCODE', 'D1U1', 'D1U2', 'D1D2', 'U1D2', 'U1U2', 'D1U3', 'D1D3', 'D1U1_MEAN', 'D1U2_MEAN', 'D1U3_MEAN', 'D1D2_MEAN', 'D1D3_MEAN', 'U1D2_MEAN', 'Z_SCORE']
            new_df = pd.DataFrame(columns=new_df_columns)

            # Variables to keep track of previous and next events
            previous_down = None
            previous_up = None
            next_down = None
            next_up = None
            after_next_down = None
            after_next_up = None

            # Initialize participant counter
            participant_counter = 0
            current_participant_id = None

            # Calculate Time Differences and Fill New DataFrame
            for i, row in df.iterrows():
                participant_id = row['PARTICIPANT_ID']
                if participant_id != current_participant_id:
                    current_participant_id = participant_id
                    logging.info(f'Processing new PARTICIPANT_ID: {participant_id}, Counter: {participant_counter}')
                    participant_counter += 1

                keyCode = row['KEYCODE']
                keyDown = row['PRESS_TIME']
                keyUp = row['RELEASE_TIME']

                # Calculate D1U1 (current down to current up)
                D1U1 = keyUp - keyDown

                # Placeholder values for other calculations
                D1U2, D1D2, U1D2, U1U2, D1U3, D1D3 = [None] * 6

                if i > 0:  # If not the first row, calculate values involving previous key
                    D1D2 = keyDown - previous_down
                    U1D2 = keyDown - previous_up
                    U1U2 = keyUp - previous_up

                if i < len(df) - 1:  # If not the last row, look ahead to calculate future values
                    next_row = df.iloc[i + 1]
                    next_down = next_row['PRESS_TIME']
                    next_up = next_row['RELEASE_TIME']
                    D1U2 = next_up - keyDown
                    D1U3 = next_up - keyDown  # Placeholder, needs adjustment for "key after next"

                if i < len(df) - 2:  # If there are at least two keys ahead, calculate D1D3 and D1U3
                    after_next_row = df.iloc[i + 2]
                    after_next_down = after_next_row['PRESS_TIME']
                    after_next_up = after_next_row['RELEASE_TIME']
                    D1D3 = after_next_down - keyDown
                    D1U3 = after_next_up - keyDown

                # Append row to new DataFrame
                new_row = {'PARTICIPANT_ID': row['PARTICIPANT_ID'], 'TEST_SECTION_ID': row['TEST_SECTION_ID'], 'SENTENCE': row['SENTENCE'], 'USER_INPUT': row['USER_INPUT'], 'KEYSTROKE_ID': row['KEYSTROKE_ID'], 'PRESS_TIME': keyDown, 'RELEASE_TIME': keyUp, 'LETTER': row['LETTER'], 'KEYCODE': keyCode, 'D1U1': D1U1, 'D1U2': D1U2, 'D1D2': D1D2, 'U1D2': U1D2, 'U1U2': U1U2, 'D1U3': D1U3, 'D1D3': D1D3}
                new_df = new_df.append(new_row, ignore_index=True)

                # Update previous events
                previous_down = keyDown
                previous_up = keyUp

            # Calculate means and Z_SCORE
            new_df['D1U1_MEAN'] = new_df['D1U1'].mean()
            new_df['D1U2_MEAN'] = new_df['D1U2'].mean()
            new_df['D1U3_MEAN'] = new_df['D1U3'].mean()
            new_df['D1D2_MEAN'] = new_df['D1D2'].mean()
            new_df['D1D3_MEAN'] = new_df['D1D3'].mean()
            new_df['U1D2_MEAN'] = new_df['U1D2'].mean()
            new_df['Z_SCORE'] = (new_df['PRESS_TIME'] - new_df['PRESS_TIME'].mean()) / new_df['PRESS_TIME'].std()

            # Save the updated DataFrame to a new CSV file
            new_df.to_csv(output_csv, index=False)
            logging.info(f'Updated data saved to {output_csv}')

# Example usage:
input_folder_path = 'sub_25_key_features'  # Replace with the path to your input folder
output_folder_path = 'sub_25_key_features_processed'  # Replace with the path to your output folder
add_new_columns_to_folder(input_folder_path, output_folder_path)

2024-11-14 22:08:28,328 - INFO - Starting to process sub_25_key_features\sub_25_file14_part1.csv
2024-11-14 22:08:28,391 - INFO - Finished reading the input CSV file.
2024-11-14 22:08:28,401 - INFO - Processing new PARTICIPANT_ID: 458779, Counter: 0
2024-11-14 22:08:32,286 - INFO - Processing new PARTICIPANT_ID: 458780, Counter: 1
2024-11-14 22:08:34,568 - INFO - Processing new PARTICIPANT_ID: 458781, Counter: 2
2024-11-14 22:08:37,204 - INFO - Processing new PARTICIPANT_ID: 458782, Counter: 3
2024-11-14 22:08:39,776 - INFO - Processing new PARTICIPANT_ID: 458788, Counter: 4
2024-11-14 22:08:42,258 - INFO - Processing new PARTICIPANT_ID: 458796, Counter: 5
2024-11-14 22:08:44,215 - INFO - Processing new PARTICIPANT_ID: 45879, Counter: 6
2024-11-14 22:08:46,747 - INFO - Processing new PARTICIPANT_ID: 458803, Counter: 7
2024-11-14 22:08:49,188 - INFO - Processing new PARTICIPANT_ID: 458804, Counter: 8
2024-11-14 22:08:51,841 - INFO - Processing new PARTICIPANT_ID: 458806, Counter: 9
2024

KeyboardInterrupt: 

In [1]:
import os
import pandas as pd

def merge_csv_files(folder_path, output_file):
    """
    Merges all CSV files in a folder into a single CSV file.

    Parameters:
    folder_path (str): Path to the folder containing CSV files.
    output_file (str): Path to the output CSV file.
    """
    try:
        # List to hold DataFrames
        data_frames = []

        # Iterate through all files in the folder
        for file_name in os.listdir(folder_path):
            # Check if the file is a CSV
            if file_name.endswith('.csv'):
                file_path = os.path.join(folder_path, file_name)
                print(f"Processing file: {file_path}")
                
                # Read the CSV file into a DataFrame
                df = pd.read_csv(file_path)
                data_frames.append(df)

        # Concatenate all DataFrames
        merged_df = pd.concat(data_frames, ignore_index=True)
        
        # Save the merged DataFrame to the output file
        merged_df.to_csv(output_file, index=False)
        print(f"All CSV files have been merged into {output_file}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
# Replace 'your_folder_path' with the path to your folder containing CSV files
# Replace 'output_file.csv' with your desired output file name
merge_csv_files('sub_25_key_features_processed', 'demographics_csv/merged_sub_25_key_features.csv')


Processing file: sub_25_key_features_processed\sub_25_file14_part100_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part101_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part102_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part103_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part104_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part105_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part106_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part107_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part108_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part109_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part10_processed.csv
Processing file: sub_25_key_features_processed\sub_25_file14_part110_processed.csv
Proce

In [1]:
import pandas as pd
import numpy as np


def add_keystroke_features(input_csv, output_csv):
    """
    Adds new keystroke features to the CSV file and saves the updated data to a new CSV file.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save updated data.
    """
    # Read the cleaned CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # # Define left and right hand keys using their numeric KEYCODE values
    # left_hand_keys = {16, 65, 68, 69, 70, 71, 83, 87, 88, 90}  # Example keycodes for left hand
    # right_hand_keys = {32, 73, 78, 79, 82, 85, 89}  # Example keycodes for right hand

    left_hand_keys = {16, 65, 66, 67, 68, 69, 70, 71, 81, 82, 83, 84, 86, 87, 88, 90, 49, 50, 51, 52, 53, 9, 20,
                      190}  # Keycodes for left hand
    right_hand_keys = {16, 32, 72, 73, 74, 75, 76, 77, 78, 79, 80, 85, 89, 48, 54, 55, 56, 57, 8, 13, 189, 191, 188,
                       191}  # Keycodes for right hand

    # left = 0, right = 1
    # Function to determine hand
    def determine_hand(key):
        if key in left_hand_keys:
            return '0'
        elif key in right_hand_keys:
            return '1'
        else:
            return 'unknown'

    # Apply the function to determine hand for each key
    df['HAND'] = df['KEYCODE'].apply(determine_hand)

    # Calculate mean hold time for each hand
    hand_hold_time = df.groupby('HAND')['D1U1'].mean().to_dict()

    # Map the mean hold time back to the DataFrame
    df['HAND_HOLD_TIME'] = df['HAND'].map(hand_hold_time)

    # Calculate Keystroke Duration Variability
    df['KEY_HOLD_TIME_STD'] = df.groupby('KEYCODE')['D1U1'].transform('std')

    # Calculate Error Rate and Correction Features
    df['ERROR_RATE'] = df['KEYCODE'].apply(lambda x: 1 if x == 8 else 0)  # Assuming 8 is the keycode for backspace
    df['ERROR_RATE'] = df['ERROR_RATE'].cumsum() / (df.index + 1)

    # Calculate Consecutive Key Patterns
    df['CONSECUTIVE_KEYS'] = df['KEYCODE'].astype(str) + df['KEYCODE'].shift(-1).astype(str)
    df['CONSECUTIVE_KEYS_TIME'] = df['D1U1'].shift(-1) - df['D1D2']

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Updated data saved to {output_csv}")


# Example usage:
input_csv_path = 'demographics_csv/added_key_values.csv'  # Replace with your actual input CSV path
output_csv_path = 'demographics_csv/enhanced_keystroke_features.csv'  # Replace with your desired output CSV path
add_keystroke_features(input_csv_path, output_csv_path)

Updated data saved to demographics_csv/enhanced_keystroke_features.csv


In [2]:
import pandas as pd


def drop_unknown_hand(input_csv, output_csv):
    """
    Drops all rows where the value in the HAND column is 'unknown' and saves the updated data to a new CSV file.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save updated data.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Drop rows where the HAND column value is 'unknown'
    df = df[df['HAND'] != 'unknown']

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Updated data saved to {output_csv}")


# Example usage:
input_csv_path = 'demographics_csv/enhanced_keystroke_features.csv'  # Replace with your actual input CSV path
output_csv_path = 'demographics_csv/enhanced_keystroke_features.csv'  # Replace with your desired output CSV path
drop_unknown_hand(input_csv_path, output_csv_path)

  drop_unknown_hand(input_csv_path, output_csv_path)


Updated data saved to demographics_csv/enhanced_keystroke_features.csv


In [3]:
import pandas as pd

def add_speed_classification(file_path, output_file):
    """
    This function adds a new feature 'Speed_Class' to classify participants based on their D1D2_MEAN values.
    Participants are classified from 1 (slowest) to 10 (fastest) based on percentiles of D1D2_MEAN.

    Parameters:
    - file_path (str): The path to the input CSV file.
    - output_file (str): The path where the output CSV file with the new feature will be saved.
    """
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)

    # Ensure D1D2_MEAN column exists and contains valid numeric data
    if 'D1D2_MEAN' not in df.columns or not pd.api.types.is_numeric_dtype(df['D1D2_MEAN']):
        raise ValueError("The input CSV file must contain a numeric 'D1D2_MEAN' column.")

    # Calculate the deciles based on D1D2_MEAN
    df['SPEED_CLASS'] = pd.qcut(df['D1D2_MEAN'], q=10, labels=False, duplicates='drop') + 1

    # Invert the speed class to assign 10 to the fastest (lowest D1D2_MEAN values) and 1 to the slowest
    df['SPEED_CLASS'] = 11 - df['SPEED_CLASS']

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False)

    print(f"Speed classification added. File saved to: {output_file}")

# Example usage:
input_file = 'demographics_csv/enhanced_keystroke_features.csv'  # Replace with the path to your CSV file
output_file = 'demographics_csv/enhanced_keystroke_features.csv'  # Replace with the path where you want to save the new CSV file

add_speed_classification(input_file, output_file)

  add_speed_classification(input_file, output_file)


Speed classification added. File saved to: demographics_csv/enhanced_keystroke_features.csv


In [2]:
# # this might erase outliers where values are too slow or fast
# # to be altered and used if needed, TODO: columns to check must be altered
# import pandas as pd
# 
# def remove_outliers(input_csv, output_csv):
#     """
#     Removes rows with extreme outliers (2.5% fastest and 2.5% slowest) from a CSV file and saves the updated data to a new CSV file.
# 
#     Parameters:
#     - input_csv (str): The path to the input CSV file.
#     - output_csv (str): The path to the output CSV file to save updated data.
#     """
#     # Read the cleaned CSV file into a DataFrame
#     df = pd.read_csv(input_csv)
# 
#     # Define the columns to check for outliers
#     columns_to_check = ['KEYSTROKE_ID']  # Add other relevant columns if needed
# 
#     # Calculate the 2.5% and 97.5% percentiles for each column
#     lower_bound = df[columns_to_check].quantile(0.025)
#     upper_bound = df[columns_to_check].quantile(0.975)
# 
#     # Filter out rows with values outside the 2.5% to 97.5% range
#     df_filtered = df[(df[columns_to_check] >= lower_bound) & (df[columns_to_check] <= upper_bound)].dropna()
# 
#     # Save the updated DataFrame to a new CSV file
#     df_filtered.to_csv(output_csv, index=False)
#     print(f"Updated data saved to {output_csv}")
# 
# # Example usage:
# input_csv_path = 'added_key_values.csv'  # Replace with your actual input CSV path
# output_csv_path = 'filtered_key_values.csv'  # Replace with your desired output CSV path
# remove_outliers(input_csv_path, output_csv_path)

# 3.0 Clean metadata_participants.csv

In [1]:
import pandas as pd


def clean_metadata_participants(input_csv, output_csv):
    """
    Cleans the metadata_participants CSV file by removing rows with <null> or <unset> values,
    rows where AGE is less than 10, and rows where GENDER is 'none'.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save cleaned data.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Drop rows with <null> or <unset> values
    df = df.replace(['<null>', '<unset>'], pd.NA).dropna()

    # Drop rows where AGE is less than 10
    df = df[df['AGE'] >= 10]

    # Drop rows where GENDER is 'none'
    df = df[df['GENDER'] != 'none']

    # Save the cleaned DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Cleaned data saved to {output_csv}")


# Example usage:
input_csv_path = 'demographics_csv/metadata_participants.csv'  # Path to the input CSV file
output_csv_path = 'demographics_csv/cleaned_metadata_participants.csv'  # Path to the output CSV file
clean_metadata_participants(input_csv_path, output_csv_path)

Cleaned data saved to demographics_csv/cleaned_metadata_participants.csv


# 4.0 Merge keystroke data with metadata

In [2]:
import pandas as pd


def merge_metadata_keystroke(metadata_csv, keystroke_csv, output_csv):
    """
    Merges the cleaned metadata CSV file with the keystroke data CSV file on PARTICIPANT_ID
    and saves the merged data to a new CSV file.

    Parameters:
    - metadata_csv (str): The path to the cleaned metadata CSV file.
    - keystroke_csv (str): The path to the keystroke data CSV file.
    - output_csv (str): The path to the output CSV file to save merged data.
    """
    # Read the cleaned metadata CSV file into a DataFrame
    metadata_df = pd.read_csv(metadata_csv)

    # Read the keystroke data CSV file into a DataFrame
    keystroke_df = pd.read_csv(keystroke_csv)

    # Merge the DataFrames on PARTICIPANT_ID
    merged_df = pd.merge(metadata_df, keystroke_df, on='PARTICIPANT_ID')

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_csv, index=False)
    print(f"Merged data saved to {output_csv}")


# Example usage:
metadata_csv_path = 'demographics_csv/cleaned_metadata_participants.csv'  # Path to the cleaned metadata CSV file
keystroke_csv_path = 'demographics_csv/merged_sub_25_key_features.csv'  # Path to the keystroke data CSV file
output_csv_path = 'demographics_csv/demo_keystroke.csv'  # Path to the output CSV file
merge_metadata_keystroke(metadata_csv_path, keystroke_csv_path, output_csv_path)

Merged data saved to demographics_csv/demo_keystroke.csv


# 5.0 Display distribution and equalising sample amount

In [5]:
import pandas as pd


def display_gender_distribution(csv_path):
    """
    Displays the percentage distribution of male and female in the GENDER column of the given CSV file.

    Parameters:
    - csv_path (str): The path to the CSV file.
    """
    # Step 1: Read the CSV file
    df = pd.read_csv(csv_path)

    # Step 2: Calculate distribution
    gender_counts = df['GENDER'].value_counts(normalize=True) * 100

    # Step 3: Display distribution
    print("Gender Distribution:")
    for gender, percentage in gender_counts.items():
        print(f"{gender}: {percentage:.2f}%")


# Example usage:
csv_path = 'demographics_csv/demo_keystroke.csv'  # Path to the CSV file
display_gender_distribution(csv_path)

Gender Distribution:
female: 51.49%
male: 48.51%


In [2]:
import pandas as pd
csv_path = 'demographics_csv/demo_keystroke.csv'  # Path to the CSV file
df = pd.read_csv(csv_path, low_memory=False)

participant_ids = df['PARTICIPANT_ID']
print(len(participant_ids))
    
unique_ids = participant_ids.unique()
print("Unique IDs:", unique_ids)
print("Number of unique IDs:", len(unique_ids))

1631487
Unique IDs: [   459    466    469 ... 472726 472743 472752]
Number of unique IDs: 2234


In [2]:
import pandas as pd


def balance_gender_samples(input_csv, output_csv):
    """
    Balances the number of male and female samples in the given CSV file and saves the balanced data to a new CSV file.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save balanced data.
    """
    # Step 1: Read the CSV file
    df = pd.read_csv(input_csv)

    # Step 2: Separate male and female samples
    male_df = df[df['GENDER'] == 'male']
    female_df = df[df['GENDER'] == 'female']

    # Step 3: Determine the minimum sample size
    min_sample_size = min(len(male_df), len(female_df))

    # Step 4: Sample equal amounts
    balanced_male_df = male_df.sample(n=min_sample_size, random_state=42)
    balanced_female_df = female_df.sample(n=min_sample_size, random_state=42)

    # Step 5: Concatenate and save
    balanced_df = pd.concat([balanced_male_df, balanced_female_df])
    balanced_df.to_csv(output_csv, index=False)
    print(f"Balanced data saved to {output_csv}")


# Example usage:
input_csv_path = 'demographics_csv/demo_keystroke.csv'  # Path to the input CSV file
output_csv_path = 'demographics_csv/male_female_balanced.csv'  # Path to the output CSV file
balance_gender_samples(input_csv_path, output_csv_path)


Balanced data saved to demographics_csv/male_female_balanced.csv


In [3]:
import pandas as pd

def clean_gender_column(input_csv, output_csv):
    """
    Cleans the CSV file by dropping all rows where the GENDER column is not 'male' or 'female'.

    Parameters:
    - input_csv (str): The path to the input CSV file.
    - output_csv (str): The path to the output CSV file to save cleaned data.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv)

    # Drop rows where GENDER is not 'male' or 'female'
    df = df[df['GENDER'].isin(['male', 'female'])]

    # Save the cleaned DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Cleaned data saved to {output_csv}")

# Example usage:
input_csv_path = 'demographics_csv/male_female_balanced.csv'  # Replace with your actual input CSV path
output_csv_path = 'demographics_csv/male_female_balanced.csv'  # Replace with your desired output CSV path
clean_gender_column(input_csv_path, output_csv_path)

Cleaned data saved to demographics_csv/male_female_balanced.csv


In [6]:
display_gender_distribution(output_csv_path)

Gender Distribution:
male: 50.00%
female: 50.00%


# 6.0 KNN

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pickle
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def visualize_feature_importance(csv_path):
    logging.info('Starting feature importance visualization.')

    # Step 1: Read the CSV file
    logging.info('Reading CSV file.')
    df = pd.read_csv(csv_path, low_memory=False)

    # Drop rows with NaN or null values
    logging.info('Dropping rows with NaN or null values.')
    df = df.dropna()

    # Normalize GENDER column to lowercase for consistency
    logging.info('Normalizing GENDER column to lowercase.')
    df['GENDER'] = df['GENDER'].str.lower().map({'male': 1, 'female': 0})  # Map gender to binary

    # Filter valid GENDER values
    logging.info('Filtering valid GENDER values.')
    df = df[df['GENDER'].isin([0, 1])]
    if df['GENDER'].isna().any() or df['GENDER'].nunique() != 2:
        logging.error('Invalid GENDER column values after preprocessing.')
        return

    # Define features
    features = ['D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'Z_SCORE']
    X = df[features]
    y = df['GENDER']
    participant_ids = df['PARTICIPANT_ID']

    # Ensure unique PARTICIPANT_IDs in train and test sets
    unique_ids = participant_ids.unique()
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_mask = participant_ids.isin(train_ids)
    test_mask = participant_ids.isin(test_ids)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features
    logging.info('Scaling features.')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train KNN model
    logging.info('Training KNN model.')
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Predictions and confusion matrix
    logging.info('Predicting and evaluating.')
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logging.info(f'Accuracy: {accuracy}')

    # Debug y_test and y_pred
    logging.info(f"Unique values in y_test: {set(y_test)}")
    logging.info(f"Unique values in y_pred: {set(y_pred)}")

    if not set(y_test).issubset({0, 1}):
        logging.error(f"y_test contains unexpected values: {set(y_test)}")
        return
    if not set(y_pred).issubset({0, 1}):
        logging.error(f"y_pred contains unexpected values: {set(y_pred)}")
        return

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    plt.close()
    logging.info('Confusion matrix saved as confusion_matrix.png.')

    # Feature importance using permutation importance
    logging.info('Calculating feature importance.')
    try:
        result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
        feature_importance = pd.Series(result.importances_mean, index=features)
        feature_importance.sort_values(ascending=False, inplace=True)

        # Save feature importance as an image
        plt.figure(figsize=(10, 6))
        feature_importance.plot(kind='bar', color='skyblue')
        plt.title('Feature Importance')
        plt.xlabel('Features')
        plt.ylabel('Importance')
        plt.savefig('feature_importance.png')
        plt.close()
        logging.info('Feature importance saved as feature_importance.png.')
    except ValueError as e:
        logging.error(f"Error during permutation importance: {e}")
        return

    # Save artifacts
    with open('knn_model.pkl', 'wb') as f:
        pickle.dump(knn, f)
    with open('scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    with open('confusion_matrix.pkl', 'wb') as f:
        pickle.dump(cm, f)
    logging.info('Model, scaler, and confusion matrix saved as .pkl files.')

# Example usage:
csv_path = 'demographics_csv/male_female_balanced.csv'  # Update with your dataset path
visualize_feature_importance(csv_path)


2024-12-10 02:06:37,174 - INFO - Starting feature importance visualization.
2024-12-10 02:06:37,175 - INFO - Reading CSV file.
2024-12-10 02:06:50,748 - INFO - Dropping rows with NaN or null values.
2024-12-10 02:06:51,647 - INFO - Normalizing GENDER column to lowercase.
2024-12-10 02:06:52,021 - INFO - Filtering valid GENDER values.
2024-12-10 02:06:52,554 - INFO - Scaling features.
2024-12-10 02:06:52,696 - INFO - Training KNN model.
2024-12-10 02:06:57,587 - INFO - Predicting and evaluating.
2024-12-10 02:10:59,667 - INFO - Accuracy: 0.49368629349125903
2024-12-10 02:10:59,680 - INFO - Unique values in y_test: {0, 1}
2024-12-10 02:10:59,695 - INFO - Unique values in y_pred: {0, 1}
2024-12-10 02:10:59,865 - INFO - Confusion matrix saved as confusion_matrix.png.
2024-12-10 02:10:59,866 - INFO - Calculating feature importance.
2024-12-10 02:19:21,784 - ERROR - Error during permutation importance: Classification metrics can't handle a mix of unknown and binary targets


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pickle
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def visualize_feature_importance(csv_path):
    logging.info('Starting feature importance visualization.')

    # Step 1: Read the CSV file
    logging.info('Reading CSV file.')
    df = pd.read_csv(csv_path, low_memory=False)

    # Drop rows with NaN or null values
    logging.info('Dropping rows with NaN or null values.')
    df = df.dropna()

    # Normalize GENDER column to lowercase for consistency
    logging.info('Normalizing GENDER column to lowercase.')
    df['GENDER'] = df['GENDER'].str.lower().map({'male': 1, 'female': 0})  # Map gender to binary

    # Check for valid data
    if df['GENDER'].isna().any() or df['GENDER'].nunique() != 2:
        logging.error('Invalid GENDER column values after preprocessing.')
        return

    # Define features
    features = ['D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'Z_SCORE']
    X = df[features]
    y = df['GENDER']
    participant_ids = df['PARTICIPANT_ID']

    # Ensure unique PARTICIPANT_IDs in train and test sets
    unique_ids = participant_ids.unique()
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_mask = participant_ids.isin(train_ids)
    test_mask = participant_ids.isin(test_ids)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features
    logging.info('Scaling features.')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train KNN model
    logging.info('Training KNN model.')
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Predictions and confusion matrix
    logging.info('Predicting and evaluating.')
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logging.info(f'Accuracy: {accuracy}')

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Save confusion matrix as an image
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    plt.close()
    logging.info('Confusion matrix saved as confusion_matrix.png.')

    # Feature importance using permutation importance
    logging.info('Calculating feature importance.')
    result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    feature_importance = pd.Series(result.importances_mean, index=features)
    feature_importance.sort_values(ascending=False, inplace=True)

    # Save feature importance as an image
    plt.figure(figsize=(10, 6))
    feature_importance.plot(kind='bar', color='skyblue')
    plt.title('Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.savefig('feature_importance.png')
    plt.close()
    logging.info('Feature importance saved as feature_importance.png.')

    # Save artifacts
    with open('knn_model.pkl', 'wb') as f:
        pickle.dump(knn, f)
    with open('scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    with open('confusion_matrix.pkl', 'wb') as f:
        pickle.dump(cm, f)
    logging.info('Model, scaler, and confusion matrix saved as .pkl files.')

# Example usage:
csv_path = 'demographics_csv/male_female_balanced.csv'  # Update with your dataset path
visualize_feature_importance(csv_path)


2024-12-10 01:17:36,614 - INFO - Starting feature importance visualization.
2024-12-10 01:17:36,615 - INFO - Reading CSV file.
2024-12-10 01:17:49,149 - INFO - Dropping rows with NaN or null values.
2024-12-10 01:17:49,894 - INFO - Normalizing GENDER column to lowercase.
2024-12-10 01:17:50,378 - INFO - Scaling features.
2024-12-10 01:17:50,496 - INFO - Training KNN model.
2024-12-10 01:17:54,876 - INFO - Predicting and evaluating.
2024-12-10 01:22:00,932 - INFO - Accuracy: 0.49368629349125903
2024-12-10 01:22:01,244 - INFO - Confusion matrix saved as confusion_matrix.png.
2024-12-10 01:22:01,245 - INFO - Calculating feature importance.


ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def visualize_feature_importance(csv_path):
    # Step 1: Read the CSV file
    df = pd.read_csv(csv_path, low_memory=False)

    # Step 2: Preprocess data
    df = df.dropna()
    label_encoder = LabelEncoder()
    df['GENDER'] = label_encoder.fit_transform(df['GENDER'])
    features = [
        'AVG_WPM_15', 'AVG_IKI', 'ECPC', 'KSPC', 'ROR', 'D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'Z_SCORE'
    ]
    X = df[features]
    y = df['GENDER']
    participant_ids = df['PARTICIPANT_ID']

    # Ensure unique PARTICIPANT_IDs in train and test sets
    unique_ids = participant_ids.unique()
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_mask = participant_ids.isin(train_ids)
    test_mask = participant_ids.isin(test_ids)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train KNN model
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Calculate feature importance using permutation importance
    result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    importance = result.importances_mean

    # Visualize feature importance
    feature_importance = pd.Series(importance, index=features)
    feature_importance.sort_values(ascending=False, inplace=True)

    plt.figure(figsize=(10, 6))
    feature_importance.plot(kind='bar')
    plt.title('Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()

    # Print accuracy
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')

# Example usage:
csv_path = 'demographics_csv/demo_keystroke.csv'  # Path to the CSV file
visualize_feature_importance(csv_path)

KeyboardInterrupt: 

# 7.0 ANN

In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score

# Step 1: Read the CSV file with dtype specification
df = pd.read_csv('demographics_csv/male_female_balanced.csv', low_memory=False)

# Step 2: Preprocess data
# Drop rows with missing values
df = df.dropna()

# Encode the 'GENDER' column
label_encoder = LabelEncoder()
df['GENDER'] = label_encoder.fit_transform(df['GENDER'])

# Select specified features and target
features = [
    'D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'Z_SCORE', 'U1D2', 'SPEED_CLASS', 'HAND_HOLD_TIME', 'KEY_HOLD_TIME_STD',
    'HAND'
]
X = df[features]
y = df['GENDER']
participant_ids = df['PARTICIPANT_ID']

# Ensure unique PARTICIPANT_IDs in train and test sets
unique_ids = participant_ids.unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
train_mask = participant_ids.isin(train_ids)
test_mask = participant_ids.isin(test_ids)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert target to categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Step 4: Build ANN model
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 5: Train ANN model
model.fit(X_train, y_train, epochs=20, batch_size=10, validation_split=0.2)

# Step 6: Evaluate model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)
accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f'Accuracy: {accuracy:.2f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.6088 - loss: 0.6683 - val_accuracy: 0.0241 - val_loss: 0.9457
Epoch 2/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6102 - loss: 0.6611 - val_accuracy: 0.1204 - val_loss: 0.8877
Epoch 3/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6235 - loss: 0.6527 - val_accuracy: 0.1724 - val_loss: 0.9165
Epoch 4/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.6292 - loss: 0.6439 - val_accuracy: 0.2425 - val_loss: 0.9290
Epoch 5/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.6407 - loss: 0.6363 - val_accuracy: 0.2061 - val_loss: 0.9486
Epoch 6/20
[1m6556/6556[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.6492 - loss: 0.6323 - val_accuracy: 0.3278 - val_loss: 0.8418
Epoch 7/20

# 8.0 SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

def train_svm_model(csv_path):
    # Read the CSV file
    df = pd.read_csv(csv_path, low_memory=False)

    # Preprocess data
    df = df.dropna()
    label_encoder = LabelEncoder()
    df['GENDER'] = label_encoder.fit_transform(df['GENDER'])
    features = ['D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'SPEED_CLASS', 'HAND_HOLD_TIME', 'KEY_HOLD_TIME_STD', 'HAND',
                'Z_SCORE', 'U1D2', 'CONSECUTIVE_KEYS', 'CONSECUTIVE_KEYS_TIME']
    X = df[features]
    y = df['GENDER']
    participant_ids = df['PARTICIPANT_ID']

    # Ensure unique PARTICIPANT_IDs in train and test sets
    unique_ids = participant_ids.unique()
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_mask = participant_ids.isin(train_ids)
    test_mask = participant_ids.isin(test_ids)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train SVM model
    svm_model = SVC()
    svm_model.fit(X_train, y_train)

    # Evaluate SVM model
    y_pred_svm = svm_model.predict(X_test)
    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    print(f'SVM Accuracy: {accuracy_svm:.2f}')

    # Display feature importance using permutation importance
    result = permutation_importance(svm_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    importance = result.importances_mean
    feature_importance = pd.Series(importance, index=features)
    feature_importance.sort_values(ascending=False, inplace=True)

    plt.figure(figsize=(10, 6))
    feature_importance.plot(kind='bar')
    plt.title('SVM Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()

    # Display confusion matrix
    cm = confusion_matrix(y_test, y_pred_svm)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('SVM Confusion Matrix')
    plt.show()

csv_path = 'demographics_csv/male_female_balanced.csv'  # Replace with your actual CSV path
train_svm_model(csv_path)

# 9.0 Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

def train_rf_model(csv_path):
    # Read the CSV file
    df = pd.read_csv(csv_path, low_memory=False)

    # Preprocess data
    df = df.dropna()
    label_encoder = LabelEncoder()
    df['GENDER'] = label_encoder.fit_transform(df['GENDER'])
    features = ['D1U1', 'D1U2', 'D1U3', 'D1D2', 'D1D3', 'HAND_HOLD_TIME', 'KEY_HOLD_TIME_STD', 'HAND',
                'Z_SCORE', 'U1D2', 'CONSECUTIVE_KEYS', 'CONSECUTIVE_KEYS_TIME', 'SPEED_CLASS', 'AVG_WPM_15', 'AVG_IKI', 'ECPC', 'KSPC', 'ROR']
    X = df[features]
    y = df['GENDER']
    participant_ids = df['PARTICIPANT_ID']

    # Ensure unique PARTICIPANT_IDs in train and test sets
    unique_ids = participant_ids.unique()
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
    train_mask = participant_ids.isin(train_ids)
    test_mask = participant_ids.isin(test_ids)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train Random Forest model
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # Evaluate Random Forest model
    y_pred_rf = rf_model.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f'Random Forest Accuracy: {accuracy_rf:.2f}')

    # Display feature importance using permutation importance
    result = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    importance = result.importances_mean
    feature_importance = pd.Series(importance, index=features)
    feature_importance.sort_values(ascending=False, inplace=True)

    plt.figure(figsize=(10, 6))
    feature_importance.plot(kind='bar')
    plt.title('Random Forest Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()

    # Display confusion matrix
    cm = confusion_matrix(y_test, y_pred_rf)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Random Forest Confusion Matrix')
    plt.show()

csv_path = 'demographics_csv/male_female_balanced.csv'  # Replace with your actual CSV path
train_rf_model(csv_path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Load the dataset
df = pd.read_csv('demographics_csv/male_female_balanced.csv')

# Preprocess data
df = df.dropna()
label_encoder = LabelEncoder()
df['GENDER'] = label_encoder.fit_transform(df['GENDER'])

# Convert all categorical columns to numeric
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = df[column].astype('category').cat.codes

# Box Plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='GENDER', y='SPEED_CLASS', data=df)
plt.title('Box Plot of SPEED_CLASS by Gender')
plt.show()

# Violin Plot
plt.figure(figsize=(10, 6))
sns.violinplot(x='GENDER', y='SPEED_CLASS', data=df)
plt.title('Violin Plot of SPEED_CLASS by Gender')
plt.show()

# Histogram
plt.figure(figsize=(10, 6))
df[df['GENDER'] == 0]['SPEED_CLASS'].hist(alpha=0.5, color='blue', bins=30, label='Male')
df[df['GENDER'] == 1]['SPEED_CLASS'].hist(alpha=0.5, color='red', bins=30, label='Female')
plt.title('Histogram of SPEED_CLASS by Gender')
plt.xlabel('SPEED_CLASS')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Pair Plot
sns.pairplot(df, hue='GENDER', vars=['SPEED_CLASS', 'CONSECUTIVE_KEYS', 'HAND'])
plt.show()

# Heatmap
plt.figure(figsize=(10, 6))
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Heatmap of Feature Correlations')
plt.show()

# PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df.drop(columns=['GENDER']))
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='GENDER', data=df, palette=['blue', 'red'])
plt.title('PCA of Features by Gender')
plt.show()