In [ ]:
# Read up to 10k .txt files from a local folder and copy them into this project's 'samples' folder

import os
import shutil

def organize_samples(source_dir, project_dir, total_files=10000, files_per_folder=100):
    """
    Copy .txt files from the source directory into numbered sub-folders
    ('samples_01', 'samples_02', ...) of the 'samples' folder in the project directory.

    Files are taken in sorted name order. A sub-folder is only created when there
    is at least one file to put into it, so no empty folders are left behind and
    the reported folder count matches what is on disk.

    Parameters:
    - source_dir (str): The path to the directory containing the original .txt files.
    - project_dir (str): The root path to your project directory (which contains the 'samples' folder).
    - total_files (int): The maximum number of files to process (default is 10,000).
      Values below 1 result in no files being copied.
    - files_per_folder (int): The number of files per folder (default is 100).

    Raises:
    - ValueError: If files_per_folder is smaller than 1.
    """
    if files_per_folder < 1:
        raise ValueError('files_per_folder must be at least 1')

    # Path to the 'samples' folder in the project directory; create it if missing
    samples_dir = os.path.join(project_dir, 'samples')
    os.makedirs(samples_dir, exist_ok=True)

    # Collect regular .txt files only: a directory that happens to be named
    # '*.txt' would make shutil.copy2 fail. Sorting keeps the selection deterministic.
    all_files = sorted(
        name for name in os.listdir(source_dir)
        if name.endswith('.txt') and os.path.isfile(os.path.join(source_dir, name))
    )

    # Limit to the total number of files specified (a negative limit must not
    # turn into a "from the end" slice)
    files_to_copy = all_files[:max(total_files, 0)]

    folders_used = 0
    total_copied = 0
    current_folder_path = None

    for index, file_name in enumerate(files_to_copy):
        # Start a new folder lazily, right before the first file that goes into it
        if index % files_per_folder == 0:
            folders_used += 1
            current_folder_path = os.path.join(samples_dir, f'samples_{folders_used:02d}')
            os.makedirs(current_folder_path, exist_ok=True)

        shutil.copy2(
            os.path.join(source_dir, file_name),
            os.path.join(current_folder_path, file_name),
        )
        total_copied += 1

    print(f'Total files copied: {total_copied}')
    print(f'Files organized into {folders_used} folders.')

# Example usage: point both placeholders at real directories before running.
source_directory = '/path/to/your/source/folder'  # Folder holding the original .txt files (e.g. the 167,000-file dump)
project_directory = '/path/to/your/pycharm/project'  # PyCharm project root; 'samples' is created beneath it

organize_samples(source_dir=source_directory, project_dir=project_directory)


In [1]:
import pandas as pd

# Load the keystrokes data from the tab-separated txt file
keystroke_file = '/mnt/data/198_keystrokes.txt'
columns = ['PARTICIPANT_ID', 'TEST_SECTION_ID', 'SENTENCE', 'USER_INPUT', 'KEYSTROKE_ID', 
           'PRESS_TIME', 'RELEASE_TIME', 'LETTER', 'KEYCODE']

# Load into DataFrame. Because the column names are passed explicitly, every
# line of the file (including a header line, if the file has one) is read as data.
df = pd.read_csv(keystroke_file, sep='\t', names=columns)

# Convert the PRESS_TIME and RELEASE_TIME columns to numeric values.
# Unparsable entries (e.g. a header line read as data) become NaN instead of
# raising, so they are removed by the dropna() below.
df['PRESS_TIME'] = pd.to_numeric(df['PRESS_TIME'], errors='coerce')
df['RELEASE_TIME'] = pd.to_numeric(df['RELEASE_TIME'], errors='coerce')

# Clean the data: remove any rows containing NaN or null values
df = df.dropna()

# Create the features D1U1, D1U2, D1U3, D1D2, D1D3
# NOTE(review): the shifts below run over the whole file, so the last rows of one
# TEST_SECTION_ID are paired with the first rows of the next - confirm whether the
# features should instead be computed per test section.

# D1U1: Time between key press and release of the same key
df['D1U1'] = df['RELEASE_TIME'] - df['PRESS_TIME']

# D1U2: Time between key press of current key and release of the next key
df['D1U2'] = df['RELEASE_TIME'].shift(-1) - df['PRESS_TIME']

# D1U3: Time between key press of current key and release of the key after the next one
df['D1U3'] = df['RELEASE_TIME'].shift(-2) - df['PRESS_TIME']

# D1D2: Time between pressing two consecutive keys
df['D1D2'] = df['PRESS_TIME'].shift(-1) - df['PRESS_TIME']

# D1D3: Time between pressing the current key and the one after the next one
df['D1D3'] = df['PRESS_TIME'].shift(-2) - df['PRESS_TIME']

# Drop the trailing rows that have no following keystrokes (their shifted values are NaN)
df = df.dropna()

# Display the resulting DataFrame. ace_tools may not be installed in every
# environment, so fall back to a plain-text preview when it cannot be imported.
try:
    import ace_tools as tools
except ImportError:
    print(df.head())
else:
    tools.display_dataframe_to_user(name="Keystroke Data with Features", dataframe=df)

# Save to CSV for future use
df.to_csv('/mnt/data/keystroke_features.csv', index=False)


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/198_keystrokes.txt'