In [1]:
# from a local folder read 10k files and put them in project here
# can adjust file amount 

import os
import shutil

def organize_samples(source_dir, project_dir, total_files=10000, files_per_folder=100):
    """
    Organizes .txt files from the source directory into multiple folders
    in the 'samples' folder in the project directory, each containing a specified number of files.

    Parameters:
    - source_dir (str): The path to the directory containing the original .txt files.
    - project_dir (str): The root path to your project directory (which contains the 'samples' folder).
    - total_files (int): The total number of files to process (default is 10,000).
    - files_per_folder (int): The number of files per folder (default is 100).
    """
    # Path to the 'samples' folder in the project directory
    samples_dir = os.path.join(project_dir, 'samples')
    
    # Ensure the 'samples' directory exists in the project directory
    os.makedirs(samples_dir, exist_ok=True)

    # Get a list of all .txt files in the source directory
    all_files = [f for f in os.listdir(source_dir) if f.endswith('.txt')]
    
    # Sort the files (optional, depending on whether you want them in a specific order)
    all_files.sort()

    # Limit to the total number of files specified
    files_to_copy = all_files[:total_files]

    folder_number = 1
    file_count = 0
    total_copied = 0

    # Create the first folder (samples_01)
    current_folder_name = f'samples_{folder_number:02d}'
    current_folder_path = os.path.join(samples_dir, current_folder_name)
    os.makedirs(current_folder_path, exist_ok=True)

    for file_name in files_to_copy:
        source_file = os.path.join(source_dir, file_name)
        target_file = os.path.join(current_folder_path, file_name)

        # Copy the file
        shutil.copy2(source_file, target_file)
        file_count += 1
        total_copied += 1

        # Check if the current folder has reached the desired number of files
        if file_count >= files_per_folder:
            folder_number += 1
            if total_copied >= total_files:
                break  # Stop if we've copied the total desired number of files
            # Reset file count and create a new folder
            file_count = 0
            current_folder_name = f'samples_{folder_number:02d}'
            current_folder_path = os.path.join(samples_dir, current_folder_name)
            os.makedirs(current_folder_path, exist_ok=True)
    
    print(f'Total files copied: {total_copied}')
    print(f'Files organized into {folder_number} folders.')

# Example usage:
source_directory = r"C:\Users\laras\Downloads\Keystrokes\Keystrokes\files"
project_directory = 'samples'  # Replace with the root path to your PyCharm project

organize_samples(source_directory, project_directory)


Total files copied: 10000
Files organized into 101 folders.


In [1]:
import os
import pandas as pd

def try_multiple_delimiters(file_path):
    """
    Attempts to read a file using different delimiters.
    """
    delimiters = ['\t', ',', ';']  # Common delimiters to try
    for delimiter in delimiters:
        try:
            df = pd.read_csv(file_path, sep=delimiter, encoding='utf-8', on_bad_lines='skip', dtype={'PARTICIPANT_ID': str})
            # If we get more than one column, we assume we have the right delimiter
            if df.shape[1] > 1:
                return df
        except Exception as e:
            pass  # Try the next delimiter
    raise ValueError("Could not determine delimiter")


def read_keystroke_data(samples_dir, output_csv='all_samples_combined.csv'):
    """
    Reads keystroke data from multiple subfolders, handles errors, and saves valid data to CSV.
    
    Parameters:
    - samples_dir (str): The directory where the 'samples' folder resides, which contains multiple subfolders with .txt files.
    - output_csv (str): The output path for the CSV file to save valid data.

    Returns:
    - A pandas DataFrame containing valid keystroke data.
    """
    all_data = []  # To hold data from all valid files
    total_files_processed = 0  # Counter to keep track of the total number of files processed
    total_rows_processed = 0  # Counter for the total number of rows across all files
    skipped_files = 0  # Counter for the number of skipped files

    # Expected columns in the file
    required_columns = ['PARTICIPANT_ID', 'TEST_SECTION_ID', 'SENTENCE', 'USER_INPUT',
                        'KEYSTROKE_ID', 'PRESS_TIME', 'RELEASE_TIME', 'LETTER', 'KEYCODE']

    # Traverse each subfolder (e.g., samples_01, samples_02, ..., samples_100)
    for subdir in os.listdir(samples_dir):
        subdir_path = os.path.join(samples_dir, subdir)
        
        if os.path.isdir(subdir_path):
            # Traverse each subfolder inside (e.g., the subfolders inside samples_01, samples_02, etc.)
            for inner_subdir in os.listdir(subdir_path):
                inner_subdir_path = os.path.join(subdir_path, inner_subdir)
                
                if os.path.isdir(inner_subdir_path):
                    # Traverse each .txt file in the sub-subfolder
                    for file_name in os.listdir(inner_subdir_path):
                        if file_name.endswith('.txt'):
                            file_path = os.path.join(inner_subdir_path, file_name)
                            total_files_processed += 1  # Increment the counter for each file

                            print(f"Processing file: {file_path}")  # Debugging statement

                            try:
                                # Try reading the file using multiple delimiters
                                df = try_multiple_delimiters(file_path)

                                # Log the number of rows in the current file
                                print(f"File {file_name} has {df.shape[0]} rows")

                                # Ensure required columns exist
                                if not all(col in df.columns for col in required_columns):
                                    print(f"Missing columns in {file_path}")
                                    skipped_files += 1
                                    continue  # Skip this file if it doesn't have the required columns

                                # If the number of columns is 1, it means that the file might be incorrectly formatted
                                if len(df.columns) == 1:
                                    print(f"Malformed data in {file_path}, skipping.")
                                    skipped_files += 1
                                    continue  # Skip files with misformatted rows

                                # Append the number of rows to the total rows processed
                                total_rows_processed += df.shape[0]

                                # Append the valid DataFrame to the list
                                all_data.append(df)
                            
                            except Exception as e:
                                print(f"Error processing file {file_path}: {e}")
                                skipped_files += 1
                                continue  # Skip files with errors like encoding issues or missing data

    # Concatenate all valid data into one DataFrame
    if not all_data:
        print("No valid data found.")
        return pd.DataFrame()  # Return an empty DataFrame if no valid data was found

    full_data = pd.concat(all_data, ignore_index=True)

    # Save the data to CSV
    full_data.to_csv(output_csv, index=False)
    print(f"Valid data saved to {output_csv}")
    
    # Print the total number of files processed and total rows processed
    print(f"Total number of .txt files processed: {total_files_processed}")
    print(f"Total number of rows processed: {total_rows_processed}")
    print(f"Total number of skipped files: {skipped_files}")

    return full_data

# Example usage:
samples_directory = "samples"  # Replace with your actual path
read_keystroke_data(samples_directory, output_csv='all_samples_combined.csv')


Processing file: samples\samples\samples_01\100001_keystrokes.txt
File 100001_keystrokes.txt has 658 rows
Processing file: samples\samples\samples_01\100003_keystrokes.txt
File 100003_keystrokes.txt has 806 rows
Processing file: samples\samples\samples_01\100007_keystrokes.txt
File 100007_keystrokes.txt has 801 rows
Processing file: samples\samples\samples_01\100008_keystrokes.txt
File 100008_keystrokes.txt has 687 rows
Processing file: samples\samples\samples_01\100013_keystrokes.txt
File 100013_keystrokes.txt has 744 rows
Processing file: samples\samples\samples_01\100016_keystrokes.txt
File 100016_keystrokes.txt has 776 rows
Processing file: samples\samples\samples_01\10001_keystrokes.txt
File 10001_keystrokes.txt has 763 rows
Processing file: samples\samples\samples_01\100020_keystrokes.txt
File 100020_keystrokes.txt has 649 rows
Processing file: samples\samples\samples_01\100030_keystrokes.txt
File 100030_keystrokes.txt has 654 rows
Processing file: samples\samples\samples_01\1000

Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,PRESS_TIME,RELEASE_TIME,LETTER,KEYCODE
0,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891207.0,1.473275e+12,1.473275e+12,SHIFT,16.0
1,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891214.0,1.473275e+12,1.473275e+12,W,87.0
2,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891219.0,1.473275e+12,1.473275e+12,a,65.0
3,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891226.0,1.473275e+12,1.473275e+12,s,83.0
4,100001,1090979,Was wondering if you and Natalie connected?,Was wondering if you and Natalie connected?,51891231.0,1.473275e+12,1.473275e+12,,32.0
...,...,...,...,...,...,...,...,...,...
7038466,127036,1380699,"Valerie, the prices look fine in Sitara.","Valerie, the prices look fine in Sitara.",65666276.0,1.473655e+12,1.473655e+12,t,84.0
7038467,127036,1380699,"Valerie, the prices look fine in Sitara.","Valerie, the prices look fine in Sitara.",65666277.0,1.473655e+12,1.473655e+12,a,65.0
7038468,127036,1380699,"Valerie, the prices look fine in Sitara.","Valerie, the prices look fine in Sitara.",65666278.0,1.473655e+12,1.473655e+12,r,82.0
7038469,127036,1380699,"Valerie, the prices look fine in Sitara.","Valerie, the prices look fine in Sitara.",65666279.0,1.473655e+12,1.473655e+12,a,65.0
