In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
raw_logs = pd.read_csv('train_logs.csv')

# Create 80/20 train/test split, ensuring the same 'id' is kept in each set
unique_ids = raw_logs['id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)

train = raw_logs[raw_logs['id'].isin(train_ids)]
test = raw_logs[raw_logs['id'].isin(test_ids)]

train.to_csv('train_logs_grouped.csv', index=False)
test.to_csv('test_logs_grouped.csv', index=False)

In [5]:
def aggregate_user_logs(data):
    # Group 'Move From [x1, y1] To [x2, y2]' activities as 'Move From To'
    data['activity_processed'] = data['activity'].replace(
        to_replace=r'^Move From \[\d+, \d+\] To \[\d+, \d+\]$', value='Move From To', regex=True)

    # Group by user ID
    grouped = data.groupby('id')
    
    # Calculate latest uptime - earliest downtime
    time_spent = grouped.apply(lambda x: x['up_time'].max() - x['down_time'].min()).rename('time_worked')

    # Calculate average action time
    avg_action_time = grouped['action_time'].mean().rename('avg_action_time')

    # Count occurrences of each activity type
    activity_counts = grouped['activity_processed'].value_counts().unstack(fill_value=0)

    # Count the number of "q"s and special characters in text_change
    count_letters = grouped['text_change'].apply(lambda x: x.str.count('q').sum()).rename('count_letters')
    count_special_chars = grouped['text_change'].apply(lambda x: x.str.count('[^q]').sum()).rename('count_special_chars')

    # Get final word count
    final_word_count = grouped['word_count'].max().rename('final_word_count')

    # Combine all aggregations
    result = pd.concat([time_spent, avg_action_time, activity_counts, count_letters, count_special_chars, final_word_count], axis=1)
    result = result.reset_index()
    return result

In [6]:
# Aggregate the training data
aggregated_train_data = aggregate_user_logs(train)

aggregated_train_data.to_csv('train_logs_aggregated.csv', index=False)
print(aggregated_train_data.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['activity_processed'] = data['activity'].replace(


         id  time_worked  avg_action_time  Input  Move From To  Nonproduction  \
0  001519c8      1797443       116.246774   2010             3            120   
1  0022f953      1758346       112.221271   1938             0            254   
2  0042269b      1767228       101.837766   3515             0            175   
3  0059420b      1363074       121.848329   1304             0             99   
4  0075873a      1584002       123.943896   1942             0             72   

   Paste  Remove/Cut  Replace  count_letters  count_special_chars  \
0      0         417        7           2016                 1489   
1      1         260        1           1705                 2538   
2      0         439        7           3675                 2210   
3      1         151        1           1168                 1108   
4      0         517        0           1964                 1071   

   final_word_count  
0               256  
1               323  
2               404  
3         