In [2]:
import pandas as pd
import os
import glob

# Folder where the CSV files were downloaded
folder_path = '../data/daily_count/raw'

# glob returns paths in arbitrary, OS-dependent order; sort them so the row
# order of the combined frame is reproducible across runs and machines.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder is empty or the path is wrong.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read every CSV and stack them into one DataFrame with a fresh 0..n-1 index
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Display the first few rows to verify
print(df.head())


   user_id  day_of_week  day_ratio
0      396            1   0.147690
1      396            2   0.111386
2      396            3   0.216997
3      396            4   0.134488
4      396            5   0.155116


In [3]:
# Pivot the long (user_id, day_of_week, day_ratio) rows into one row per user
# with one column per weekday.

# Explicit code -> label mapping. The previous version labelled the pivoted
# columns purely by position, which presumes day_of_week holds exactly 1..7
# with 1 = Sunday -- TODO confirm this encoding against the upstream data source.
day_labels = {1: "Sun", 2: "Mon", 3: "Tues", 4: "Wed", 5: "Thur", 6: "Fri", 7: "Sat"}

# Reject codes outside the mapping instead of letting the reindex below
# silently drop them.
unexpected_days = set(df['day_of_week'].unique()) - set(day_labels)
if unexpected_days:
    raise ValueError(f"Unexpected day_of_week values: {sorted(unexpected_days)}")

pivot_df = (
    df.pivot(index='user_id', columns='day_of_week', values='day_ratio')
    # Guarantee all seven weekday columns, in order, even when a weekday never
    # occurs in the data (positional relabelling would raise a length mismatch).
    .reindex(columns=list(day_labels))
    .rename(columns=day_labels)
    # Drop the leftover "day_of_week" name on the columns axis
    .rename_axis(None, axis="columns")
    # A user with no row for a given weekday gets ratio 0 for that day
    .fillna(0)
    # Turn user_id back into a regular column
    .reset_index()
)

# Display the pivoted DataFrame
print(pivot_df.head())


   user_id       Sun       Mon      Tues       Wed      Thur       Fri  \
0      396  0.147690  0.111386  0.216997  0.134488  0.155116  0.061056   
1     1096  0.000000  0.151163  0.325581  0.500000  0.000000  0.023256   
2     1196  0.063014  0.147945  0.263014  0.117808  0.210959  0.120548   
3     2996  0.122186  0.130761  0.128617  0.158628  0.140407  0.209003   
4     3096  0.014006  0.201681  0.100840  0.148459  0.280112  0.179272   

        Sat  
0  0.173267  
1  0.000000  
2  0.076712  
3  0.110397  
4  0.075630  


In [4]:
# Show the pivoted frame as the cell's result; the rendered output below is
# truncated (first and last rows with an elided "..." row in between).
pivot_df

Unnamed: 0,user_id,Sun,Mon,Tues,Wed,Thur,Fri,Sat
0,396,0.147690,0.111386,0.216997,0.134488,0.155116,0.061056,0.173267
1,1096,0.000000,0.151163,0.325581,0.500000,0.000000,0.023256,0.000000
2,1196,0.063014,0.147945,0.263014,0.117808,0.210959,0.120548,0.076712
3,2996,0.122186,0.130761,0.128617,0.158628,0.140407,0.209003,0.110397
4,3096,0.014006,0.201681,0.100840,0.148459,0.280112,0.179272,0.075630
...,...,...,...,...,...,...,...,...
43202,60374096,0.000000,0.000000,0.000000,0.000000,0.000000,0.070000,0.930000
43203,60374496,0.000000,0.000000,0.000000,0.000000,0.000000,0.031915,0.968085
43204,60375796,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
43205,60375896,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000


In [6]:
# Output folder for the processed (pivoted) table
fd = "../data/daily_count/processed"

# exist_ok=True keeps this cell re-runnable: a plain os.makedirs(fd) raises
# FileExistsError once the folder has been created by an earlier run.
os.makedirs(fd, exist_ok=True)

# NOTE(review): the default index=True also writes the 0..n-1 row index as an
# unnamed first column; left unchanged in case downstream readers rely on it.
pivot_df.to_csv(os.path.join(fd, "pivoted.csv"))