Group 5: LI LINGYU, LIU YICHAO, WU JINGYAN, YANG QINGSHAN, YE FANGDA

In [1]:
# Load env and data
import pandas as pd
import config
from tqdm import tqdm

chicago_clean = pd.read_csv(config.TRAIN_FILE_CLEAN2)
chicago_clean.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
chicago_clean

Unnamed: 0,Date,0_7,0_8,0_9,1_3,1_4,1_6,1_7,1_8,1_9,...,7_7,8_0,8_1,8_2,8_3,8_4,9_0,9_1,9_2,9_3
0,2001-01-01 00:00:00,0,1,0,0,0,0,0,0,0,...,0,0,0,3,1,0,0,0,4,1
1,2001-01-01 01:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001-01-01 02:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2001-01-01 03:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2001-01-01 04:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210692,2025-01-13 20:00:00,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
210693,2025-01-13 21:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
210694,2025-01-13 22:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
210695,2025-01-13 23:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Construct time features
chicago_clean['Date'] = pd.to_datetime(chicago_clean['Date'], format='%Y-%m-%d %H:%M:%S')
chicago_clean['Hour'] = chicago_clean['Date'].dt.hour
chicago_clean['Day'] = chicago_clean['Date'].dt.day
chicago_clean['Month'] = chicago_clean['Date'].dt.month
chicago_clean['DayOfWeek'] = chicago_clean['Date'].dt.dayofweek
chicago_clean['Year'] = chicago_clean['Date'].dt.year
chicago_clean['IsWeekend'] = chicago_clean['DayOfWeek'].isin([5,6]).astype(int)
chicago_clean['TimeOfDay'] = pd.cut(chicago_clean['Hour'], 
                                   bins=[-1,6,12,18,24], 
                                   labels=['Night','Morning','Afternoon','Evening'], 
                                   right=False)
chicago_clean['Season'] = pd.cut(chicago_clean['Month'],
                                bins=[0, 4, 7, 10, 13],
                                labels=['Spring','Summer','Fall', 'Winter'],
                                right=False)
# Drop initial date column
chicago_clean = chicago_clean.drop('Date', axis=1)
chicago_clean.head()

# TODO -> 注意这里还没one-hot

Unnamed: 0,0_7,0_8,0_9,1_3,1_4,1_6,1_7,1_8,1_9,2_3,...,9_2,9_3,Hour,Day,Month,DayOfWeek,Year,IsWeekend,TimeOfDay,Season
0,0,1,0,0,0,0,0,0,0,0,...,4,1,0,1,1,0,2001,0,Night,Spring
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,2001,0,Night,Spring
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,1,1,0,2001,0,Night,Spring
3,0,0,0,0,0,0,0,0,0,0,...,0,0,3,1,1,0,2001,0,Night,Spring
4,0,0,0,0,0,0,0,0,0,0,...,0,0,4,1,1,0,2001,0,Night,Spring


In [None]:
#construct spatial features

# 6hour, 1day, 1week, 1month, 3month, 1year
WINDOW_SECS = [6, 24, 24*7, 24*30, 24*30*3, 24*365]
GRID_LIST = [c for c in chicago_clean.columns if '_' in c]

# 我们目前的地理feature就是给定一个时间窗口 + 距离，看累积有多少犯罪
# 这样可以捕捉到犯罪在时空间上的扩散和传递
def find_nearby_grid(grid, distance):
    x, y = grid.split('_')
    x, y = int(x), int(y)
    nearby_grids = {grid}
    for _ in range(distance):
        new_grids = set()
        for grid in nearby_grids:
            x, y = grid.split('_')
            x, y = int(x), int(y)
            for dx in [-1, 0, 1]:
                for dy in [-1, 0, 1]:
                    nx, ny = x + dx, y + dy
                    new_grid = str(nx) + '_' + str(ny)
                    if new_grid in GRID_LIST and new_grid not in nearby_grids:
                        new_grids.add(new_grid)
        nearby_grids.update(new_grids)
    return list(nearby_grids)


# define y and construct feature based on grid
# here if y is only one hour it will be pretty hard, so let's use 3 hours
dfs = []

for grid in tqdm(GRID_LIST):
    chicago_clean['y'] = chicago_clean[grid].rolling(window=3, min_periods=1).sum()
    dist_list = ['dist0', 'dist1', 'dist2', 'dist3', 'dist4']
    dist_grids = {}
    for dist in dist_list:
        dist_grids[dist] = find_nearby_grid(grid, int(dist[-1]))
    chicago_clean_revert = chicago_clean[::-1].copy()
    raw_grid_col = [c for c in chicago_clean_revert.columns if '_' in c]
    for window in WINDOW_SECS:
        for dist in dist_list:
            fcol = dist + '_' + str(window)
            # use mean to fix the problem of the boundary grid
            chicago_clean_revert[fcol] = chicago_clean_revert[dist_grids[dist]].rolling(window=window, min_periods=1).sum().mean(axis=1)
    df = chicago_clean_revert[::-1].copy()
    df = df.drop(raw_grid_col, axis=1)
    dfs.append(df)



100%|██████████| 72/72 [03:03<00:00,  2.56s/it]


In [None]:
df_feature = pd.concat(dfs, axis=0, ignore_index=True)

Unnamed: 0,Hour,Day,Month,DayOfWeek,Year,IsWeekend,TimeOfDay,Season,y,dist0_6,...,dist0_2160,dist1_2160,dist2_2160,dist3_2160,dist4_2160,dist0_8760,dist1_8760,dist2_8760,dist3_8760,dist4_8760
0,0,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,1.4,3.750000,7.650000,10.700000,1.0,7.8,17.500000,32.950000,50.966667
1,1,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,1.2,3.000000,6.300000,9.033333,1.0,8.2,17.250000,32.650000,50.400000
2,2,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,1.2,3.000000,6.300000,9.000000,1.0,8.4,17.333333,32.700000,50.433333
3,3,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,1.2,3.000000,6.300000,9.000000,1.0,8.4,17.333333,32.700000,50.466667
4,4,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,1.2,3.000000,6.300000,9.000000,1.0,8.4,17.333333,32.700000,50.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15170179,20,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,1.0,1.333333,1.608696,1.454545,0.0,1.0,1.333333,1.608696,1.454545
15170180,21,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.8,1.083333,1.391304,1.151515,0.0,0.8,1.083333,1.391304,1.151515
15170181,22,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.6,0.916667,1.173913,0.909091,0.0,0.6,0.916667,1.173913,0.909091
15170182,23,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.4,0.416667,0.695652,0.515152,0.0,0.4,0.416667,0.695652,0.515152


In [7]:
df_feature = df_feature[df_feature['Year'] > 2023].copy()
df_feature.to_csv(config.TRAIN_FILE_FEATURE, index=False)
df_feature

Unnamed: 0,Hour,Day,Month,DayOfWeek,Year,IsWeekend,TimeOfDay,Season,y,dist0_6,...,dist0_2160,dist1_2160,dist2_2160,dist3_2160,dist4_2160,dist0_8760,dist1_8760,dist2_8760,dist3_8760,dist4_8760
201600,0,1,1,0,2024,0,Night,Spring,0.0,0.0,...,6.0,99.0,230.833333,390.000000,601.700000,16.0,416.6,970.250000,1652.550000,2558.733333
201601,1,1,1,0,2024,0,Night,Spring,0.0,0.0,...,6.0,98.2,229.916667,389.100000,600.100000,16.0,416.0,969.333333,1651.450000,2557.000000
201602,2,1,1,0,2024,0,Night,Spring,0.0,0.0,...,6.0,98.2,230.083333,388.600000,599.600000,16.0,416.4,969.333333,1650.800000,2556.233333
201603,3,1,1,0,2024,0,Night,Spring,0.0,0.0,...,6.0,98.4,230.250000,388.600000,599.300000,16.0,416.8,969.416667,1650.650000,2555.900000
201604,4,1,1,0,2024,0,Night,Spring,0.0,0.0,...,6.0,98.0,230.083333,388.300000,599.333333,16.0,416.6,969.250000,1650.350000,2555.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15170179,20,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,1.0,1.333333,1.608696,1.454545,0.0,1.0,1.333333,1.608696,1.454545
15170180,21,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.8,1.083333,1.391304,1.151515,0.0,0.8,1.083333,1.391304,1.151515
15170181,22,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.6,0.916667,1.173913,0.909091,0.0,0.6,0.916667,1.173913,0.909091
15170182,23,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.4,0.416667,0.695652,0.515152,0.0,0.4,0.416667,0.695652,0.515152
