Group 5: LI LINGYU, LIU YICHAO, WU JINGYAN, YANG QINGSHAN, YE FANGDA

In [1]:
# Load env and data
import pandas as pd
import config

chicago_clean = pd.read_csv(config.TRAIN_FILE_CLEAN2)
chicago_clean.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
chicago_clean

Unnamed: 0,Date,0_7,0_8,0_9,1_3,1_4,1_6,1_7,1_8,1_9,...,7_7,8_0,8_1,8_2,8_3,8_4,9_0,9_1,9_2,9_3
0,2001-01-01 00:00:00,0,1,0,0,0,0,0,0,0,...,0,0,0,3,1,0,0,0,4,1
1,2001-01-01 01:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001-01-01 02:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2001-01-01 03:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2001-01-01 04:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210692,2025-01-13 20:00:00,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
210693,2025-01-13 21:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
210694,2025-01-13 22:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
210695,2025-01-13 23:00:00,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Construct time features
chicago_clean['Date'] = pd.to_datetime(chicago_clean['Date'], format='%Y-%m-%d %H:%M:%S')
chicago_clean['Hour'] = chicago_clean['Date'].dt.hour
chicago_clean['Day'] = chicago_clean['Date'].dt.day
chicago_clean['Month'] = chicago_clean['Date'].dt.month
chicago_clean['DayOfWeek'] = chicago_clean['Date'].dt.dayofweek
chicago_clean['Year'] = chicago_clean['Date'].dt.year
chicago_clean['IsWeekend'] = chicago_clean['DayOfWeek'].isin([5,6]).astype(int)
chicago_clean['TimeOfDay'] = pd.cut(chicago_clean['Hour'], 
                                   bins=[-1,6,12,18,24], 
                                   labels=['Night','Morning','Afternoon','Evening'], 
                                   right=False)
chicago_clean['Season'] = pd.cut(chicago_clean['Month'],
                                bins=[0, 4, 7, 10, 13],
                                labels=['Spring','Summer','Fall', 'Winter'],
                                right=False)
# Drop initial date column
chicago_clean = chicago_clean.drop('Date', axis=1)
chicago_clean.head()

# TODO -> 注意这里还没one-hot

Unnamed: 0,0_7,0_8,0_9,1_3,1_4,1_6,1_7,1_8,1_9,2_3,...,9_2,9_3,Hour,Day,Month,DayOfWeek,Year,IsWeekend,TimeOfDay,Season
0,0,1,0,0,0,0,0,0,0,0,...,4,1,0,1,1,0,2001,0,Night,Spring
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,2001,0,Night,Spring
2,0,0,0,0,0,0,0,0,0,0,...,0,0,2,1,1,0,2001,0,Night,Spring
3,0,0,0,0,0,0,0,0,0,0,...,0,0,3,1,1,0,2001,0,Night,Spring
4,0,0,0,0,0,0,0,0,0,0,...,0,0,4,1,1,0,2001,0,Night,Spring


In [3]:
#construct spatial features

# 6hour, 1day, 1week, 1month, 3month, 1year
WINDOW_SECS = [6, 24, 24*7, 24*30, 24*30*3, 24*365]
GRID_LIST = [c for c in chicago_clean.columns if '_' in c]

# 我们目前的地理feature就是给定一个时间窗口 + 距离，看累积有多少犯罪
# 这样可以捕捉到犯罪在时空间上的扩散和传递
def find_nearby_grid(grid, distance):
    x, y = grid.split('_')
    x, y = int(x), int(y)
    nearby_grids = {grid}
    for _ in range(distance):
        new_grids = set()
        for grid in nearby_grids:
            x, y = grid.split('_')
            x, y = int(x), int(y)
            for dx in [-1, 0, 1]:
                for dy in [-1, 0, 1]:
                    nx, ny = x + dx, y + dy
                    new_grid = str(nx) + '_' + str(ny)
                    if new_grid in GRID_LIST and new_grid not in nearby_grids:
                        new_grids.add(new_grid)
        nearby_grids.update(new_grids)
    return list(nearby_grids)


# define y and construct feature based on grid
# here if y is only one hour it will be pretty hard, so let's use 3 hours
dfs = []
for grid in GRID_LIST[:2]:
    chicago_clean['y'] = chicago_clean[grid].rolling(window=3, min_periods=1).sum()
    dist_list = ['dist0', 'dist1', 'dist2', 'dist3', 'dist4']
    dist_grids = {}
    for dist in dist_list:
        dist_grids[dist] = find_nearby_grid(grid, int(dist[-1]))
    chicago_clean_revert = chicago_clean[::-1].copy()
    raw_grid_col = [c for c in chicago_clean_revert.columns if '_' in c]
    for window in WINDOW_SECS:
        for dist in dist_list:
            fcol = dist + '_' + str(window)
            chicago_clean_revert[fcol] = chicago_clean_revert[dist_grids[dist]].rolling(window=window, min_periods=1).sum().sum(axis=1)
    df = chicago_clean_revert[::-1].copy()
    df = df.drop(raw_grid_col, axis=1)
    dfs.append(df)



In [4]:
df_feature = pd.concat(dfs, axis=0, ignore_index=True)
df_feature.to_csv(config.TRAIN_FILE_FEATURE, index=False)
df_feature

Unnamed: 0,Hour,Day,Month,DayOfWeek,Year,IsWeekend,TimeOfDay,Season,y,dist0_6,...,dist0_2160,dist1_2160,dist2_2160,dist3_2160,dist4_2160,dist0_8760,dist1_8760,dist2_8760,dist3_8760,dist4_8760
0,0,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,7.0,45.0,153.0,321.0,1.0,39.0,210.0,659.0,1529.0
1,1,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,6.0,36.0,126.0,271.0,1.0,41.0,207.0,653.0,1512.0
2,2,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,6.0,36.0,126.0,270.0,1.0,42.0,208.0,654.0,1513.0
3,3,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,6.0,36.0,126.0,270.0,1.0,42.0,208.0,654.0,1514.0
4,4,1,1,0,2001,0,Night,Spring,0.0,0.0,...,0.0,6.0,36.0,126.0,270.0,1.0,42.0,208.0,654.0,1514.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421389,20,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.0,4.0,8.0,18.0,0.0,0.0,4.0,8.0,18.0
421390,21,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.0,4.0,5.0,9.0,0.0,0.0,4.0,5.0,9.0
421391,22,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.0,2.0,3.0,6.0,0.0,0.0,2.0,3.0,6.0
421392,23,13,1,0,2025,0,Evening,Spring,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
