In [80]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

import pandas as pd
import numpy as np

In [70]:
crime_num = pd.read_csv("../Data/FE/crime_num_fe.csv")
crime_violent = pd.read_csv("../Data/FE/crime_violent_fe.csv")

In [71]:
crime_num.columns

Index(['Date', 'TS_interval', 'Time_Interval', 'Num_Crimes', 'Day', 'Month',
       'Year', 'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'is_FullMoon_BLOCK', 'suntime',
       'Holiday', 'NationalIncident'],
      dtype='object')

In [72]:
y = crime_num['Num_Crimes']
X = crime_num.drop(columns=['Num_Crimes','TS_interval','Date','Day','Month', 'Year','tempmax', 'tempmin', 'feelslikemax', 'feelslikemin', 'dew'])
X['Holiday'] = X['Holiday'] != "None"

# Lasso Variable Selection
Unlike with PCA/dimension reduction, this aim is to determine if filtering the data to certain columns improve predictions

In [73]:
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create a transformer for numeric data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create a transformer for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [74]:
lasso_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.05, random_state=0)) # .05 is subjective, not a model I am predicting so I didn't worry about it at this stage.
])

# Train the Lasso model
lasso_model.fit(X, y)

# Extract Feature Names

In [75]:
feature_names = (lasso_model.named_steps['preprocessor']
                           .transformers_[1][1]
                           .named_steps['onehot']
                           .get_feature_names_out())

# Extend this list with the names of numeric columns
feature_names = np.concatenate([numeric_cols, feature_names])

# Get coefficients from the lasso model
coefficients = lasso_model.named_steps['regressor'].coef_

# Print nonzero coefficients and their corresponding feature names
print("Non-zero coefficients and their corresponding features:")
for coef, feat in zip(coefficients, feature_names):
    print(f"{feat}: {coef}")

Non-zero coefficients and their corresponding features:
temp: 0.0
feelslike: 0.4036992079095316
humidity: 0.308514393594218
precip: -0.1397174901074042
precipprob: 0.0
precipcover: -0.016859716284167058
snow: -0.11932217757755441
snowdepth: -0.15698512061314124
windgust: -0.0
windspeed: -0.0
winddir: 0.0
sealevelpressure: -0.0424542955101843
cloudcover: 0.3374734507274959
visibility: -0.0
solarradiation: 0.3150086218018555
solarenergy: 1.0918839819124706
uvindex: -0.0
severerisk: -0.5846157939085586
suntime: -0.23423794990661073
NationalIncident: -0.0
Time_Interval_B1: -6.709394505044294
Time_Interval_B2: -2.058920278637923
Time_Interval_B3: 0.1769156346747726
Time_Interval_B4: 0.5683630030958223
preciptype_None: 0.0
preciptype_freezingrain: -0.0
preciptype_rain: -0.0
preciptype_rain,freezingrain: -0.0
preciptype_rain,freezingrain,snow: -0.0
preciptype_rain,freezingrain,snow,ice: -0.0
preciptype_rain,ice: -0.0
preciptype_rain,snow: 0.0
preciptype_rain,snow,ice: -0.0
preciptype_snow: -0

In [76]:
nonzero_coefs = []
for coef, feat in zip(coefficients, feature_names):
    if coef != 0:
        nonzero_coefs.append(feat)

print(nonzero_coefs)
len(nonzero_coefs)

['feelslike', 'humidity', 'precip', 'precipcover', 'snow', 'snowdepth', 'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'severerisk', 'suntime', 'Time_Interval_B1', 'Time_Interval_B2', 'Time_Interval_B3', 'Time_Interval_B4']


16

In [78]:
nonzero_coefs = ['feelslike', 'humidity', 'precip', 'precipcover', 'snow', 'snowdepth', 'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'severerisk', 'suntime', 'Time_Interval']

filt_num = crime_num[['Num_Crimes', 'TS_interval', 'Date', 'Day', 'Month', 'Year'] + nonzero_coefs]
filt_violent = crime_violent[['Num_Crimes', 'TS_interval', 'Date', 'Day', 'Month', 'Year'] + nonzero_coefs]

In [79]:
filt_num.to_csv("../Data/FE/crime_num_varselect.csv")
filt_violent.to_csv("../Data/FE/crime_violent_varselect.csv")