# Using wrapper methods to refine feature selection

### Read the cleaned data

In [2]:
import pandas as pd

# Location of the cleaned dataset (a relative path to a CSV file)
file_path = 'Group_2_clean_Data..csv'

# Parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's output
df.head(n=5)


Unnamed: 0,communityname,State,countyCode,communityCode,fold,pop,perHoush,pctBlack,pctWhite,pctAsian,...,burglaries,burglPerPop,larcenies,larcPerPop,autoTheft,autoTheftPerPop,arsons,arsonsPerPop,violentPerPop,nonViolPerPop
0,149.0,28.0,55.0,509.0,1.0,11980.0,3.1,1.37,91.78,6.5,...,14.1,114.85,138.0,1132.08,16.0,131.26,2.0,16.41,41.02,1394.59
1,1034.0,35.0,58.0,424.0,1.0,23123.0,2.82,0.8,95.57,3.44,...,57.0,242.37,376.0,1598.78,26.0,110.55,1.0,4.25,127.56,1955.95
2,1780.0,34.0,114.0,959.0,1.0,29344.0,2.43,0.74,94.33,3.43,...,274.0,758.14,1797.0,4972.19,136.0,376.3,22.0,60.87,218.59,6167.51
3,664.0,31.0,53.0,213.0,1.0,16656.0,2.4,1.7,97.35,0.5,...,225.0,1301.78,716.0,4142.56,47.0,271.93,5.0,21.08,306.64,4425.45
4,140.0,22.0,82.0,471.0,1.0,11245.0,2.76,0.53,89.16,1.17,...,91.0,728.93,1060.0,8490.87,91.0,728.93,5.0,40.05,374.06,9988.79


In [3]:
# Split the loaded frame `df` into predictors and target.
# The predictor block is taken by position: the first 5 columns are skipped
# and the trailing 18 columns are excluded (the slice end is exclusive).
# NOTE(review): the positions assume the column order shown in the preview
# above -- confirm if the CSV layout ever changes.
n_leading_skipped = 5
n_trailing_dropped = 18
df_feature = df.iloc[:, n_leading_skipped:-n_trailing_dropped]

# Target variable: the 'violentPerPop' column
df_target = df.loc[:, 'violentPerPop']

# Preview the predictor frame
df_feature.head()


Unnamed: 0,pop,perHoush,pctBlack,pctWhite,pctAsian,pctHisp,pct12-21,pct12-29,pct16-24,pct65up,...,persHomeless,pctForeignBorn,pctBornStateResid,pctSameHouse-5,pctSameCounty-5,pctSameState-5,landArea,popDensity,pctUsePubTrans,pctOfficDrugUnit
0,11980.0,3.1,1.37,91.78,6.5,1.88,12.47,21.44,10.93,11.33,...,0.1,10.66,53.72,65.29,78.09,89.14,13.7,1845.9,9.63,0.2
1,23123.0,2.82,0.8,95.57,3.44,0.85,11.01,21.3,10.48,17.18,...,0.0,8.3,77.17,71.27,90.22,96.12,10.6,2186.7,3.84,0.0
2,29344.0,2.43,0.74,94.33,3.43,2.35,11.36,25.88,11.01,10.28,...,0.0,5.0,44.77,36.6,61.26,82.85,10.6,2780.9,4.37,0.0
3,16656.0,2.4,1.7,97.35,0.5,0.7,12.55,25.2,12.19,17.57,...,0.0,2.04,88.71,56.7,90.17,96.24,5.2,3217.7,3.31,0.0
4,11245.0,2.76,0.53,89.16,1.17,0.52,24.46,40.53,28.69,12.65,...,0.0,1.74,73.75,42.22,60.34,89.02,11.5,974.2,0.38,0.0


# Experimenting with backward feature selection (RFE)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Backward wrapper selection: Recursive Feature Elimination (RFE) picks five
# features, then an RBF-kernel SVR is trained and evaluated on those features.
# Reads `df_feature` (predictors) and `df_target` (target) defined in an earlier cell.

# Step 1: Log-transform the target variable
y_log = np.log1p(df_target)  # log(1 + y); reversed with expm1 before scoring

# Step 2: Train-Test Split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train_log, y_test_log = train_test_split(
    df_feature, y_log, test_size=0.2, random_state=42
)

# Step 3: Feature Scaling -- the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Initialize the models
# RFE ranks features using the fitted estimator's `coef_` or
# `feature_importances_`. An RBF-kernel SVR exposes neither, so handing it to
# RFE fails at fit time. A linear-kernel SVR does provide `coef_`, so it is
# used for the ranking only; the RBF model remains the final predictor.
# Caveat: the ranking reflects linear relevance, which is only a proxy for
# what the RBF model can exploit.
rfe_ranker = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Step 5: Apply Recursive Feature Elimination (RFE)
rfe = RFE(estimator=rfe_ranker, n_features_to_select=5)  # Keep the top 5 features
rfe.fit(X_train_scaled, y_train_log)

# Names of the retained features (the mask is aligned with the training columns)
selected_features = X_train.columns[rfe.support_]
print(f"Selected Features by RFE: {list(selected_features)}")

# Subset the scaled arrays to the selected columns
X_train_selected = X_train_scaled[:, rfe.support_]
X_test_selected = X_test_scaled[:, rfe.support_]

# Step 6: Train the Final (RBF) Model with Selected Features
svr.fit(X_train_selected, y_train_log)

# Step 7: Predict and Evaluate
y_pred_log = svr.predict(X_test_selected)

# Inverse-transform predictions and targets back to the original scale
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

# Evaluate model performance on the original scale
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Testing Set Metrics using RFE-Selected Features:")
print(f"  - Mean Squared Error: {test_mse:.2f}")
print(f"  - R-squared: {test_r2:.2f}")


# Experimenting with an SVR model and forward wrapper selection

In [None]:
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Forward wrapper selection: greedily add features to an RBF-kernel SVR until
# five are chosen, then train and evaluate the SVR on that subset.
# Reads `df_feature` (predictors) and `df_target` (target) from an earlier cell.

# --- 1. Target on the log(1 + y) scale -------------------------------------
y_log = np.log1p(df_target)

# --- 2. Hold out 20% of the rows for testing (fixed seed) -------------------
X_train, X_test, y_train_log, y_test_log = train_test_split(
    df_feature,
    y_log,
    test_size=0.2,
    random_state=42,
)

# --- 3. Standardise features; statistics come from the training split only --
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 4. Base estimator -------------------------------------------------------
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# --- 5. Forward sequential selection, scored by 5-fold cross-validated R^2 ---
sfs = SequentialFeatureSelector(
    estimator=svr,
    n_features_to_select=5,
    direction='forward',
    scoring='r2',
    cv=5,
)
sfs.fit(X_train_scaled, y_train_log)

# Boolean mask over the feature columns; computed once and reused below
support_mask = sfs.get_support()

selected_features = df_feature.columns[support_mask]
print(f"Selected Features by Forward Selection: {list(selected_features)}")

# Keep only the chosen columns of the scaled arrays
X_train_selected = X_train_scaled[:, support_mask]
X_test_selected = X_test_scaled[:, support_mask]

# --- 6. Fit the final model on the reduced feature set -----------------------
svr.fit(X_train_selected, y_train_log)

# --- 7. Predict, map back to the original scale, and score -------------------
y_pred_log = svr.predict(X_test_selected)
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"Testing Set Metrics using Forward-Selected Features:")
print(f"  - Mean Squared Error: {test_mse:.2f}")
print(f"  - R-squared: {test_r2:.2f}")
