# Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import joblib
import os

In [None]:
# Load the cleaned survey data; the path is relative to the notebook's
# working directory.
fp = pd.read_csv(filepath_or_buffer="../data/women_risk_cleaned.csv")

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
fp.head()

In [None]:
# (rows, columns) of the loaded frame.
fp.shape

In [None]:
# Pairwise correlations between columns (pandas' default method is Pearson).
# Restrict the frame to numeric and boolean columns first: since pandas 2.0
# DataFrame.corr() raises on string/object columns instead of silently
# dropping them, so a single leftover text column would break this cell.
corr_matrix = fp.select_dtypes(include=[np.number, 'bool']).corr()

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on an explicit
# figure/axes pair. The diverging palette is centred on zero so positive and
# negative correlations get opposite colours.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Name of the column treated as the prediction target. It is the full text
# of the survey question, so it must match the CSV header exactly.
target_col = (
    '12. Overall, how would you rate the risk level '
    'of harassment in that situation?'
)

In [None]:
# Correlation of every column with the target, strongest positive first.
# The target's own entry (correlation with itself) is still included here.
target_column_corr = corr_matrix.loc[:, target_col]
target_corr = target_column_corr.sort_values(ascending=False)
target_corr

In [None]:
# Horizontal bar chart of each feature's correlation with the target.
# The target's own entry is masked out so it does not appear as a bar.
is_feature = target_corr.index != target_col
target_corr_filtered = target_corr[is_feature]

fig, ax = plt.subplots(figsize=(10, 6))
target_corr_filtered.plot.barh(color='steelblue', ax=ax)
ax.set_title('Feature Correlation with Target')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()

In [None]:
# Names of the numeric columns to be scaled. The target is left out so that
# its original values are preserved.
numeric_cols = [
    column
    for column in fp.select_dtypes(include=[np.number]).columns
    if column != target_col
]

In [None]:
# Show the numeric feature columns selected above (target excluded).
numeric_cols

In [None]:
# Standardize the numeric feature columns in place (zero mean, unit variance
# per column), keeping the fitted scaler for later reuse.
# NOTE(review): the scaler is fitted on every row of `fp`; no train/test split
# is visible in this notebook. If a split happens downstream, the held-out
# rows have already influenced these statistics -- confirm this is intended.
scaler = StandardScaler()
scaler.fit(fp[numeric_cols])
fp[numeric_cols] = scaler.transform(fp[numeric_cols])

In [None]:
# Re-inspect the first rows now that the numeric features have been scaled.
fp.head()

In [None]:
# Summary statistics after scaling: the scaled columns should show a mean
# close to 0 and a std close to 1 (describe() reports the sample std, so it
# will not be exactly 1).
fp.describe()

In [None]:
# Persist the fitted scaler so the same transformation can be reapplied
# later. The output directory is created on first run.
scaler_path = '../models/scaler.pkl'
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)

In [None]:
# Separate the target series from the feature matrix (every other column).
y = fp[target_col]
X = fp.drop(columns=[target_col])

In [None]:
# Shapes of the feature matrix and the target; the row counts should match.
X.shape, y.shape

In [None]:
# Number of rows per distinct target value, to check for class imbalance.
y.value_counts()

In [None]:
# Write the processed frame (scaled features plus the untouched target) to
# disk without the row index.
fp.to_csv(path_or_buf="../data/women_risk_processed.csv", index=False)

In [None]:
# (rows, columns) of the frame that was just written out.
fp.shape