# Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib
import os

In [2]:
fp = pd.read_csv("../data/women_risk.csv")

In [3]:
fp.head()

Unnamed: 0,Timestamp,1. What is your age group?,2. What is your occupation?,3. At what time of day did the incident occur?,4. Where did the incident occur?,5. How crowded was the location at the time of the incident?,6. What was the lighting condition in the area?,7. Was any form of security present at the location?,8. Were you familiar with the area where the incident occurred?,9. What type of harassment did you experience?,10. How often have you experienced harassment in similar situations?,11. How safe did you feel during the incident?,"12. Overall, how would you rate the risk level of harassment in that situation?"
0,2026-01-29 22:19:51,18-25,Student,Evening,Street/Public place,The location was slightly crowded,The lighting was moderate,There was no security at all,I mostly knew the area,"Physical harassment (touching, grabbing, assault)",I have sometimes experienced harassment in sim...,I felt somewhat safe,There was no risk of harassment at all
1,2026-01-29 22:20:52,18-25,Student,Evening,Public transport,The location was extremely crowded,The lighting was poor,There was no security at all,I somewhat knew the area,"Physical harassment (touching, grabbing, assault)",I have never experienced harassment in similar...,Option I felt very unsafe1,There was a very high risk of harassment
2,2026-01-29 23:27:42,18-25,Student,Night,Online platform,The location was not crowded at all,The lighting was very poor,There was no security at all,I did not know the area at all,"Online harassment (messages, social media, calls)",I have sometimes experienced harassment in sim...,I felt very unsafe,There was a very high risk of harassment
3,2026-01-30 20:28:06,18-25,Student,Night,Public transport,The location was extremely crowded,The lighting was poor,There was no security at all,I mostly knew the area,"Physical harassment (touching, grabbing, assault)",I have rarely experienced harassment in simila...,I felt very unsafe,There was a high risk of harassment
4,2026-01-30 22:06:27,26-35,Self-employed,Early Morning,Street/Public place,The location was mostly crowded,The lighting was moderate,There was some security,I knew the area completely,"Verbal harassment (unwanted comments, remarks)",I have sometimes experienced harassment in sim...,I felt very unsafe,There was a high risk of harassment


In [4]:
fp.shape

(115, 13)

In [5]:
fp.info()

<class 'pandas.DataFrame'>
RangeIndex: 115 entries, 0 to 114
Data columns (total 13 columns):
 #   Column                                                                           Non-Null Count  Dtype
---  ------                                                                           --------------  -----
 0   Timestamp                                                                        115 non-null    str  
 1   1. What is your age group?                                                       115 non-null    str  
 2   2. What is your occupation?                                                      115 non-null    str  
 3   3. At what time of day did the incident occur?                                   115 non-null    str  
 4   4. Where did the incident occur?                                                 115 non-null    str  
 5   5. How crowded was the location at the time of the incident?                     115 non-null    str  
 6   6. What was the lighting condition in

In [6]:
fp.isnull().sum()

Timestamp                                                                          0
1. What is your age group?                                                         0
2. What is your occupation?                                                        0
3. At what time of day did the incident occur?                                     0
4. Where did the incident occur?                                                   0
5. How crowded was the location at the time of the incident?                       0
6. What was the lighting condition in the area?                                    0
7. Was any form of security present at the location?                               0
8. Were you familiar with the area where the incident occurred?                    0
9. What type of harassment did you experience?                                     0
10. How often have you experienced harassment in similar situations?               0
11. How safe did you feel during the incident?                   

In [7]:
fp = fp.dropna()

In [8]:
fp.shape

(115, 13)

In [9]:
fp = fp.drop_duplicates()

In [10]:
fp.shape

(115, 13)

In [11]:
categorical_cols = fp.select_dtypes(include=['object']).columns.tolist()
categorical_cols

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = fp.select_dtypes(include=['object']).columns.tolist()


['Timestamp',
 '1. What is your age group?',
 '2. What is your occupation?',
 '3. At what time of day did the incident occur?',
 '4. Where did the incident occur?',
 '5. How crowded was the location at the time of the incident?',
 '6. What was the lighting condition in the area?',
 '7. Was any form of security present at the location?',
 '8. Were you familiar with the area where the incident occurred?',
 '9. What type of harassment did you experience?',
 '10. How often have you experienced harassment in similar situations?',
 '11. How safe did you feel during the incident?',
 '12. Overall, how would you rate the risk level of harassment in that situation?']

In [12]:
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    fp[col] = le.fit_transform(fp[col])
    label_encoders[col] = le

In [13]:
fp.head()

Unnamed: 0,Timestamp,1. What is your age group?,2. What is your occupation?,3. At what time of day did the incident occur?,4. Where did the incident occur?,5. How crowded was the location at the time of the incident?,6. What was the lighting condition in the area?,7. Was any form of security present at the location?,8. Were you familiar with the area where the incident occurred?,9. What type of harassment did you experience?,10. How often have you experienced harassment in similar situations?,11. How safe did you feel during the incident?,"12. Overall, how would you rate the risk level of harassment in that situation?"
0,0,0,3,2,3,4,2,2,3,2,3,2,4
1,1,0,3,2,2,0,3,2,4,2,0,5,3
2,2,0,3,4,1,3,4,2,1,1,3,4,3
3,3,0,3,4,2,0,3,2,3,2,2,4,0
4,4,1,2,1,3,2,2,3,2,4,3,4,0


In [14]:
os.makedirs('../models', exist_ok=True)
joblib.dump(label_encoders, '../models/label_encoders.pkl')

['../models/label_encoders.pkl']

In [15]:
fp.to_csv("../data/women_risk_cleaned.csv", index=False)

In [16]:
fp.shape

(115, 13)