#### Clean Data

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn

In [17]:
filtered_data = pd.read_csv("../Data/Filtered_311_Dataset.csv")
filtered_data.shape

(387218, 29)

### Drop Columns that are redundant (not focus of the project)

In [18]:
drop_col_data = filtered_data.drop(columns=['create_date_utc', 'last_action_utc', 'closed_date_utc', 'cross_street', 'street', 'street_id', 'cross_street_id', 'latitude', 'longitude', 'geo_accuracy', 'request_type_id'])
drop_col_data.shape

(387218, 18)

In [19]:
cleaned_data = drop_col_data.dropna(subset=['neighborhood'])
cleaned_data.shape

(365567, 18)

In [20]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 365567 entries, 91 to 387217
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   _id                365567 non-null  int64  
 1   group_id           365567 non-null  int64  
 2   num_requests       365567 non-null  int64  
 3   parent_closed      365567 non-null  object 
 4   status_name        365567 non-null  object 
 5   status_code        365567 non-null  int64  
 6   dept               363091 non-null  object 
 7   request_type_name  365567 non-null  object 
 8   create_date_et     365567 non-null  object 
 9   last_action_et     365567 non-null  object 
 10  closed_date_et     312003 non-null  object 
 11  origin             365567 non-null  object 
 12  city               365567 non-null  object 
 13  neighborhood       365567 non-null  object 
 14  census_tract       233900 non-null  float64
 15  council_district   365492 non-null  float64
 16  ward  

In [21]:
cleaned_data.isnull().sum()

_id                       0
group_id                  0
num_requests              0
parent_closed             0
status_name               0
status_code               0
dept                   2476
request_type_name         0
create_date_et            0
last_action_et            0
closed_date_et        53564
origin                    0
city                      0
neighborhood              0
census_tract         131667
council_district         75
ward                     26
police_zone              50
dtype: int64

### Converting to datetime format

In [22]:
# Convert date columns to DateTime format
cleaned_data['create_date_et'] = pd.to_datetime(cleaned_data['create_date_et'], errors='coerce')
cleaned_data['last_action_et'] = pd.to_datetime(cleaned_data['last_action_et'], errors='coerce')
cleaned_data['closed_date_et'] = pd.to_datetime(cleaned_data['closed_date_et'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['create_date_et'] = pd.to_datetime(cleaned_data['create_date_et'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['last_action_et'] = pd.to_datetime(cleaned_data['last_action_et'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dat

### Filling in missing values

In [23]:
# Fill in missing value for column police_zone
# Fill values based on https://pghsafeneighborhoods.wordpress.com/wp-content/uploads/2008/06/zones-by-neighborhood.pdf

# Neighborhoods without police zone
unique_neighborhoods = cleaned_data.loc[cleaned_data['police_zone'].isna(), 'neighborhood'].unique()
print(unique_neighborhoods)

# Fill in missing values for police_zone
dict_neighborhoods_policezone = {'Windgap': 6.0, 
                                 'Overbrook': 3.0,
                                 'Mount Oliver Borough': 3.9,
                                 'Mt. Oliver': 3.0,
                                 'Swisshelm Park': 4.0,
                                 'Westwood': 6.0,
                                 'Banksville': 3.0,
                                 'Fairywood': 6.0,
                                 'Knoxville': 3.0,
                                 'Ridgemont': 3.0,
                                 'East Carnegie': 6.0,
                                 'Oakwood': 6.0
}

for neighborhood, police_zone in dict_neighborhoods_policezone.items():
    cleaned_data.loc[cleaned_data['neighborhood'] == neighborhood, 'police_zone'] = police_zone

# Check if there are still missing values
missing_values_count = cleaned_data['police_zone'].isna().sum()
print(f"Missing values in police_zone: {missing_values_count}")

['Windgap' 'Overbrook' 'Mount Oliver Borough' 'Mt. Oliver'
 'Swisshelm Park' 'Westwood' 'Banksville' 'Fairywood' 'Knoxville'
 'Ridgemont' 'East Carnegie' 'Oakwood']
Missing values in police_zone: 0


In [24]:
cleaned_data.isnull().sum()

_id                       0
group_id                  0
num_requests              0
parent_closed             0
status_name               0
status_code               0
dept                   2476
request_type_name         0
create_date_et            0
last_action_et            0
closed_date_et        53564
origin                    0
city                      0
neighborhood              0
census_tract         131667
council_district         75
ward                     26
police_zone               0
dtype: int64

### One hot encoding

In [25]:
cleaned_data_encoded = pd.get_dummies(cleaned_data, drop_first=True, sparse=True)

In [26]:
cleaned_data_encoded.shape

(365567, 486)

### Checking for Sparse Columns

In [27]:
# from numpy import arange
# import altair as alt
# from sklearn.feature_selection import VarianceThreshold

# data = cleaned_data_encoded.values
# X = data[:, :-1]
# y = data[:, -1]

# print(X.shape, y.shape)

# thresholds = arange(0.0, 0.55, 0.05)

# results = []
# for t in thresholds:
    
#     vt = VarianceThreshold(threshold=t)
    
#     X_sel = vt.fit_transform(X)
#     rows, cols = X_sel.shape
#     n_features = cols
#     print('Threshold=%.2f, Features=%d' % (t, n_features))
    
#     results.append(n_features)
    
# d2 = pd.DataFrame({'threshold': thresholds, 'n_features': results})
# alt.Chart(d2).mark_line().encode(
#     x='threshold',
#     y='n_features')

In [28]:
cleaned_data_encoded.to_csv("../Data/CleanedData_311_Dataset.csv", index=False)