In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

<font size=3 color='DarkBlue'> Import the 2020 dataset, from which all rows with a ServiceDate before 2018 and all rows where ServiceDate and ClosedDate are null have been deleted.

In [3]:
# Load the pre-filtered 2020 MyLA311 service-request extract; the first CSV column is
# used as the index.
# The four date columns are parsed while reading so that they are datetime64 from the
# start: the cells directly below call .min()/.max() on ServiceDate and ClosedDate and
# their recorded outputs are Timestamps, which a fresh Restart & Run All would not
# reproduce if the columns were still raw strings at that point.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df311_2020 = pd.read_csv(
    r'C:\Documents\projects\HackLA\311\data\MyLA311_Service_Request_Data_2020DT2.csv',
    low_memory=False,
    index_col=0,
    parse_dates=['CreatedDate', 'UpdatedDate', 'ServiceDate', 'ClosedDate'],
)

In [10]:
# (rows, columns) of the loaded frame - used as the denominator for the percentages below.
df311_2020.shape

(1417527, 34)

In [11]:
# Earliest ServiceDate present in the loaded data.
df311_2020.ServiceDate.min()

Timestamp('2019-01-02 00:00:00')

In [12]:
# Latest ServiceDate present in the loaded data.
df311_2020.ServiceDate.max()

Timestamp('2022-11-25 00:00:00')

In [13]:
# Earliest ClosedDate present in the loaded data.
df311_2020.ClosedDate.min()

Timestamp('2019-01-02 00:00:00')

In [14]:
# Latest ClosedDate present in the loaded data.
df311_2020.ClosedDate.max()

Timestamp('2022-07-18 15:07:29')

In [15]:
# Make sure all four date columns are datetime64 so they can be compared and subtracted.
date_columns = ['CreatedDate', 'UpdatedDate', 'ServiceDate', 'ClosedDate']
df311_2020[date_columns] = df311_2020[date_columns].apply(pd.to_datetime)

<font size=4 color='DarkBlue'> Inspect instances where Service Date is after Closed Date

In [16]:
# Keep only the requests whose Service Date falls after their Closed Date.
service_after_closed = df311_2020['ServiceDate'] > df311_2020['ClosedDate']
dfSC = df311_2020.loc[service_after_closed]

In [17]:
# Number of records (rows) where Service Date > Closed Date.
dfSC.shape

(30190, 34)

In [18]:
# Percentage of all records where Service Date > Closed Date.
# Computed from the frames themselves instead of hand-copied row counts, so the figure
# stays correct if the input file or the filter above changes.
100 * float(len(dfSC)) / len(df311_2020)

2.129765429512101

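Only about 2.1% of all records have a Service Date after the Closed Date, which is a small enough share that it should not materially affect the analysis.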

<font size=4 color='DarkBlue'> Inspect instances where Closed Date is before Updated Date:

In [19]:
# Create df where Closed Date < Updated Date.
# .copy() makes dfcu an independent frame: later cells overwrite and add columns on it,
# and doing that on a bare boolean-mask selection raises SettingWithCopyWarning
# (currently hidden by warnings.filterwarnings('ignore')).
dfcu = df311_2020[df311_2020.ClosedDate < df311_2020.UpdatedDate].copy()

In [20]:
# Number of records (rows) where Closed Date < Updated Date.
dfcu.shape

(1181263, 34)

In [21]:
# Preview the first rows to see how far apart ClosedDate and UpdatedDate typically are.
dfcu.head()

Unnamed: 0,SRNumber,CreatedDate,UpdatedDate,ActionTaken,Owner,RequestType,Status,RequestSource,CreatedByUserOrganization,MobileOS,Anonymous,AssignTo,ServiceDate,ClosedDate,AddressVerified,ApproximateAddress,Address,HouseNumber,Direction,StreetName,Suffix,ZipCode,Latitude,Longitude,Location,TBMPage,TBMColumn,TBMRow,APC,CD,CDMember,NC,NCName,PolicePrecinct
0,1-1523593381,2020-01-01 00:02:00,2020-01-07 12:19:00,SR Created,BOS,Illegal Dumping Pickup,Closed,Call,BOS,,N,SLA,2020-01-07,2020-01-07 10:49:00,Y,N,"1500 W SLAUSON AVE, 90047",1500.0,W,SLAUSON,AVE,90047.0,33.988984,-118.302681,"(33.988984265, -118.302681205)",673.0,J,5.0,South Los Angeles APC,8.0,Marqueece Harris-Dawson,81.0,EMPOWERMENT CONGRESS CENTRAL AREA NDC,77TH STREET
7,1-1523593661,2020-01-01 00:38:00,2020-02-14 13:17:00,SR Created,BOS,Homeless Encampment,Closed,Mobile App,Self Service,iOS,N,NC,2020-01-07,2020-02-14 12:46:00,Y,,"345 S GERTRUDE ST, 90033",345.0,S,GERTRUDE,ST,90033.0,34.043551,-118.217574,"(34.0435507512, -118.21757447)",635.0,A,5.0,East Los Angeles APC,14.0,Jose Huizar,50.0,BOYLE HEIGHTS NC,HOLLENBECK
8,1-1523593741,2020-01-01 00:43:00,2020-01-06 19:49:00,SR Created,BOS,Illegal Dumping Pickup,Closed,Email,BOS,,N,SLA,2020-01-06,2020-01-06 10:49:00,Y,N,"MARTIN LUTHER KING, JR BLVD AT MUIRFIELD ROAD,...",,,,,90008.0,34.015987,-118.343179,"(34.0159869949, -118.34317866)",673.0,D,2.0,South Los Angeles APC,10.0,Herb J. Wesson Jr.,79.0,EMPOWERMENT CONGRESS WEST AREA NDC,SOUTHWEST
10,1-1523593831,2020-01-01 00:49:00,2020-01-06 11:48:00,SR Created,BOS,Illegal Dumping Pickup,Closed,Email,BOS,,N,SLA,2020-01-06,2020-01-06 10:49:00,Y,N,"BUCKINGHAM ROAD AT MARTIN LUTHER KING,",,,,,0.0,34.014999,-118.341607,"(34.0149985801, -118.341606635)",673.0,D,2.0,South Los Angeles APC,10.0,Herb J. Wesson Jr.,79.0,EMPOWERMENT CONGRESS WEST AREA NDC,SOUTHWEST
11,1-1523597491,2020-01-01 00:51:00,2020-01-03 15:51:00,SR Created,BOS,Illegal Dumping Pickup,Closed,Call,BOS,,N,EV,2020-01-03,2020-01-03 10:18:00,Y,N,"SEPULVEDA BLVD AT OXNARD ST, 91411",,,,,91411.0,34.179369,-118.466216,"(34.1793689586, -118.466216382)",561.0,H,1.0,South Valley APC,6.0,Nury Martinez,20.0,VAN NUYS NC,VAN NUYS


<font size=3 color='DarkBlue'> There are 1,181,263 such records, which is more than the number of records where Service Date < Created Date. However, many of these seem to be only hours apart, which could mean that someone updated the record a few hours after closing it, or that an automatic process updates the record when it is closed.  Will explore further.

In [22]:
# Remove hours, minutes and seconds (normalize to midnight) so that UpdatedDate and
# ClosedDate can be compared at calendar-day granularity.
# assign() returns a new frame that is rebound to dfcu, instead of writing columns into
# what may be a slice of df311_2020 (which raises SettingWithCopyWarning).
dfcu = dfcu.assign(
    UpdatedDate=pd.to_datetime(dfcu['UpdatedDate']).dt.normalize(),
    ClosedDate=pd.to_datetime(dfcu['ClosedDate']).dt.normalize(),
)

In [23]:
# With the time of day removed, check whether UpdatedDate falls on the same calendar day
# as ClosedDate by adding a 'compare' column holding 'samedate' or 'differentdate'.
# assign() returns a new frame that is rebound to dfcu, instead of adding a column to
# what may be a slice of df311_2020 (which raises SettingWithCopyWarning).
dfcu = dfcu.assign(
    compare=np.where(dfcu['UpdatedDate'] == dfcu['ClosedDate'], 'samedate', 'differentdate')
)

In [24]:
# How many records were updated on the same calendar day they were closed vs. a different day.
dfcu['compare'].value_counts()

samedate         963353
differentdate    217910
Name: compare, dtype: int64

<font size=2 color='DarkBlue'> Most are on the same date.  Explore the rest:

In [25]:
# Create df that only contains records where UpdatedDate is on a different calendar day
# than ClosedDate.
# .copy() makes dfcud an independent frame: a later cell adds a 'diff_days' column to it,
# and doing that on a bare boolean-mask selection raises SettingWithCopyWarning.
dfcud = dfcu[dfcu['compare'] == 'differentdate'].copy()

In [26]:
# Preview two of the records whose UpdatedDate and ClosedDate are on different days.
dfcud.head(2)

Unnamed: 0,SRNumber,CreatedDate,UpdatedDate,ActionTaken,Owner,RequestType,Status,RequestSource,CreatedByUserOrganization,MobileOS,Anonymous,AssignTo,ServiceDate,ClosedDate,AddressVerified,ApproximateAddress,Address,HouseNumber,Direction,StreetName,Suffix,ZipCode,Latitude,Longitude,Location,TBMPage,TBMColumn,TBMRow,APC,CD,CDMember,NC,NCName,PolicePrecinct,compare
160,1-1523621291,2020-01-01 09:12:00,2020-02-14,SR Created,BSL,Single Streetlight Issue,Closed,Self Service,Self Service,,Y,SOUTH,2020-01-01,2020-01-01,Y,Y,"1925 W GAGE AVE, 90047",1925.0,W,GAGE,AVE,90047.0,33.981903,-118.312459,"(33.98190267, -118.31245847)",673.0,H,6.0,South Los Angeles APC,8.0,Marqueece Harris-Dawson,81.0,EMPOWERMENT CONGRESS CENTRAL AREA NDC,77TH STREET,differentdate
172,1-1523624286,2020-01-01 09:18:00,2020-01-16,SR Created,BSL,Single Streetlight Issue,Closed,Mobile App,Self Service,iOS,N,SOUTH,2020-01-13,2020-01-13,Y,,"1401 W 51ST PL, 90062",1401.0,W,51ST,PL,90062.0,33.996016,-118.300674,"(33.99601563, -118.30067385)",673.0,J,4.0,South Los Angeles APC,8.0,Marqueece Harris-Dawson,81.0,EMPOWERMENT CONGRESS CENTRAL AREA NDC,77TH STREET,differentdate


In [27]:
# Create column 'diff_days' holding ClosedDate - UpdatedDate for the records where the two
# fall on different calendar days. Every record here was selected with
# ClosedDate < UpdatedDate, so the values are negative (-1 days = updated the day after closing).
# assign() returns a new frame that is rebound to dfcud, instead of adding a column to
# what may be a slice of dfcu (which raises SettingWithCopyWarning).
dfcud = dfcud.assign(diff_days=dfcud['ClosedDate'] - dfcud['UpdatedDate'])

In [28]:
# Distribution of the day gaps between ClosedDate and UpdatedDate (most frequent first).
dfcud.diff_days.value_counts()

-1 days      186937
-2 days       14659
-3 days        7829
-4 days        3246
-5 days        1245
              ...  
-247 days         1
-540 days         1
-507 days         1
-414 days         1
-834 days         1
Name: diff_days, Length: 288, dtype: int64

In [36]:
# Number of records updated more than one calendar day after they were closed:
# all different-date records minus those exactly one day apart (diff_days == -1 days).
# Computed from the data instead of hand-copied totals so it stays correct on re-run.
one_day_gap = dfcud['diff_days'] == pd.Timedelta(days=-1)
len(dfcud) - int(one_day_gap.sum())

30973

In [37]:
# Percentage of all records that were updated more than one calendar day after being
# closed (diff_days below -1 days). Computed from the frames instead of hand-copied counts.
more_than_one_day = dfcud['diff_days'] < pd.Timedelta(days=-1)
100 * float(more_than_one_day.sum()) / len(df311_2020)

2.1850024726160417

<font size=4 color='DarkBlue'> Most of these are within a few days of each other.  If the premise holds that ServiceDate < CreatedDate results from workers doing the work as they see it, inputting the info into the app as they go and accepting a default, that would explain these numbers as well. (And the data could be left in the dataset without warping the outcome.) In addition, once the hours, minutes and seconds are removed and all rows where the UpdatedDate and ClosedDate are within a day of each other are removed, the remaining amount is 2.2% of all records, which is negligible.