# Austin-3-1-1 EDA

![](imgs/austintexas-gov.png)

In [34]:
%matplotlib inline

import numpy as np # Linear algebra lib
import pandas as pd # Data analysis lib
import matplotlib.pyplot as plt # plotting lib
import seaborn as sns # matplotlib wrapper plotting lib
import plotly.graph_objects as go # to customize plotly
import plotly_express as px

# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Matplotlib and Seaborn params
from matplotlib import rcParams
rcParams['figure.figsize'] = 10, 6
sns.set_palette('colorblind') # Makes it colorblind safe plots
sns.set_context('notebook')

# Ignore the warnings
import warnings

## Load data

In [35]:
# Toggle Comments to run
# ! rm -f raw_data/'311_Unified_Data.csv'
# !wget 'https://austin-311-data.s3.us-east-2.amazonaws.com/311_Unified_Data.csv' -P raw_data
# !ls -lh raw_data
# !head raw_data/'311_Unified_Data.csv'
# !tail raw_data/'311_Unified_Data.csv'
# !wc -l raw_data/'311_Unified_Data.csv'

## Clean Data

Start by loading the dataset

In [36]:
df = pd.read_csv('raw_data/311_Unified_Data.csv', low_memory=False)
df.head()

Unnamed: 0,Service Request (SR) Number,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Status Change Date,Created Date,Last Update Date,Close Date,SR Location,Street Number,Street Name,City,Zip Code,County,State Plane X Coordinate,State Plane Y Coordinate,Latitude Coordinate,Longitude Coordinate,(Latitude.Longitude),Council District,Map Page,Map Tile
0,16-00107769,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:11:57 PM,05/06/2016 12:01:46 PM,05/06/2016 12:11:57 PM,05/06/2016 12:11:57 PM,"WINDSOR RD & EXPOSITION BLVD, AUSTIN, TX 78703",,WINDSOR RD & EXPOSITION BLVD,AUSTIN,78703.0,TRAVIS,3106038.49745799,10080980.0,30.296699,-97.768264,"(30.29669887, -97.76826374)",10.0,584C,MH24
1,16-00108244,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,"6001 MANCHACA RD, AUSTIN, TX 78745",6001.0,MANCHACA,AUSTIN,78745.0,TRAVIS,3096240.5,10050190.0,30.212695,-97.801521,"(30.2126949, -97.8015215)",5.0,643M,MF17
2,16-00108269,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,"6001 MANCHACA RD, AUSTIN, TX 78745",6001.0,MANCHACA,AUSTIN,78745.0,TRAVIS,3096240.5,10050190.0,30.212695,-97.801521,"(30.2126949, -97.8015215)",5.0,643M,MF17
3,16-00324071,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 09:05:43 AM,12/15/2016 06:41:40 AM,12/15/2016 09:05:43 AM,12/15/2016 09:05:43 AM,"2200 E OLTORF ST, AUSTIN, TX 78741",2200.0,OLTORF,AUSTIN,78741.0,TRAVIS,3118116.50749136,10057050.0,30.230164,-97.731776,"(30.23016411, -97.73177647)",3.0,615X,MJ19
4,16-00108062,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/10/2016 04:56:52 PM,05/06/2016 05:03:45 PM,05/10/2016 04:56:52 PM,05/10/2016 04:56:52 PM,"8401 N CAPITAL OF TEXAS HWY NB, AUSTIN, TX 78759",8401.0,CAPITAL OF TEXAS,AUSTIN,78759.0,TRAVIS,3105863.35797435,10113090.0,30.384989,-97.766471,"(30.38498865, -97.76647071)",10.0,494L,MH32


In [37]:
print('This dataset has number of rows {}, number of cols {}'.format(df.shape[0], df.shape[1]))

This dataset has number of rows 747986, number of cols 24


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747986 entries, 0 to 747985
Data columns (total 24 columns):
Service Request (SR) Number    743752 non-null object
SR Type Code                   743751 non-null object
SR Description                 743751 non-null object
Owning Department              743751 non-null object
Method Received                743751 non-null object
SR Status                      743751 non-null object
Status Change Date             737819 non-null object
Created Date                   737819 non-null object
Last Update Date               737819 non-null object
Close Date                     695220 non-null object
SR Location                    738374 non-null object
Street Number                  592926 non-null object
Street Name                    738287 non-null object
City                           737345 non-null object
Zip Code                       735561 non-null float64
County                         731471 non-null object
State Plane X Coordinate

For our analysis we don't need following columns:
 
  - `Service Request (SR) Number`
  - `Status Change Date`
  - `Last Update Date`
  - `Close Date`
  - `Map Page`
  - `Map Tile`
  - `State Plane X Coordinate`
  - `State Plane Y Coordinate`
  - `Street Number`
  - `Street Name`
  - `SR Location`
  - `(Latitude.Longitude)`
  - `Council District`

### Drop unecessary columns and empty rows

In [39]:
columns = ['Council District', 'Map Page', 'Map Tile', 'Service Request (SR) Number', 'Status Change Date', 'Last Update Date', 'Close Date', 'SR Location', 'Street Number', 'Street Name', 'State Plane X Coordinate', 'State Plane Y Coordinate', '(Latitude.Longitude)']
df = df.drop(columns, axis=1)
df.head()

Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Created Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate
0,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:01:46 PM,AUSTIN,78703.0,TRAVIS,30.296699,-97.768264
1,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521
2,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521
3,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 06:41:40 AM,AUSTIN,78741.0,TRAVIS,30.230164,-97.731776
4,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 05:03:45 PM,AUSTIN,78759.0,TRAVIS,30.384989,-97.766471


### Check for missing values and drop rows that are missing important info.

In [40]:
df.isnull().sum()

SR Type Code             4235
SR Description           4235
Owning Department        4235
Method Received          4235
SR Status                4235
Created Date            10167
City                    10641
Zip Code                12425
County                  16515
Latitude Coordinate      9686
Longitude Coordinate     9686
dtype: int64

So, there are lot missing values, mostly due to empty rows, let's drop those rows now.

In [41]:
df = df.dropna(how='all') # how='all' drops rows that have all NaN values, whereas, 'any' will drop any row that has NaN present
df.isnull().sum()

SR Type Code                0
SR Description              0
Owning Department           0
Method Received             0
SR Status                   0
Created Date             5932
City                     6406
Zip Code                 8190
County                  12280
Latitude Coordinate      5451
Longitude Coordinate     5451
dtype: int64

After dropping rows that had all `NaN` values, now we can drop those that having missing valuable information needed to stratify complaints by their location info. 



#### Missing all location data (i.e. Zip Code and County and City)

Let's drop rows that contain all of the missing location info.

In [42]:
print('Before dimensions: ', df.shape)
df = df.loc[df[['City', 'Zip Code', 'County']].notnull().values.any(axis=1)]
print('After dimensions: ', df.shape)
df.head()

Before dimensions:  (743751, 11)
After dimensions:  (738061, 11)


Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Created Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate
0,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:01:46 PM,AUSTIN,78703.0,TRAVIS,30.296699,-97.768264
1,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521
2,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521
3,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 06:41:40 AM,AUSTIN,78741.0,TRAVIS,30.230164,-97.731776
4,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 05:03:45 PM,AUSTIN,78759.0,TRAVIS,30.384989,-97.766471


In [43]:
df.isnull().sum()

SR Type Code               0
SR Description             0
Owning Department          0
Method Received            0
SR Status                  0
Created Date            5889
City                     716
Zip Code                2500
County                  6590
Latitude Coordinate       70
Longitude Coordinate      70
dtype: int64

After eliminating rows with no location information, we can now dive into missing `Zip Code` values, as they are important to our analysis.

#### Only Missing Zipcodes

In [44]:
df[df[['Zip Code']].isnull().values].sample(10)

Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Created Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate
733961,PWBICYCL,Bicycle Issues,Transportation,Phone,Open,07/20/2019 11:49:29 AM,AUSTIN,,TRAVIS,30.279558,-97.720861
738416,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Work In Progress,07/20/2019 09:32:49 PM,AUSTIN,,TRAVIS,30.233035,-97.800625
729930,CODECOMP,Austin Code - Request Code Officer,Austin Code Department,Spot311 Interface,Closed,07/19/2019 09:27:17 PM,AUSTIN,,TRAVIS,30.254674,-97.722806
728100,CODECOMP,Austin Code - Request Code Officer,Austin Code Department,Phone,Closed,07/19/2019 11:47:56 AM,AUSTIN,,TRAVIS,30.236635,-97.90138
733939,SIGNTRAF,Sign - Traffic Sign Emergency,Transportation,Phone,Closed,07/20/2019 05:46:16 AM,AUSTIN,,TRAVIS,30.279638,-97.734515
738407,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Closed,07/20/2019 05:43:18 PM,AUSTIN,,TRAVIS,30.285732,-97.665957
64485,TRASIGNE,Traffic Signal - New/Change,Transportation,Web,Open,11/15/2016 08:17:30 PM,Austin,,,3.442387,-105.983195
36884,WWREPORT,Water Waste Report,Austin Water Utility,Field Request,Closed,06/27/2016 04:53:39 PM,AUSTIN,,,3.442387,-105.983195
696650,SIGNTRAF,Sign - Traffic Sign Emergency,Transportation,Phone,Closed,11/29/2018 10:44:30 PM,AUSTIN,,,30.373967,-97.72891
499678,STREETL2,Street Light Issue- Address,Austin Energy Department,Phone,Closed,07/18/2019 09:42:53 PM,AUSTIN,,TRAVIS,30.228265,-97.862981


Since `Austin` is in `Travis` County, we can fill in those columns that has `Austin` in them, while we can map missing city with `Zip codes` known to `Austin`, but we can't do vice-versa, maybe reverse geocoding API can help in getting `Zipcode` info from `Latitude & Longitude` coordinates. 

### Clean `text` columns

#### Strip leading and trailing whitespaces first from entire dataframe

##### Before Stripping whitespaces

In [45]:
df.isnull().sum()

SR Type Code               0
SR Description             0
Owning Department          0
Method Received            0
SR Status                  0
Created Date            5889
City                     716
Zip Code                2500
County                  6590
Latitude Coordinate       70
Longitude Coordinate      70
dtype: int64

In [46]:
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.replace('', np.NaN, inplace=True)

##### After stripping whitespaces

In [47]:
df.isnull().sum()

SR Type Code               0
SR Description             0
Owning Department          0
Method Received            0
SR Status                  0
Created Date            5889
City                     717
Zip Code                2500
County                  6590
Latitude Coordinate       70
Longitude Coordinate      70
dtype: int64

#### Let's start by normalizing `City` columns to `Title Case`, and fix any misspellings.

In [48]:
df['City'] = df['City'].str.title()
df['City'].value_counts(dropna=False).sort_values(ascending=False)

Austin                      722829
Pflugerville                  3137
Del Valle                     2248
Manor                         1756
Austin 5 Etj                  1242
Cedar Park                     807
NaN                            717
Bee Cave                       444
Round Rock                     438
Leander                        429
Sunset Valley                  422
Lakeway                        367
Other                          363
Elgin                          282
Mustang Ridge                  270
West Lake Hills                261
Manchaca                       212
West Lake Hill                 167
Spicewood                      155
Lago Vista                     155
Buda                           144
Rollingwood                    138
Jonestown                      125
Webberville                    117
Creedmoor                      116
Travis                         111
Briarcliff                     100
Dripping Springs                52
Cedar Creek         

Not so surprisingly we have so many typos, and misspellings, which we can fix, but surprisingly there are calls from cities that are miles apart like `Houston`, `Dallas`, etc.., so let's keep those that are only from `City of Austin` in Travis County which is our main focus.

##### Austin and it's Extraterritorial Jurisdiction

Another thing to note is `Austin's 5 ETJ's` (Extraterritorial Jurisdiction) refers to cities, is the legal capability of a municipality to exercise authority beyond the boundaries of its incorporated area. In the US, Texas is one of the states that by law allow cities to claim ETJ to contiguous land beyond their city limits.  Austin’s ETJ currently extends into 4 counties including Williamson, Travis, Hays, and Bastrop.

In [49]:
df.loc[df['City'].str.contains('Austin 5 Etj', na=False), 'County'].value_counts(dropna=False)

TRAVIS     1217
HAYS          9
NaN           9
BASTROP       7
Name: County, dtype: int64

In [57]:
df.loc[df['City'].str.contains('Austin 5 Etj', na=False), 'Zip Code'].value_counts(dropna=False).sort_index()

 78610.0     26
 78612.0     45
 78617.0    171
 78641.0     11
 78645.0      1
 78653.0    315
 78660.0      2
 78719.0     72
 78724.0     16
 78725.0    328
 78733.0     10
 78734.0    103
 78735.0     31
 78736.0     25
 78737.0     43
 78738.0      5
 78747.0     35
 78758.0      1
NaN           2
Name: Zip Code, dtype: int64

In [None]:
df[df['Zip Code']

In [67]:
df[df['Zip Code'].isin(zips)]['Zip Code'].value_counts(dropna=False).sort_index()

78610.0      476
78617.0     8588
78653.0     3447
78660.0     6025
78701.0    33394
78702.0    40693
78703.0    26730
78704.0    50037
78705.0    21695
78721.0    16356
78722.0     8674
78723.0    35058
78724.0    16202
78725.0     3393
78726.0     4013
78727.0    13563
78728.0     2724
78729.0     8474
78730.0     2748
78731.0    21898
78732.0     1781
78733.0      956
78734.0     1827
78735.0     8519
78736.0     3501
78737.0      749
78738.0      876
78739.0     7883
78741.0    31891
78742.0     1310
78744.0    36287
78745.0    50418
78746.0    12651
78747.0     7668
78748.0    26595
78749.0    21594
78750.0    11929
78751.0    18728
78752.0    14720
78753.0    32058
78754.0    10915
78756.0    10821
78757.0    23524
78758.0    35347
78759.0    24559
Name: Zip Code, dtype: int64

Looks like there is considerable complaints from surrounding territories that are under City of Austin's jurisdiction so we will keep them, but only those that are in TRAVIS county.

In [56]:
zips = """
78701,78702,78703,78704,78705,
78721,78722,78723,78724,78725,
78726,78727,78728,78729,78730,
78731,78732,78733,78734,78735,
78736,78737,78738,78739,78741,
78742,78744,78745,78746,78747,
78748,78749,78750,78751,78752,
78753,78754,78756,78757,78758,
78759,78610,78617,78653,78660
""".split(',')
zips = [x.strip() for x in zips]
zips

['78701',
 '78702',
 '78703',
 '78704',
 '78705',
 '78721',
 '78722',
 '78723',
 '78724',
 '78725',
 '78726',
 '78727',
 '78728',
 '78729',
 '78730',
 '78731',
 '78732',
 '78733',
 '78734',
 '78735',
 '78736',
 '78737',
 '78738',
 '78739',
 '78741',
 '78742',
 '78744',
 '78745',
 '78746',
 '78747',
 '78748',
 '78749',
 '78750',
 '78751',
 '78752',
 '78753',
 '78754',
 '78756',
 '78757',
 '78758',
 '78759',
 '78610',
 '78617',
 '78653',
 '78660']

In [64]:
df[df['Zip Code'].isin(zips)]['City'].value_counts()

Austin                      710178
Pflugerville                  3014
Del Valle                     2232
Manor                         1724
Austin 5 Etj                  1111
Bee Cave                       442
Sunset Valley                  421
Lakeway                        315
West Lake Hills                261
Mustang Ridge                  170
West Lake Hill                 167
Buda                           142
Rollingwood                    136
Creedmoor                      116
Travis                         111
Webberville                     88
Other                           46
The Hills                       43
Round Rock                      42
Austn                           23
Cedar Park                      23
San Leanna                      18
Dripping Sprin                  15
Dripping Springs                15
Hays                            11
Manchaca                         9
Lago Vista                       9
West Lake Hil                    8
Aus                 

In [65]:
df[df['Zip Code'].isin(zips)]['County'].value_counts()

TRAVIS        702422
WILLIAMSON     13758
HAYS              76
BASTROP           19
Name: County, dtype: int64

In [26]:
print('Before dimensions: ', df.shape)
df.loc[df['City'].str.contains('Austin 5 Etj', na=False), :]
print('After dimensions: ', df.shape)
df['City'].value_counts(dropna=False).sort_values(ascending=False)

Before dimensions:  (736819, 11)
After dimensions:  (736819, 11)


Austin                      722875
Pflugerville                  3137
Del Valle                     2248
Manor                         1756
Cedar Park                     807
NaN                            717
Bee Cave                       444
Round Rock                     438
Leander                        429
Sunset Valley                  422
Lakeway                        367
Other                          363
Elgin                          282
Mustang Ridge                  270
West Lake Hills                261
Manchaca                       212
West Lake Hill                 167
Lago Vista                     155
Spicewood                      155
Buda                           144
Rollingwood                    138
Jonestown                      125
Webberville                    117
Creedmoor                      116
Travis                         111
Briarcliff                     100
Dripping Springs                52
Cedar Creek                     47
The Hills           

#### Fix `Austin` related typos and misspellings

In [27]:
print('Before dimensions: ', df.shape)
df.loc[(df['City'] != 'Austin 5 Etj') & df['City'].str.startswith('Aus', na=False).values, 'City'] = 'Austin'
print('After dimensions: ', df.shape)
df['City'].value_counts(dropna=False).sort_values(ascending=False)

Before dimensions:  (736819, 11)
After dimensions:  (736819, 11)


Austin                      722875
Pflugerville                  3137
Del Valle                     2248
Manor                         1756
Cedar Park                     807
NaN                            717
Bee Cave                       444
Round Rock                     438
Leander                        429
Sunset Valley                  422
Lakeway                        367
Other                          363
Elgin                          282
Mustang Ridge                  270
West Lake Hills                261
Manchaca                       212
West Lake Hill                 167
Lago Vista                     155
Spicewood                      155
Buda                           144
Rollingwood                    138
Jonestown                      125
Webberville                    117
Creedmoor                      116
Travis                         111
Briarcliff                     100
Dripping Springs                52
Cedar Creek                     47
The Hills           

#### Assert Zipcodes that are to Austin only, fill NaN values in City with Austin based on Zip Codes

In [28]:
df.loc[df['City'] == 'Austin', 'Zip Code'].value_counts(dropna=False).sort_index()

 78610.0       84
 78612.0       45
 78613.0      724
 78617.0     6085
 78620.0       17
 78621.0       45
 78641.0       79
 78645.0        1
 78652.0      588
 78653.0     1235
 78654.0        6
 78660.0     3089
 78669.0       40
 78681.0       24
 78701.0    33380
 78702.0    40679
 78703.0    26712
 78704.0    50008
 78705.0    21683
 78712.0      749
 78717.0     6809
 78719.0     1101
 78721.0    16352
 78722.0     8670
 78723.0    35046
 78724.0    16179
 78725.0     3064
 78726.0     4009
 78727.0    13442
 78728.0     2706
 78729.0     8467
 78730.0     2747
 78731.0    21893
 78732.0     1779
 78733.0      946
 78734.0     1489
 78735.0     8414
 78736.0     3462
 78737.0      681
 78738.0      274
 78739.0     7875
 78741.0    31883
 78742.0     1309
 78744.0    36270
 78745.0    50127
 78746.0    12074
 78747.0     7572
 78748.0    26558
 78749.0    21488
 78750.0    11913
 78751.0    18721
 78752.0    14712
 78753.0    32042
 78754.0    10910
 78756.0    10815
 78757.0  

In [29]:
print('Dimensions of data frame: ', df.shape)

Dimensions of data frame:  (736819, 11)


In [30]:
austin_df = df.loc[df['City'] == 'Austin', :]
austin_df = austin_df.reset_index(drop=True)
austin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722875 entries, 0 to 722874
Data columns (total 11 columns):
SR Type Code            722875 non-null object
SR Description          722875 non-null object
Owning Department       722875 non-null object
Method Received         722875 non-null object
SR Status               722875 non-null object
Created Date            717221 non-null object
City                    722875 non-null object
Zip Code                720460 non-null float64
County                  716672 non-null object
Latitude Coordinate     722843 non-null float64
Longitude Coordinate    722843 non-null float64
dtypes: float64(3), object(8)
memory usage: 60.7+ MB


In [31]:
austin_df.shape

(722875, 11)

In [32]:
df.loc[~(df['City'] == 'Austin'), :].shape

(13944, 11)

In [33]:
df.loc[df['City'].str.startswith('Aus', na=False).values, ['City', 'Zip Code']].groupby(by=['City', 'Zip Code']).size()

City    Zip Code
Austin  78610.0        84
        78612.0        45
        78613.0       724
        78617.0      6085
        78620.0        17
        78621.0        45
        78641.0        79
        78645.0         1
        78652.0       588
        78653.0      1235
        78654.0         6
        78660.0      3089
        78669.0        40
        78681.0        24
        78701.0     33380
        78702.0     40679
        78703.0     26712
        78704.0     50008
        78705.0     21683
        78712.0       749
        78717.0      6809
        78719.0      1101
        78721.0     16352
        78722.0      8670
        78723.0     35046
        78724.0     16179
        78725.0      3064
        78726.0      4009
        78727.0     13442
        78728.0      2706
        78729.0      8467
        78730.0      2747
        78731.0     21893
        78732.0      1779
        78733.0       946
        78734.0      1489
        78735.0      8414
        78736.0      