# Austin-3-1-1 EDA

![](imgs/austintexas-gov.png)

In [1]:
%matplotlib inline

import numpy as np # Linear algebra lib
import pandas as pd # Data analysis lib
import matplotlib.pyplot as plt # plotting lib
import seaborn as sns # matplotlib wrapper plotting lib
import plotly.graph_objects as go # to customize plotly
import plotly_express as px

# Raise pandas' display limits so long/wide frames are shown in full instead
# of being elided with '...'. set_option accepts several (name, value) pairs
# in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Matplotlib and Seaborn params
from matplotlib import rcParams
# Default figure size (width, height) in inches for every matplotlib figure.
rcParams.update({'figure.figsize': (10, 6)})
# Seaborn defaults: colorblind-safe palette, sizing tuned for notebook display.
sns.set_palette('colorblind')
sns.set_context('notebook')

# Warnings module (only imported here; no warning filter is configured in this cell)
import warnings

## Load data

In [2]:
# Uncomment the lines below to run them as shell commands: they remove any
# existing copy of the CSV, download it again into raw_data/, and then list
# the directory, show the first/last lines of the file and count its lines.
# ! rm -f raw_data/'311_Unified_Data.csv'
# !wget 'https://austin-311-data.s3.us-east-2.amazonaws.com/311_Unified_Data.csv' -P raw_data
# !ls -lh raw_data
# !head raw_data/'311_Unified_Data.csv'
# !tail raw_data/'311_Unified_Data.csv'
# !wc -l raw_data/'311_Unified_Data.csv'

## Clean Data

Start by loading the dataset

In [3]:
# Read the raw 311 export into a DataFrame. low_memory=False makes pandas
# parse the file in one pass rather than in chunks when inferring dtypes.
df = pd.read_csv(
    filepath_or_buffer='raw_data/311_Unified_Data.csv',
    low_memory=False,
)
# Show the first five rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Service Request (SR) Number,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Status Change Date,Created Date,Last Update Date,Close Date,SR Location,Street Number,Street Name,City,Zip Code,County,State Plane X Coordinate,State Plane Y Coordinate,Latitude Coordinate,Longitude Coordinate,(Latitude.Longitude),Council District,Map Page,Map Tile
0,16-00107769,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:11:57 PM,05/06/2016 12:01:46 PM,05/06/2016 12:11:57 PM,05/06/2016 12:11:57 PM,"WINDSOR RD & EXPOSITION BLVD, AUSTIN, TX 78703",,WINDSOR RD & EXPOSITION BLVD,AUSTIN,78703.0,TRAVIS,3106038.49745799,10080980.0,30.296699,-97.768264,"(30.29669887, -97.76826374)",10.0,584C,MH24
1,16-00108244,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,"6001 MANCHACA RD, AUSTIN, TX 78745",6001.0,MANCHACA,AUSTIN,78745.0,TRAVIS,3096240.5,10050190.0,30.212695,-97.801521,"(30.2126949, -97.8015215)",5.0,643M,MF17
2,16-00108269,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,"6001 MANCHACA RD, AUSTIN, TX 78745",6001.0,MANCHACA,AUSTIN,78745.0,TRAVIS,3096240.5,10050190.0,30.212695,-97.801521,"(30.2126949, -97.8015215)",5.0,643M,MF17
3,16-00324071,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 09:05:43 AM,12/15/2016 06:41:40 AM,12/15/2016 09:05:43 AM,12/15/2016 09:05:43 AM,"2200 E OLTORF ST, AUSTIN, TX 78741",2200.0,OLTORF,AUSTIN,78741.0,TRAVIS,3118116.50749136,10057050.0,30.230164,-97.731776,"(30.23016411, -97.73177647)",3.0,615X,MJ19
4,16-00108062,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/10/2016 04:56:52 PM,05/06/2016 05:03:45 PM,05/10/2016 04:56:52 PM,05/10/2016 04:56:52 PM,"8401 N CAPITAL OF TEXAS HWY NB, AUSTIN, TX 78759",8401.0,CAPITAL OF TEXAS,AUSTIN,78759.0,TRAVIS,3105863.35797435,10113090.0,30.384989,-97.766471,"(30.38498865, -97.76647071)",10.0,494L,MH32


In [4]:
# df.shape is (rows, cols); unpack it straight into the two placeholders.
print('This dataset has number of rows {}, number of cols {}'.format(*df.shape))

This dataset has number of rows 747986, number of cols 24


In [5]:
# Print each column's dtype and non-null count to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747986 entries, 0 to 747985
Data columns (total 24 columns):
Service Request (SR) Number    743752 non-null object
SR Type Code                   743751 non-null object
SR Description                 743751 non-null object
Owning Department              743751 non-null object
Method Received                743751 non-null object
SR Status                      743751 non-null object
Status Change Date             737819 non-null object
Created Date                   737819 non-null object
Last Update Date               737819 non-null object
Close Date                     695220 non-null object
SR Location                    738374 non-null object
Street Number                  592926 non-null object
Street Name                    738287 non-null object
City                           737345 non-null object
Zip Code                       735561 non-null float64
County                         731471 non-null object
State Plane X Coordinate

For our analysis we don't need the following columns:
 
  - `Service Request (SR) Number`
  - `Map Page`
  - `Map Tile`
  - `State Plane X Coordinate`
  - `State Plane Y Coordinate`
  - `Street Number`
  - `Street Name`
  - `SR Location`
  - `(Latitude.Longitude)`
  - `Council District` *(note: the drop cell below does not include this column, so it remains in the resulting frame)*

### Drop unnecessary columns and empty rows

In [6]:
# Columns that are not used in this analysis (identifiers, map-grid
# references, street-level address parts and redundant coordinate encodings).
columns = [
    'Map Page',
    'Map Tile',
    'Service Request (SR) Number',
    'SR Location',
    'Street Number',
    'Street Name',
    'State Plane X Coordinate',
    'State Plane Y Coordinate',
    '(Latitude.Longitude)',
]
# Drop them by column label and rebind df to the narrower frame.
df = df.drop(columns=columns)
df.head()

Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Status Change Date,Created Date,Last Update Date,Close Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate,Council District
0,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:11:57 PM,05/06/2016 12:01:46 PM,05/06/2016 12:11:57 PM,05/06/2016 12:11:57 PM,AUSTIN,78703.0,TRAVIS,30.296699,-97.768264,10.0
1,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521,5.0
2,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521,5.0
3,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 09:05:43 AM,12/15/2016 06:41:40 AM,12/15/2016 09:05:43 AM,12/15/2016 09:05:43 AM,AUSTIN,78741.0,TRAVIS,30.230164,-97.731776,3.0
4,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/10/2016 04:56:52 PM,05/06/2016 05:03:45 PM,05/10/2016 04:56:52 PM,05/10/2016 04:56:52 PM,AUSTIN,78759.0,TRAVIS,30.384989,-97.766471,10.0


### Check for missing values and drop rows that are missing important info.

In [7]:
# Remove only the rows in which every column is NaN (how='all');
# how='any' would instead remove every row containing at least one NaN.
df = df.dropna(axis='index', how='all')
# Count the remaining missing values per column.
df.isna().sum()

SR Type Code                0
SR Description              0
Owning Department           0
Method Received             0
SR Status                   0
Status Change Date       5932
Created Date             5932
Last Update Date         5932
Close Date              48531
City                     6406
Zip Code                 8190
County                  12280
Latitude Coordinate      5451
Longitude Coordinate     5451
Council District        41746
dtype: int64

After dropping the rows in which every value was `NaN`, we can now drop the rows that are missing the information needed to stratify complaints by location.

#### Missing Zip Code and County and City

In [8]:
# Rebind df to an independent copy of itself.
# NOTE(review): presumably done to avoid chained-assignment warnings on later edits — confirm.
df = df.copy()

In [12]:
# Display the current (rows, columns) of df.
df.shape

(743751, 15)

In [10]:
# Preview the requests for which City, Zip Code and County are all missing.
df[df[['City', 'Zip Code', 'County']].isna().all(axis=1)].head()

Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Status Change Date,Created Date,Last Update Date,Close Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate,Council District
27,PARMACIS,Parking Machine Issue,Transportation,Phone,Closed,08/08/2016 07:54:58 AM,08/07/2016 03:46:47 PM,08/08/2016 07:54:58 AM,08/08/2016 07:54:58 AM,,,,,,
74,PARMACIS,Parking Machine Issue,Transportation,Phone,Closed,06/13/2016 06:15:32 AM,06/09/2016 10:10:59 PM,06/13/2016 06:15:33 AM,06/13/2016 06:15:32 AM,,,,,,
189,PARMACIS,Parking Machine Issue,Transportation,Phone,Closed,09/14/2016 06:44:40 AM,09/13/2016 08:40:50 AM,09/14/2016 06:44:41 AM,09/14/2016 06:44:40 AM,,,,,,
402,PARMACIS,Parking Machine Issue,Transportation,Phone,Closed,07/20/2016 06:48:00 AM,07/19/2016 05:14:00 PM,07/20/2016 06:48:00 AM,07/20/2016 06:48:00 AM,,,,,,
944,AESTRPO1,Street Light Issue- Multiple poles/multiple st...,Austin Energy Department,Phone,Closed,05/06/2016 08:17:43 AM,05/02/2016 05:32:08 AM,05/06/2016 08:17:44 AM,05/06/2016 08:17:43 AM,,,,,,


In [13]:
# (rows, columns) of the subset lacking City, Zip Code and County altogether.
df[df[['City', 'Zip Code', 'County']].isna().all(axis=1)].shape

(5690, 15)

In [11]:
# Preview the rows that have at least one of City / Zip Code / County populated.
# Bounded with .head(10) so the cell shows a sample instead of rendering the
# whole filtered frame; df itself is not modified here.
df.loc[df[['City', 'Zip Code', 'County']].notnull().values.any(axis=1)].head(10)

Unnamed: 0,SR Type Code,SR Description,Owning Department,Method Received,SR Status,Status Change Date,Created Date,Last Update Date,Close Date,City,Zip Code,County,Latitude Coordinate,Longitude Coordinate,Council District
0,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 12:11:57 PM,05/06/2016 12:01:46 PM,05/06/2016 12:11:57 PM,05/06/2016 12:11:57 PM,AUSTIN,78703.0,TRAVIS,30.296699,-97.768264,10.0
1,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,05/06/2016 07:51:27 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521,5.0
2,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,05/06/2016 08:22:56 PM,AUSTIN,78745.0,TRAVIS,30.212695,-97.801521,5.0
3,SWSDEADA,ARR Dead Animal Collection,Austin Resource Recovery,Phone,Closed,12/15/2016 09:05:43 AM,12/15/2016 06:41:40 AM,12/15/2016 09:05:43 AM,12/15/2016 09:05:43 AM,AUSTIN,78741.0,TRAVIS,30.230164,-97.731776,3.0
4,TRASIGMA,Traffic Signal - Maintenance,Transportation,Phone,Duplicate (closed),05/10/2016 04:56:52 PM,05/06/2016 05:03:45 PM,05/10/2016 04:56:52 PM,05/10/2016 04:56:52 PM,AUSTIN,78759.0,TRAVIS,30.384989,-97.766471,10.0
5,STREETL2,Street Light Issue- Address,Austin Energy Department,Phone,Closed,05/10/2016 06:58:01 AM,05/06/2016 10:28:16 AM,05/10/2016 06:58:01 AM,05/10/2016 06:58:01 AM,AUSTIN,78703.0,TRAVIS,30.268090,-97.751739,9.0
6,STREETL2,Street Light Issue- Address,Austin Energy Department,Phone,Closed,05/11/2016 04:41:32 PM,05/06/2016 10:19:24 AM,05/11/2016 04:41:33 PM,05/11/2016 04:41:32 PM,AUSTIN,78741.0,TRAVIS,30.232593,-97.702577,3.0
7,STREETL2,Street Light Issue- Address,Austin Energy Department,Phone,Closed,05/11/2016 04:42:25 PM,05/06/2016 10:29:40 AM,05/11/2016 04:42:26 PM,05/11/2016 04:42:25 PM,AUSTIN,78744.0,TRAVIS,30.200030,-97.693940,2.0
8,STREETL2,Street Light Issue- Address,Austin Energy Department,Spot311 Interface,Closed,05/11/2016 04:43:10 PM,05/06/2016 10:50:02 AM,05/11/2016 04:43:11 PM,05/11/2016 04:43:10 PM,AUSTIN,78704.0,TRAVIS,30.259790,-97.768425,5.0
9,STREETL2,Street Light Issue- Address,Austin Energy Department,Phone,Closed,05/11/2016 04:43:54 PM,05/06/2016 11:12:06 AM,05/11/2016 04:43:55 PM,05/11/2016 04:43:54 PM,AUSTIN,78702.0,TRAVIS,30.260038,-97.731196,3.0


In [None]:
# Keep every request that has at least one of City / Zip Code / County:
# dropna(subset=..., how='all') removes only the rows where all three are NaN.
df = df.dropna(subset=['City', 'Zip Code', 'County'], how='all')
# Re-check dtypes and non-null counts after the filter.
df.info()

#### Missing County

#### Missing City