In [84]:
import math
import os
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy as sa
from sqlalchemy import Table, MetaData, Column, Integer, create_engine

## Build and Establish Connection to DB

In [2]:
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook. Set all four before starting the
# kernel; a missing variable raises KeyError naming the variable.
server = os.environ['EVICTIONS_DB_SERVER']
database = os.environ['EVICTIONS_DB_NAME']
username = os.environ['EVICTIONS_DB_USER']
password = os.environ['EVICTIONS_DB_PASSWORD']

# URL-encode the raw ODBC connection string so it can be handed to SQLAlchemy
# through the `odbc_connect` query parameter.
params = urllib.parse.quote_plus('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)

engine = sa.create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

In [3]:
# Open a connection on the engine; the pandas read_sql_query calls in this
# notebook run their queries through it.
# NOTE(review): the connection is not closed anywhere in the cells shown.
connection = engine.connect()

## Load Raw Eviction Notices Data

In [5]:
# Query text: every row and column of the raw eviction notices table.
get_evictions = """
SELECT *
FROM SF_Eviction_Notices_Raw;
"""

# Run the query over the open connection and display the resulting frame.
evictions = pd.read_sql_query(sql=get_evictions, con=connection)
evictions

Unnamed: 0,id,eviction_id,address,city,state,zip,file_date,non_payment,breach,nuisance,...,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,supervisor_district,neighborhood,shape,constraints_date,client_location
0,0,M220284,2000 Block Of Broadway Street,San Francisco,CA,94123,2022-02-18T00:00:00.000,False,True,False,...,False,False,False,False,False,2,Pacific Heights,"{'type': 'Point', 'coordinates': [-122.430824,...",,
1,1,M220262,500 Block Of Beale Street,San Francisco,CA,94105,2022-02-14T00:00:00.000,False,True,False,...,False,False,False,False,False,6,Financial District/South Beach,"{'type': 'Point', 'coordinates': [-122.38909, ...",,
2,2,M220049,2000 Block Of Broadway Street,San Francisco,CA,94123,2022-01-07T00:00:00.000,False,True,False,...,False,False,False,False,False,2,Pacific Heights,"{'type': 'Point', 'coordinates': [-122.430824,...",,
3,3,M211708,100 Block Of Bartlett Street,San Francisco,CA,94115,2021-11-12T00:00:00.000,False,False,True,...,False,False,False,False,False,9,Mission,"{'type': 'Point', 'coordinates': [-122.41979, ...",,
4,4,M211587,700 Block Of Fillmore Street,San Francisco,CA,94134,2021-10-25T00:00:00.000,False,False,True,...,False,False,False,False,False,5,Hayes Valley,"{'type': 'Point', 'coordinates': [-122.431305,...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43388,43392,S000967,1200 Block Of 04th Avenue,San Francisco,CA,94107,1997-12-08T00:00:00.000,False,False,False,...,False,False,False,False,False,5,Inner Sunset,"{'type': 'Point', 'coordinates': [-122.46107, ...",,
43389,43388,E992050,900 Block Of Mission Street,San Francisco,CA,94103,1999-10-12T00:00:00.000,False,False,False,...,False,False,False,False,False,6,South of Market,"{'type': 'Point', 'coordinates': [-122.40798, ...",,"{'latitude': '37.78154530291944', 'longitude':..."
43390,43389,RE01899,500 Block Of Wisconsin Street,San Francisco,CA,94107,1997-05-21T00:00:00.000,False,False,False,...,False,False,False,False,False,10,Potrero Hill,"{'type': 'Point', 'coordinates': [-122.39924, ...",,"{'latitude': '37.76048824432999', 'longitude':..."
43391,43390,RE01981,200 Block Of Central Avenue,San Francisco,CA,94117,1997-06-03T00:00:00.000,False,False,False,...,False,False,False,False,False,5,Haight Ashbury,"{'type': 'Point', 'coordinates': [-122.44393, ...",,"{'latitude': '37.77182670111496', 'longitude':..."


## Pull Target Rel_Eviction_Notices Table Structure

In [7]:
# Query text: one row per column of the Rel_Eviction_Notices table, taken from
# the SQL Server catalog views (column id, name, type name, max length,
# precision, nullability), ordered by column id.
evictions_table_target_names = """
select 
    col.column_id as id,
    col.name,
    t.name as data_type,
    col.max_length,
    col.precision,
    col.is_nullable
from sys.tables as tab
    inner join sys.columns as col
        on tab.object_id = col.object_id
    left join sys.types as t
    on col.user_type_id = t.user_type_id
where tab.name = 'Rel_Eviction_Notices'
order by tab.name, column_id;
"""

# Run the catalog query over the open connection and display the result.
evictions_table_fields = pd.read_sql_query(sql=evictions_table_target_names, con=connection)
evictions_table_fields

Unnamed: 0,id,name,data_type,max_length,precision,is_nullable
0,1,eviction_auto_id,uniqueidentifier,16,0,False
1,2,eviction_id,varchar,-1,0,False
2,3,file_date,date,3,10,False
3,4,non_payment,bit,1,1,False
4,5,breach,bit,1,1,False
5,6,nuisance,bit,1,1,False
6,7,illegal_use,bit,1,1,False
7,8,failure_to_sign_renewal,bit,1,1,False
8,9,access_denial,bit,1,1,False
9,10,unapproved_subtenant,bit,1,1,False


In [8]:
# View the column names of the raw `evictions` frame.
evictions.columns

Index(['id', 'eviction_id', 'address', 'city', 'state', 'zip', 'file_date',
       'non_payment', 'breach', 'nuisance', 'illegal_use',
       'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant',
       'owner_move_in', 'demolition', 'capital_improvement',
       'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion',
       'roommate_same_unit', 'other_cause', 'late_payments',
       'lead_remediation', 'development', 'good_samaritan_ends',
       'supervisor_district', 'neighborhood', 'shape', 'constraints_date',
       'client_location'],
      dtype='object')

## Pull Required Columns

In [43]:
# Columns carried forward from the raw eviction notices.
evictions_columns = ['eviction_id', 'file_date','non_payment', 'breach', 'nuisance', 'illegal_use',
       'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant',
       'owner_move_in', 'demolition', 'capital_improvement',
       'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion',
       'roommate_same_unit', 'other_cause', 'late_payments',
       'lead_remediation', 'development', 'good_samaritan_ends', 'constraints_date']

# Keep one row per distinct combination of the required columns, ordered by
# those columns and re-indexed from 0.
# drop_duplicates() is used rather than groupby(...).count() because groupby
# discards, by default, every row with a null in any key column, which would
# silently lose notices.
evictions_grouped = (
    evictions[evictions_columns]
    .drop_duplicates()
    .sort_values(evictions_columns)
    .reset_index(drop=True)
)
evictions_grouped

Unnamed: 0,eviction_id,file_date,non_payment,breach,nuisance,illegal_use,failure_to_sign_renewal,access_denial,unapproved_subtenant,owner_move_in,...,substantial_rehab,ellis_act_withdrawal,condo_conversion,roommate_same_unit,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,constraints_date
0,AL2K0014,2000-02-11T00:00:00.000,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,
1,E2K2588,2000-09-19T00:00:00.000,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,
2,E980002,1998-01-02T00:00:00.000,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
3,E980003,1998-01-02T00:00:00.000,False,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
4,E980004,1998-01-02T00:00:00.000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43387,S001123,1997-12-31T00:00:00.000,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,
43388,S001124,1997-12-31T00:00:00.000,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,
43389,S001125,1997-12-31T00:00:00.000,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,
43390,T2K2589,2000-09-19T00:00:00.000,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,


## Check for Nulls or Anomalies

In [11]:
# Summarise per-column non-null counts and dtypes.
# It looks like nothing is registering as null.
# NOTE(review): the columns listed are dtype `object`, so placeholder strings
# (e.g. a literal 'nan') would still be counted as non-null here.
evictions_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43392 entries, 0 to 43391
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   eviction_id              43392 non-null  object
 1   file_date                43392 non-null  object
 2   non_payment              43392 non-null  object
 3   breach                   43392 non-null  object
 4   nuisance                 43392 non-null  object
 5   illegal_use              43392 non-null  object
 6   failure_to_sign_renewal  43392 non-null  object
 7   access_denial            43392 non-null  object
 8   unapproved_subtenant     43392 non-null  object
 9   owner_move_in            43392 non-null  object
 10  demolition               43392 non-null  object
 11  capital_improvement      43392 non-null  object
 12  substantial_rehab        43392 non-null  object
 13  ellis_act_withdrawal     43392 non-null  object
 14  condo_conversion         43392 non-nul

### Eviction Ids

In [34]:
# Identify unusual eviction ids by value length.
# 3 eviction ids appear to have a length other than 7 chars.
# The vectorised .str.len() yields NaN for a missing id instead of raising
# TypeError as len() would, and dropna=False makes any such NaN visible.
evictions_grouped["eviction_id"].str.len().value_counts(dropna=False)

7     43389
8         2
24        1
Name: eviction_id, dtype: int64

In [44]:
# Show the rows whose eviction id is longer than 7 characters.
# These 3 ids look unusual, but the rest of each record appears valid.
# Since a substitute UUID will identify the evictions anyway, we've decided
# not to drop these rows.
evictions_grouped.loc[evictions_grouped["eviction_id"].map(len).gt(7)]

Unnamed: 0,eviction_id,file_date,non_payment,breach,nuisance,illegal_use,failure_to_sign_renewal,access_denial,unapproved_subtenant,owner_move_in,...,substantial_rehab,ellis_act_withdrawal,condo_conversion,roommate_same_unit,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,constraints_date
0,AL2K0014,2000-02-11T00:00:00.000,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,
40902,R005-26E,1997-01-21T00:00:00.000,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,
43391,on at lease one occasion,2000-02-14T00:00:00.000,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,


In [46]:
# Look up the raw record(s) carrying the free-text eviction id.
evictions.loc[evictions["eviction_id"].eq("on at lease one occasion")]

Unnamed: 0,id,eviction_id,address,city,state,zip,file_date,non_payment,breach,nuisance,...,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,supervisor_district,neighborhood,shape,constraints_date,client_location
36329,36302,on at lease one occasion,1500 Block Of 19th Avenue,San Francisco,CA,94122,2000-02-14T00:00:00.000,False,False,False,...,False,False,False,False,False,7,Inner Sunset,"{'type': 'Point', 'coordinates': [-122.47682, ...",,"{'latitude': '37.758898682240776', 'longitude'..."


In [47]:
# Look up the raw record(s) for the 8-character id "AL2K0014".
evictions.loc[evictions["eviction_id"].eq("AL2K0014")]

Unnamed: 0,id,eviction_id,address,city,state,zip,file_date,non_payment,breach,nuisance,...,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,supervisor_district,neighborhood,shape,constraints_date,client_location
41477,41466,AL2K0014,1600 Block Of Howard Street,San Francisco,CA,94103,2000-02-11T00:00:00.000,False,True,False,...,False,True,False,False,False,6,Mission,"{'type': 'Point', 'coordinates': [-122.417274,...",,"{'latitude': '37.771072990654986', 'longitude'..."


In [48]:
# Look up the raw record(s) for the 8-character id "R005-26E".
evictions.loc[evictions["eviction_id"].eq("R005-26E")]

Unnamed: 0,id,eviction_id,address,city,state,zip,file_date,non_payment,breach,nuisance,...,other_cause,late_payments,lead_remediation,development,good_samaritan_ends,supervisor_district,neighborhood,shape,constraints_date,client_location
36352,36327,R005-26E,3900 Block Of Sacramento Street,San Francisco,CA,,1997-01-21T00:00:00.000,False,False,False,...,False,False,False,False,False,2,Presidio Heights,"{'type': 'Point', 'coordinates': [-122.45807, ...",,"{'latitude': '37.78677576518167', 'longitude':..."


### File Dates

In [49]:
# Identify unusual file dates by value length.
# No apparent anomalies.
# The vectorised .str.len() yields NaN for a missing date instead of raising
# TypeError as len() would, and dropna=False makes any such NaN visible.
evictions_grouped["file_date"].str.len().value_counts(dropna=False)

23    43392
Name: file_date, dtype: int64

### Binary Eviction Notice Values

In [31]:
# Inspect the True/False breakdown of each binary eviction-cause column.
# dropna=False lists missing values as their own row, so the printed counts
# themselves confirm whether any NaNs are present.
# The columns are chosen by name (everything except the id and the two date
# columns) rather than by position, so the check still targets the right
# columns if the column order changes.
non_flag_columns = ['eviction_id', 'file_date', 'constraints_date']
for flag_column in evictions_grouped.columns.drop(non_flag_columns):
    print(evictions_grouped[flag_column].value_counts(dropna=False))

False    40667
True      2725
Name: non_payment, dtype: int64
False    33667
True      9725
Name: breach, dtype: int64
False    35542
True      7850
Name: nuisance, dtype: int64
False    42476
True       916
Name: illegal_use, dtype: int64
False    43292
True       100
Name: failure_to_sign_renewal, dtype: int64
False    43063
True       329
Name: access_denial, dtype: int64
False    42584
True       808
Name: unapproved_subtenant, dtype: int64
False    33090
True     10302
Name: owner_move_in, dtype: int64
False    42215
True      1177
Name: demolition, dtype: int64
False    41688
True      1704
Name: capital_improvement, dtype: int64
False    43310
True        82
Name: substantial_rehab, dtype: int64
False    39262
True      4130
Name: ellis_act_withdrawal, dtype: int64
False    43263
True       129
Name: condo_conversion, dtype: int64
False    41998
True      1394
Name: roommate_same_unit, dtype: int64
False    41979
True      1413
Name: other_cause, dtype: int64
False    41729
True

### Constraint Dates

In [51]:
# Count the occurrences of each constraints_date value.
# 39211 rows hold the value 'nan'; value_counts() omits real missing values by
# default, so these show up because they are stored as strings.
evictions_grouped.constraints_date.value_counts()

nan                        39211
2021-06-20T00:00:00.000        9
2005-08-01T00:00:00.000        9
2022-04-29T00:00:00.000        7
2005-11-15T00:00:00.000        7
                           ...  
2021-01-11T00:00:00.000        1
2024-07-26T00:00:00.000        1
2020-12-20T00:00:00.000        1
2022-12-28T00:00:00.000        1
2024-05-17T00:00:00.000        1
Name: constraints_date, Length: 2646, dtype: int64

In [81]:
# Replace the literal string 'nan' with a real missing value so pandas treats
# those entries as null.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
evictions_grouped["constraints_date"] = evictions_grouped["constraints_date"].replace('nan', np.nan)

In [82]:
# Check: the null count should equal the 39211 'nan' strings counted before
# the replacement.
evictions_grouped.constraints_date.isna().sum()

39211

In [93]:
# TODO: convert every column to its correct data type
# For date columns, once converted, check for outlier dates
# Check that the dates make sense -- e.g. were lots of notices submitted on the same date?