In [39]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import normalize

In [2]:
violations = pd.read_csv('data/violations.csv')
permits = pd.read_csv('data/permits.csv')
crimes = pd.read_csv('data/crimes.csv')
complaints = pd.read_csv('data/complaints.csv')
trainlabels = pd.read_csv('data/train_labels.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
violations = violations.dropna(axis=0,subset=['building_id'])
crimes = crimes.dropna(axis=0,subset=['building_id'])
complaints = complaints.dropna(axis=0,subset=['building_id'])

# Adding building id to records

In [5]:
complaints.rename(columns={'lat':'LAT','lng':'LON'},inplace=True)

In [6]:
def distance(pt,c):
    pt = pt*np.pi/180
    c = c*np.pi/180
    lat,lng = c.T[0],c.T[1]
    deltay = lat - pt[0]
    deltax = (lng - pt[1])*np.cos((lat+pt[0])/2)
    R = 6371e3
    d = R*np.sqrt(np.power(deltax,2)+np.power(deltay,2))
    return d.min(), np.argmin(d)

def findbuilding(row,buildings):
    lat,long = row['LAT'],row['LON']
    dis, b_id = distance(np.array([lat,long]),np.array(buildings[['LAT','LONG']]))
    if(dis<=100):
        return buildings.loc[b_id,'building_id']
    else:
        return np.nan

In [7]:
crimes['building_id'] = crimes.apply(lambda row: findbuilding(row,trainlabels),axis=1)

In [9]:
crimes.to_csv('data/crimes.csv',index=None)

In [10]:
complaints['building_id'] = complaints.apply(lambda row: findbuilding(row,trainlabels),axis=1)

In [12]:
complaints.to_csv('data/complaints.csv',index=None)

# Features

## 1) No. of violations

In [4]:
violationcounts =  violations.groupby('building_id').size()

In [5]:
violationcounts = pd.DataFrame(violationcounts)

In [6]:
violationcounts = violationcounts.rename(columns = {0:"count"})

In [7]:
violationcounts.reset_index(inplace=True)

In [8]:
violationcounts = violationcounts.astype(int)

In [9]:
violationcounts.head()

Unnamed: 0,building_id,count
0,2,1
1,4,66
2,6,18
3,9,17
4,11,89


In [10]:
trainlabels.shape

(7846, 4)

In [11]:
trainset = trainlabels.merge(violationcounts,on='building_id',how='outer')

trainset.rename(columns={'count':'violation_count'},inplace=True)

In [12]:
trainset.shape

(7846, 5)

## Crimes count

In [13]:
crimecount =  crimes.groupby('building_id').size()

crimecount = pd.DataFrame(crimecount)

crimecount = crimecount.rename(columns = {0:"count"})

crimecount.reset_index(inplace=True)

crimecount = crimecount.astype(int)

crimecount.head()

Unnamed: 0,building_id,count
0,0,14
1,2,9
2,4,5
3,6,15
4,9,12


In [14]:
trainset = trainset.merge(crimecount,on='building_id',how='outer')

trainset.rename(columns={'count':'crime_count'},inplace=True)

## Complaint count

In [15]:
complaintcount =  complaints.groupby('building_id').size()

complaintcount = pd.DataFrame(complaintcount)

complaintcount = complaintcount.rename(columns = {0:"complaint_count"})

complaintcount.reset_index(inplace=True)

complaintcount = complaintcount.astype(int)

complaintcount.head()

Unnamed: 0,building_id,complaint_count
0,2,1
1,4,2
2,6,3
3,11,1
4,14,2


In [16]:
trainset = trainset.merge(complaintcount,on='building_id',how='outer')

In [17]:
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0
2,9,42.389837,-83.019746,blighted,17.0,12.0,
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0


## Closed complaints

In [18]:
status =  complaints.loc[complaints['ticket_status']=='Closed']

In [19]:
closed = status.loc[:,['building_id','ticket_status']].groupby('building_id').size()
closed = pd.DataFrame(closed)
closed.reset_index(inplace=True)
closed.head()

Unnamed: 0,building_id,0
0,4.0,1
1,6.0,1
2,36.0,1
3,42.0,1
4,46.0,1


In [20]:
closed.rename(columns={0:'closed_complaints'},inplace=True)
closed.isna().sum()

building_id          0
closed_complaints    0
dtype: int64

In [21]:
trainset = trainset.merge(closed,on='building_id',how='outer')
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count,closed_complaints
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0,1.0
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0,1.0
2,9,42.389837,-83.019746,blighted,17.0,12.0,,
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0,
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0,


In [22]:
trainset['closed/total'] = trainset['closed_complaints']/trainset['complaint_count']

In [23]:
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count,closed_complaints,closed/total
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0,1.0,0.5
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0,1.0,0.333333
2,9,42.389837,-83.019746,blighted,17.0,12.0,,,
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0,,
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0,,


## Distinct Crimes

In [24]:
crimes_distinct = crimes.groupby('building_id')['CATEGORY'].nunique()
crimes_distinct = pd.DataFrame(crimes_distinct)
crimes_distinct.reset_index(inplace=True)
crimes_distinct['building_id'] = crimes_distinct['building_id'].astype(int)
crimes_distinct.head()

Unnamed: 0,building_id,CATEGORY
0,0,5
1,2,6
2,4,4
3,6,5
4,9,8


In [25]:
crimes_distinct.isna().sum()

building_id    0
CATEGORY       0
dtype: int64

In [26]:
crimes_distinct.rename(columns = {'CATEGORY':'unique_crimes'},inplace=True)
crimes_distinct.head()

Unnamed: 0,building_id,unique_crimes
0,0,5
1,2,6
2,4,4
3,6,5
4,9,8


In [27]:
trainset  = trainset.merge(crimes_distinct,on='building_id',how='outer')

In [28]:
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count,closed_complaints,closed/total,unique_crimes
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0,1.0,0.5,4.0
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0,1.0,0.333333,5.0
2,9,42.389837,-83.019746,blighted,17.0,12.0,,,,8.0
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0,,,3.0
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0,,,4.0


## Distinct Complaints

In [29]:
complaints_distinct = complaints.groupby('building_id')['issue_type'].nunique()
complaints_distinct = pd.DataFrame(complaints_distinct)
complaints_distinct.reset_index(inplace=True)
complaints_distinct['building_id'] = complaints_distinct['building_id'].astype(int)
complaints_distinct.head()

Unnamed: 0,building_id,issue_type
0,2,1
1,4,2
2,6,3
3,11,1
4,14,2


In [30]:
complaints_distinct.rename(columns = {'issue_type':'unique_complaints'},inplace=True)

In [31]:
complaints_distinct.isna().sum()

building_id          0
unique_complaints    0
dtype: int64

In [32]:
trainset  = trainset.merge(complaints_distinct,on='building_id',how='outer')

In [33]:
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count,closed_complaints,closed/total,unique_crimes,unique_complaints
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0,1.0,0.5,4.0,2.0
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0,1.0,0.333333,5.0,3.0
2,9,42.389837,-83.019746,blighted,17.0,12.0,,,,8.0,
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0,,,3.0,1.0
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0,,,4.0,2.0


In [34]:
trainset.shape

(7846, 11)

In [35]:
trainset.isna().sum()

building_id             0
LAT                     0
LONG                    0
label                   0
violation_count       671
crime_count           331
complaint_count      2728
closed_complaints    5153
closed/total         5153
unique_crimes         331
unique_complaints    2728
dtype: int64

# Preprocessing

In [36]:
trainset.fillna(value={'crime_count':0,'violation_count':0,'complaint_count':0,'unique_crimes':0,'unique_complaints':0},inplace=True)

In [37]:
trainset.drop(['closed/total','closed_complaints'],axis=1,inplace=True)

In [38]:
trainset.head()

Unnamed: 0,building_id,LAT,LONG,label,violation_count,crime_count,complaint_count,unique_crimes,unique_complaints
0,4,42.433953,-83.098835,blighted,66.0,5.0,2.0,4.0,2.0
1,6,42.335282,-83.072696,blighted,18.0,15.0,3.0,5.0,3.0
2,9,42.389837,-83.019746,blighted,17.0,12.0,0.0,8.0,0.0
3,11,42.329588,-83.156621,blighted,89.0,5.0,1.0,3.0,1.0
4,14,42.411072,-82.913786,blighted,12.0,5.0,2.0,4.0,2.0


In [41]:
y = trainset['label']
X = normalize(trainset.drop(['LAT','LONG','label','building_id'],axis=1))

In [43]:
model = LogisticRegression(solver='lbfgs')
results = cross_validate(model,X,y,cv=5)

In [44]:
results

{'fit_time': array([0.3595016 , 0.05892062, 0.06088495, 0.06099725, 0.08858466]),
 'score_time': array([0.00692081, 0.00618148, 0.0070262 , 0.01065588, 0.01170993]),
 'test_score': array([0.6044586 , 0.60127389, 0.6522293 , 0.66772959, 0.62244898])}