In [1]:
import numpy as np
import pandas as pd
import random

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [3]:
f = 'data/bldg_insp/insp_viol.csv'
# number of observations
n_lines = sum(1 for line in open(f))
n_lines

380521

In [8]:
# random sample
pct = 1
size = int(n_lines * pct/100)
# row indicies to skip
skip_idx = random.sample(range(1, n_lines), n_lines - size)
df = pd.read_csv(f, index_col=0, skiprows=skip_idx)
df.head()

Unnamed: 0_level_0,Item Sequence Number,Date Filed,Block,Lot,Street Number,Street Name,Street Suffix,Unit,Status,Receiving Division,...,Supervisor District,Zipcode,Location,Supervisor Districts,Fire Prevention Districts,Current Police Districts,Neighborhoods - Analysis Boundaries 2,Zip Codes,Central Market/Tenderloin Boundary,Central Market/Tenderloin Boundary Polygon - Updated
Complaint Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201970711,1261264,2019-07-17T00:00:00.000,2088,022,3927,Ortega,St,,active,Housing Inspection Services,...,4.0,94116.0,"{'longitude': '-122.50575188530959', 'human_ad...",3.0,1.0,5.0,35.0,29491.0,,
201970521,1260464,2019-07-16T00:00:00.000,2826,085,755,Burnett,Av,0.0,active,Housing Inspection Services,...,8.0,94131.0,"{'longitude': '-122.44536906706848', 'human_ad...",5.0,2.0,8.0,38.0,63.0,,
201970231,1260266,2019-07-16T00:00:00.000,1483,002G,518,Point Lobos,Av,,active,Housing Inspection Services,...,1.0,94121.0,"{'longitude': '-122.50888348428296', 'human_ad...",2.0,11.0,9.0,29.0,55.0,,
201970201,1260167,2019-07-16T00:00:00.000,130,022,790,Vallejo,St,,active,Housing Inspection Services,...,3.0,94133.0,"{'longitude': '-122.41029495785159', 'human_ad...",10.0,3.0,1.0,6.0,308.0,,
201970161,1260108,2019-07-15T00:00:00.000,1362,014,4005,California,St,0.0,active,Housing Inspection Services,...,1.0,94118.0,"{'longitude': '-122.46055839123706', 'human_ad...",2.0,11.0,9.0,11.0,54.0,,


In [12]:
# documents for analysis
df_comments = df['NOV Item Description']
df_comments.head()

Complaint Number
201970711                                          3927 ortega
201970521    repair or replace the sticking, deteriorating ...
201970231    it is the property owner's responsibility to b...
201970201    a responsible person shall reside upon the pre...
201970161    for all building permits:when all work is comp...
Name: NOV Item Description, dtype: object

In [20]:
# drop nan's
df_comments.dropna(inplace=True)

# Vectorize Text

In [22]:
# create instance of vectorizer
tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=0.05)
# input text from documents
tf_vectorizer.fit(df_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [24]:
# document-term matrix
X = tf_vectorizer.transform(df_comments)

In [26]:
# document-term dataframe
tf_df = pd.DataFrame(X.toarray(), columns=[tf_vectorizer.get_feature_names()])
tf_df.head()

Unnamed: 0,1954,303,access,accessed,apartment,area,areas,attend,bathroom,building,...,scheduled,section,secure,specified,street,tenants,time,unit,units,violation
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.113195,0.113195,0.104487,0.223847,0.105029,0.0,0.168701,0.215776,0.0,0.0,...,0.205488,0.2142,0.10942,0.219471,0.0,0.110054,0.091809,0.0,0.106426,0.098616
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423561,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090276


# Topic Modeling

In [30]:
# create instance of topic modeler
nmf = NMF(n_components=10)
# fit model to dataset
nmf.fit(tf_df)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [31]:
# matrix of documents x topic weights
nmf.transform(tf_df)

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.18550272e-01, 1.42736829e-01, ...,
        0.00000000e+00, 1.58273702e-01, 0.00000000e+00],
       [1.83992762e-01, 0.00000000e+00, 1.50038639e-04, ...,
        4.49097808e-02, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 5.95921672e-03, 0.00000000e+00, ...,
        0.00000000e+00, 2.62890066e-01, 0.00000000e+00],
       [0.00000000e+00, 1.05795723e-02, 0.00000000e+00, ...,
        0.00000000e+00, 4.97553007e-02, 2.04283220e-01],
       [0.00000000e+00, 8.05100067e-02, 1.15606776e-05, ...,
        3.60486863e-03, 2.04544314e-02, 4.26380819e-02]])

In [32]:
# function to print top words of topic model
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

In [33]:
# topics and their top words
print_top_words(nmf, tf_vectorizer.get_feature_names(), 25)


Topic #0:reinspection notice inspector inspection owner property scheduled code section responsibility attend accessed specified time areas violation housing san francisco representative present cited date entry purpose

Topic #1:repair replace damaged kitchen bathroom ceiling floor rear entry time exterior rooms apartment note inspection section secure permit areas required owner 303 dwellings door direct

Topic #2:door entry rear street apartment note access floor replace exterior required guest providing secure tenants rooms direct different common date dwellings damaged violation ceiling code

Topic #3:common areas street inspection property building time access san francisco damaged housing units violation code exterior california door guest 303 accessed floor entry dwellings apartment

Topic #4:building required permit exterior inspection inspector note housing rear time floor reinspection replace date violation francisco present san code access direct 303 entry accessed apartme