In [11]:
################################################### top2vec ######################################################

In [12]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
import umap.umap_ as umap
import umap.plot

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def preprocess(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()


In [14]:

def explore_nopd_topics():
    stop = stopwords.words("english")
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv").astype(str)
    df["allegation_desc"] = df["allegation_desc"].apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )

    df.loc[:, "topics"] = " ".join(x for x in df["allegation_desc"].astype(str))
    df = df["topics"][0]
    df = df.split()

    counts = Counter(df)
    most_occur = counts.most_common(50)
    print(most_occur)
    return df


In [15]:
explore_nopd_topics()

[('complainant', 3874), ('officer', 3259), ('stated', 1746), ('accused', 1691), ('officers', 809), ('failed', 641), ('police', 570), ('nan', 547), ('vehicle', 476), ('report', 434), ('alleged', 345), ('‘the', 324), ('involved', 288), ('told', 256), ('incident', 255), ('complaint', 253), ('unprofessional', 249), ('supervisor', 245), ('called', 238), ('also', 224), ('arrested', 221), ('failing', 216), ('allegation:', 216), ('take', 210), ('said', 202), ('call', 199), ('neglect', 197), ('duty', 189), ('unknown', 188), ('subject', 180), ('nopd', 176), ('rude', 173), ('action', 170), ('traffic', 159), ('officer.', 158), ('her.', 147), ('domestic', 147), ('report.', 144), ('investigation', 141), ('incident.', 140), ('would', 135), ('district', 134), ('accident', 132), ("'*", 132), ('issued', 130), ('arrest', 129), ('another', 126), ('alleges', 121), ('vehicle.', 119), ('one', 119)]


['complaint',
 'via',
 'webmail.',
 'police',
 'called',
 'remove',
 'complainant',
 'home.',
 'complainant',
 'alleged',
 'told',
 'officers',
 'items',
 'vandalized',
 'thrown',
 'outside.',
 'complainant',
 'told',
 'leave',
 'home.',
 'complainant',
 'alleged',
 'name',
 'also',
 'lease',
 'responding',
 'officers',
 'request',
 'identification',
 'involved',
 'parties.',
 'two',
 'attempts',
 'made',
 'contact',
 'complainant',
 'via',
 'phone',
 'conduct',
 'audiotaped',
 'interview;',
 'determine',
 'specific',
 'allegations.',
 'complainant',
 'stated',
 'involved',
 'accident.',
 'prior',
 'officers',
 'arriving',
 'scene',
 'unknown',
 'officer',
 'arrived',
 'informed',
 'move',
 'vehicles',
 'traffic.',
 'complainant',
 'feels',
 'officer',
 'let',
 'explain',
 'happened',
 'investigate',
 'accident',
 'properly.',
 'complainant',
 'issued',
 'citation',
 'must',
 'adjudicate',
 'court.',
 'officer',
 'may',
 'contacted',
 'party',
 'complaint',
 'turn',
 'called',
 'threat

In [16]:
# bigrams = [word for word in model.vocab if len(word.split()) == 2]
# print(bigrams[:20])

In [17]:
def preprocess_nopd(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complain?a?n?ts?\.? ?| ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)
    return df

In [18]:

def create_model(df):
    docs = [x for x in df["allegation_desc"]]
    # unique = [x for x in df["allegation_topic_uid"]]

    model = Top2Vec(
        docs,
        ngram_vocab=True,
        speed="deep-learn",
        use_embedding_model_tokenizer=True,
        min_count=5,
        # document_ids=unique
    )
    return model

In [19]:


def model():
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")\
    .pipe(preprocess)\
    .pipe(preprocess_nopd)\
    .pipe(create_model)
    return df

In [20]:
model = model()

2022-10-21 16:33:48,802 - top2vec - INFO - Pre-processing documents for training
2022-10-21 16:33:48,967 - top2vec - INFO - Creating joint document/word embedding
2022-10-21 16:34:55,518 - top2vec - INFO - Creating lower dimension embedding of documents
2022-10-21 16:35:17,521 - top2vec - INFO - Finding dense areas of documents
2022-10-21 16:35:17,861 - top2vec - INFO - Finding topics


In [21]:
if len(model.get_topic_sizes()) > 1:
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, scores, num in zip(topic_words, word_scores, topic_nums):
        print(num)
        print(f"Words: {words}")

0
Words: ['body armor' 'participated unauthorized' 'accusesneglect'
 'allegesneglect' 'approve' 'used vulgar' 'side road' 'full'
 'decommissioned' 'facility' 'sexual assault' 'the ics' 'escaped' 'dwi'
 'discharged' 'allowing' 'vulgar' 'favoritism' 'unprofessional comments'
 'subpoenaed' 'overtime' 'recording' 'redirection' 'prisoner' 'correctly'
 'secure' 'acknowledge' 'multiple fatalities' 'pull' 'exonerated'
 'absence' 'llegation' 'accusesneglect duty' 'channel' 'custody' 'records'
 'engaged' 'appear' 'parking lot' 'fatalities' 'ensure' 'advising'
 'take necessaryaction' 'thoroughly' 'activate in' 'prematurely' 'mvu'
 'van' 'deactivated' 'left scene']
1
Words: ['send' 'paid detail' 'homeless' 'failed submit' 'carrying' 'strip search'
 'issued summonses' 'seriously' 'death' 'crossed' 'timely manner'
 'miscellaneous incident' 'profane' 'sleep' 'write report'
 'enforcement action' 'people' 'states' 'fast' 'task' 'contacting'
 'deceased' 'entering' 'unattended' 'party involved' 'suppleme

In [22]:
topic_sizes, top_nums = model.get_topic_sizes()
print(topic_sizes)
print(top_nums)

[152 151 148 128 123 119 111 108 105 104 101 101 100  96  96  96  95  95
  95  91  91  90  90  86  83  82  76  75  75  74  71  70  68  67  65  64
  64  64  62  62  62  62  60  59  59  58  49  41  36]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48]


In [23]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=2, num_docs=10)

for doc, score, doc_id in list(zip(documents, document_scores, document_ids)):
    print(f"Document: {doc_id}, Score: {score}")
    print("--------------------")
    print(doc)
    print("--------------------")

Document: 2612, Score: 0.6505428552627563
--------------------
involved vehicle crash, driver’s license suspended,
--------------------
Document: 913, Score: 0.6244364380836487
--------------------
heading west bound highway subject driving white chevy impala dark tinted window drove past high rate speed cut almost causing crash,said followed vehicle back street continued travel high rate speed. ‘thesaid unknown driver got behind vehicle activated blue light.pulled exited vehicle approach vehicle driver pulled away heading east bound. ‘thesaid unable give description driver due tinted windows.said license plate jaw enforcement license plate believed driverofficer.
--------------------
Document: 2406, Score: 0.6097034811973572
--------------------
stoppedunknown officer, issued ticket valid license plate rear window car valid negotiated settlement
--------------------
Document: 124, Score: 0.601301908493042
--------------------
stopped traffic violationlicense plate.allegedextremely rud

In [24]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=5, num_docs=10)

ents = {(doc, score) for doc, score in list(zip(documents, document_scores))}
df = pd.DataFrame(ents, columns=["doc", "score"])
print(df)
# df.loc[:, "topic"] = "30"
# df.to_csv("../data/raw/new_orleans_pd/topics/topic_30.csv", index=False)

                                                 doc     score
0      unprofessional, rude, made demeaning remarks.  0.694617
1                            unprofessional remarks.  0.756773
2                        acted unprofes renew lease.  0.666379
3                          unjustified gun pointing.  0.696644
4  followinghome church.allegedacting impartially...  0.666155
5       thinkskeeping property can. ‘rofessionalism;  0.704724
6            statessexually assaulted searching him,  0.692430
7              allegedly made unprofessional remarks  0.700214
8            allegedly unprofessional lied incident.  0.668537
9                            unprofessional remarks.  0.757313


In [25]:
# model.save("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

In [26]:
# model.generate_topic_wordcloud(0)

In [27]:
model.topic_words

array([['body armor', 'participated unauthorized', 'accusesneglect', ...,
        'van', 'deactivated', 'left scene'],
       ['send', 'paid detail', 'homeless', ..., 'laughing', 'uustained',
        'fit'],
       ['license', 'almost', 'driver license', ..., 'stopped',
        'followed', 'behind'],
       ...,
       ['excessive force', 'collect evidence', 'engaged unauthorized',
        ..., 'activate body', 'neglect duty', 'wanted know'],
       ['pertinent information', 'include', 'crime lab', ..., 'complete',
        'document', 'allegations neglect'],
       ['public duty', 'accusesneglect', 'allegesneglect', ..., 'public',
        'activate in', 'unauthorized pursuit']], dtype='<U27')

In [28]:
model.topic_words[0]

array(['body armor', 'participated unauthorized', 'accusesneglect',
       'allegesneglect', 'approve', 'used vulgar', 'side road', 'full',
       'decommissioned', 'facility', 'sexual assault', 'the ics',
       'escaped', 'dwi', 'discharged', 'allowing', 'vulgar', 'favoritism',
       'unprofessional comments', 'subpoenaed', 'overtime', 'recording',
       'redirection', 'prisoner', 'correctly', 'secure', 'acknowledge',
       'multiple fatalities', 'pull', 'exonerated', 'absence',
       'llegation', 'accusesneglect duty', 'channel', 'custody',
       'records', 'engaged', 'appear', 'parking lot', 'fatalities',
       'ensure', 'advising', 'take necessaryaction', 'thoroughly',
       'activate in', 'prematurely', 'mvu', 'van', 'deactivated',
       'left scene'], dtype='<U27')

In [29]:
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, nums in zip(topic_words, word_scores, topic_nums):
  print("Topic Number: ",nums)
  print(f"Words: {words}")
  print("\n")

Topic Number:  0
Words: ['body armor' 'participated unauthorized' 'accusesneglect'
 'allegesneglect' 'approve' 'used vulgar' 'side road' 'full'
 'decommissioned' 'facility' 'sexual assault' 'the ics' 'escaped' 'dwi'
 'discharged' 'allowing' 'vulgar' 'favoritism' 'unprofessional comments'
 'subpoenaed' 'overtime' 'recording' 'redirection' 'prisoner' 'correctly'
 'secure' 'acknowledge' 'multiple fatalities' 'pull' 'exonerated'
 'absence' 'llegation' 'accusesneglect duty' 'channel' 'custody' 'records'
 'engaged' 'appear' 'parking lot' 'fatalities' 'ensure' 'advising'
 'take necessaryaction' 'thoroughly' 'activate in' 'prematurely' 'mvu'
 'van' 'deactivated' 'left scene']


Topic Number:  1
Words: ['send' 'paid detail' 'homeless' 'failed submit' 'carrying' 'strip search'
 'issued summonses' 'seriously' 'death' 'crossed' 'timely manner'
 'miscellaneous incident' 'profane' 'sleep' 'write report'
 'enforcement action' 'people' 'states' 'fast' 'task' 'contacting'
 'deceased' 'entering' 'unatte

In [30]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["stolen"], num_topics=5)
for word, w_score, topic, t_score in list(zip(topic_words, word_scores, topic_scores, topic_nums)):
    print(f"Word: \n{word}")
    print("--------------------")
    print(f"Word Score \n{w_score}")
    print("--------------------")
    print(f"Topic Score: \n{topic}")
    print("--------------------")
    print (f"Topic # \n{t_score}")
    print("--------END---------")

Word: 
['french quarter' 'obtain copy' 'french' 'quarter' 'copy' 'supplemental'
 'arrived scene' 'supposed' 'gender' 'supplemental report'
 'sheriff deputy' 'parade route' 'obtain' 'automobile' 'class' 'drawn'
 'drop' 'unfounded' 'schedule' 'unreasonable' 'relocate' 'mandatory' 'no'
 'able' 'amount' 'traffic accident' 'paid detail' 'unable' 'id' 'accurate'
 'blocked' 'obtaining' 'light' 'new orleans' 'interaction' 'apartment'
 'listed' 'record' 'involved automobile' 'automobile accident' 'accident'
 'contacted' 'information' 'requested' 'know' 'last' 'forward' 'detail'
 'complaining' 'apartment complex']
--------------------
Word Score 
[0.63753027 0.6160442  0.44401962 0.43453908 0.38827482 0.3661675
 0.36597002 0.36252967 0.3619335  0.35712507 0.35071993 0.3456299
 0.314689   0.30113435 0.29525298 0.29140863 0.2894134  0.28477675
 0.28441012 0.2829387  0.28199947 0.2794389  0.27512854 0.27503827
 0.2729705  0.27011693 0.26884094 0.26263645 0.25954545 0.25878277
 0.25621802 0.2541481 

In [31]:
# model = Top2Vec.load("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

# umap_args = {
#     "n_neighbors": 15,
#     "n_components": 2, # 5 -> 2 for plotting 
#     "metric": "cosine",
# }
# umap_model = umap.UMAP(**umap_args).fit(model.topic_vectors)
# umap.plot.points(umap_model, labels=model.doc_top_reduced)

In [32]:
############################################################ bert ##############################################################################

In [33]:
from bertopic import BERTopic
import json

  def resize(self, image, size, resample=PIL.Image.BILINEAR, default_to_square=True, max_size=None):
  def rotate(self, image, angle, resample=PIL.Image.NEAREST, expand=0, center=None, translate=None, fillcolor=None):
  from scipy.sparse.csr import csr_matrix


In [34]:
cprr = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")
cprr = cprr.pipe(preprocess).pipe(preprocess_nopd)


In [35]:
cprr.shape

(4180, 9)

In [36]:
docs = cprr.allegation_desc

In [37]:
docs = json.loads(docs.to_json(orient='records'))

In [38]:
docs[0]

'via webmail.called removehome.alleged tolditems vandalized thrown outside.told leave home.alleged name also lease respondingrequest identification involved parties. two attempts made contactvia phone conduct audiotaped interview; determine specific allegations.'

In [39]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [40]:
topic, probs = topic_model.fit_transform(docs)

In [41]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1232,-1_vehicle_report_subject_her
1,0,557,0_struckstomach_pattern_uninterested_er
2,1,105,1_call_service_911_called
3,2,95,2_document_timely_report_failed
4,3,90,3_arrest_wallet_arrested_money
...,...,...,...
78,77,11,77_witness_interview_testify_20
79,78,11,78_accident_scene_investigation_investigate
80,79,11,79_theft_military_charged_stolennothing
81,80,11,80_training_class_decommissioned_complete


In [42]:
topic_model.get_topic(3)

[('arrest', 0.07177580282099073),
 ('wallet', 0.07081045230152935),
 ('arrested', 0.05015229586135581),
 ('money', 0.049518513421636004),
 ('arresting', 0.03913201580614666),
 ('falsely', 0.03545427591318902),
 ('warrant', 0.029789283412522512),
 ('false', 0.029467580327089144),
 ('purse', 0.0260496673233643),
 ('stole', 0.024956291307797487)]

In [43]:
topic_model.get_representative_docs(3)

['filing false/sworn affidavit led arrest',
 'wallet picked male fought wallet.called 911 report allegesnever showed up,',
 'arrested without warrantunprofessional,']

In [44]:
topic_model.visualize_topics()

In [45]:
topic_model.visualize_barchart()

In [46]:
df_bert = pd.DataFrame({"topic": topic, "documents": docs})

In [47]:
df_bert

Unnamed: 0,topic,documents
0,-1,via webmail.called removehome.alleged tolditem...
1,-1,involved accident. priorarriving scene unknown...
2,-1,may contacted partyturn called threatened her....
3,-1,involved auto accident.respondingrefused inves...
4,-1,drove entrance ramp hospital’s emergency room ...
...,...,...
4175,-1,struckclosed fistholding's weapon arrest situa...
4176,0,
4177,0,
4178,-1,ignored orders two rankingcrime scene.


In [48]:
############################################################ gensim ##############################################################################

In [49]:
import numpy as np
import json
import glob
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pandas as pd

import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models

In [50]:
nopd = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")

In [51]:
def preprocess_gensim(df):
    df = df.astype(str).fillna("").dropna()
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()

In [52]:
nopd = nopd.pipe(preprocess_gensim)

In [53]:
docs = nopd.allegation_desc

In [54]:
def lemmatization(descs, allowed_pos_tags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    final_text = []
    for desc in descs:
        doc = nlp(desc)
        new_text = " ".join([token.lemma_ for token in doc if token.pos_ in allowed_pos_tags])
        final_text.append(new_text)
    return (final_text)

In [55]:
lemmatized_texts = lemmatization(docs)

In [56]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

In [57]:
data_words = gen_words(lemmatized_texts)

In [58]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

In [59]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [60]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis