In [3]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
import umap.umap_ as umap
import umap.plot

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
def preprocess(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()


In [47]:

def explore_nopd_topics():
    stop = stopwords.words("english")
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv").astype(str)
    df["allegation_desc"] = df["allegation_desc"].apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )

    df.loc[:, "topics"] = " ".join(x for x in df["allegation_desc"].astype(str))
    df = df["topics"][0]
    df = df.split()

    counts = Counter(df)
    most_occur = counts.most_common(50)
    print(most_occur)
    return df


In [48]:
explore_nopd_topics()

[('complainant', 3874), ('officer', 3259), ('stated', 1746), ('accused', 1691), ('officers', 809), ('failed', 641), ('police', 570), ('nan', 547), ('vehicle', 476), ('report', 434), ('alleged', 345), ('‘the', 324), ('involved', 288), ('told', 256), ('incident', 255), ('complaint', 253), ('unprofessional', 249), ('supervisor', 245), ('called', 238), ('also', 224), ('arrested', 221), ('failing', 216), ('allegation:', 216), ('take', 210), ('said', 202), ('call', 199), ('neglect', 197), ('duty', 189), ('unknown', 188), ('subject', 180), ('nopd', 176), ('rude', 173), ('action', 170), ('traffic', 159), ('officer.', 158), ('her.', 147), ('domestic', 147), ('report.', 144), ('investigation', 141), ('incident.', 140), ('would', 135), ('district', 134), ('accident', 132), ("'*", 132), ('issued', 130), ('arrest', 129), ('another', 126), ('alleges', 121), ('vehicle.', 119), ('one', 119)]


['complaint',
 'via',
 'webmail.',
 'police',
 'called',
 'remove',
 'complainant',
 'home.',
 'complainant',
 'alleged',
 'told',
 'officers',
 'items',
 'vandalized',
 'thrown',
 'outside.',
 'complainant',
 'told',
 'leave',
 'home.',
 'complainant',
 'alleged',
 'name',
 'also',
 'lease',
 'responding',
 'officers',
 'request',
 'identification',
 'involved',
 'parties.',
 'two',
 'attempts',
 'made',
 'contact',
 'complainant',
 'via',
 'phone',
 'conduct',
 'audiotaped',
 'interview;',
 'determine',
 'specific',
 'allegations.',
 'complainant',
 'stated',
 'involved',
 'accident.',
 'prior',
 'officers',
 'arriving',
 'scene',
 'unknown',
 'officer',
 'arrived',
 'informed',
 'move',
 'vehicles',
 'traffic.',
 'complainant',
 'feels',
 'officer',
 'let',
 'explain',
 'happened',
 'investigate',
 'accident',
 'properly.',
 'complainant',
 'issued',
 'citation',
 'must',
 'adjudicate',
 'court.',
 'officer',
 'may',
 'contacted',
 'party',
 'complaint',
 'turn',
 'called',
 'threat

In [49]:
# bigrams = [word for word in model.vocab if len(word.split()) == 2]
# print(bigrams[:20])

In [50]:
def preprocess_nopd(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complainants? | ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)
    return df

In [51]:

def create_model(df):
    docs = [x for x in df["allegation_desc"]]
    # unique = [x for x in df["allegation_topic_uid"]]

    model = Top2Vec(
        docs,
        ngram_vocab=True,
        speed="deep-learn",
        use_embedding_model_tokenizer=True,
        min_count=5,
        # document_ids=unique
    )
    return model

In [52]:


def model():
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")\
    .pipe(preprocess)\
    .pipe(preprocess_nopd)\
    .pipe(create_model)
    return df

In [53]:
model = model()

2022-10-18 10:11:14,637 - top2vec - INFO - Pre-processing documents for training
2022-10-18 10:11:14,747 - top2vec - INFO - Creating joint document/word embedding
2022-10-18 10:12:13,916 - top2vec - INFO - Creating lower dimension embedding of documents
2022-10-18 10:12:18,772 - top2vec - INFO - Finding dense areas of documents
2022-10-18 10:12:18,996 - top2vec - INFO - Finding topics


In [54]:
if len(model.get_topic_sizes()) > 1:
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, scores, num in zip(topic_words, word_scores, topic_nums):
        print(num)
        print(f"Words: {words}")

0
Words: ['participated unauthorized' 'sexual assault' 'protective order'
 'another employee' 'multiple fatalities' 'photograph' 'would like'
 'correspondence' 'ics' 'mediation' 'take necessaryaction' 'deactivated'
 'prematurely' 'act' 'redirection' 'take action' 'send' 'the ics' 'let go'
 'failed take' 'fatalities' 'bwcs' 'participated' 'smelled' 'escape'
 'year old' 'secure' 'hung' 'used vulgar' 'feel like' 'van' 'condition'
 'concrete' 'necessaryaction' 'pushing' 'ran' 'resulted' 'moving'
 'favoritism' 'trouble' 'suicide' 'assisted' 'body armor' 'withdrawn'
 'unprofessional comments' 'operating' 'accusesneglect' 'equipment'
 'preferential' 'parking lot']
1
Words: ['highway' 'almost' 'speed' 'light' 'suv' 'sirens' 'plate' 'red' 'license'
 'mph' 'lights' 'dark' 'pulled' 'driving' 'siren' 'lane' 'thesaid'
 'driver' 'speeding' 'rolled' 'tint' 'blue' 'badge' 'license plate'
 'proceeded' 'cut' 'exited' 'drive' 'behind' 'window' 'tickets' 'pass'
 'windows' 'crossed' 'high' 'numbers' 'fall'

In [55]:
topic_sizes, top_nums = model.get_topic_sizes()
print(topic_sizes)
print(top_nums)

[200 166 151 148 144 141 138 137 136 125 117 112 111 110 108 105 104 100
  97  93  92  91  90  88  86  86  81  79  77  75  75  70  69  65  65  63
  60  60  60  56  54  51  44]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42]


In [68]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=2, num_docs=10)

for doc, score, doc_id in list(zip(documents, document_scores, document_ids)):
    print(f"Document: {doc_id}, Score: {score}")
    print("--------------------")
    print(doc)
    print("--------------------")

Document: 2653, Score: 0.8529097437858582
--------------------
rude bad attitude.
--------------------
Document: 2757, Score: 0.7442191243171692
--------------------
harasses friend.
--------------------
Document: 2239, Score: 0.6979073286056519
--------------------
discourteous subject arrest.
--------------------
Document: 3006, Score: 0.6970369815826416
--------------------
bad attitude speaking unprofessionally. ‘* 2018-0120-p: 02/17/2018; allegation: use forcemalepulling hair femaleunprofessional.
--------------------
Document: 2594, Score: 0.6820839047431946
--------------------
madefeel bad way spoke him.
--------------------
Document: 3813, Score: 0.6743942499160767
--------------------
address complait's concern urinated on.
--------------------
Document: 3711, Score: 0.6674847602844238
--------------------
complait's concern complait's life threatened. :address
--------------------
Document: 3712, Score: 0.6655276417732239
--------------------
complait's concern complait's li

In [57]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=5, num_docs=10)

ents = {(doc, score) for doc, score in list(zip(documents, document_scores))}
df = pd.DataFrame(ents, columns=["doc", "score"])
print(df)
# df.loc[:, "topic"] = "30"
# df.to_csv("../data/raw/new_orleans_pd/topics/topic_30.csv", index=False)

                                           doc     score
0  failed wear departmental issued body armor.  0.714013
1                  permi duty. : neglect duty;  0.750299
2                          failed report duty.  0.692937
3                altering payroll information.  0.766367
4  failed wear departmental issued body armor.  0.706135
5                      equipment stolennothing  0.779809
6  failed wear departmental issued body armor.  0.698544
7                                  tardy duty.  0.809272
8               failed report duty instructed.  0.785208
9                                 neglect duty  0.764560


In [65]:
# model.save("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

In [None]:
# model.generate_topic_wordcloud(0)

In [60]:
model.topic_words

array([['participated unauthorized', 'sexual assault',
        'protective order', ..., 'equipment', 'preferential',
        'parking lot'],
       ['highway', 'almost', 'speed', ..., 'car', 'driven', 'sign'],
       ['bad attitude', 'and run', 'transported hospital', ..., 'nephew',
        'allegesfailed', 'contacted'],
       ...,
       ['collect evidence', 'evidence property', 'allegation adherence',
        ..., 'authored', 'worn cameras', 'resulted crash'],
       ['party involved', 'obtain copy', 'paid detail', ...,
        'interstate', 'issue citation', 'departmental policy'],
       ['rude unprofessional', 'allegations neglect', 'failed complete',
        ..., 'search warrant', 'failed appear',
        'complaint accusesneglect']], dtype='<U27')

In [61]:
model.topic_words[0]

array(['participated unauthorized', 'sexual assault', 'protective order',
       'another employee', 'multiple fatalities', 'photograph',
       'would like', 'correspondence', 'ics', 'mediation',
       'take necessaryaction', 'deactivated', 'prematurely', 'act',
       'redirection', 'take action', 'send', 'the ics', 'let go',
       'failed take', 'fatalities', 'bwcs', 'participated', 'smelled',
       'escape', 'year old', 'secure', 'hung', 'used vulgar', 'feel like',
       'van', 'condition', 'concrete', 'necessaryaction', 'pushing',
       'ran', 'resulted', 'moving', 'favoritism', 'trouble', 'suicide',
       'assisted', 'body armor', 'withdrawn', 'unprofessional comments',
       'operating', 'accusesneglect', 'equipment', 'preferential',
       'parking lot'], dtype='<U27')

In [62]:
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, nums in zip(topic_words, word_scores, topic_nums):
  print("Topic Number: ",nums)
  print(f"Words: {words}")
  print("\n")

Topic Number:  0
Words: ['participated unauthorized' 'sexual assault' 'protective order'
 'another employee' 'multiple fatalities' 'photograph' 'would like'
 'correspondence' 'ics' 'mediation' 'take necessaryaction' 'deactivated'
 'prematurely' 'act' 'redirection' 'take action' 'send' 'the ics' 'let go'
 'failed take' 'fatalities' 'bwcs' 'participated' 'smelled' 'escape'
 'year old' 'secure' 'hung' 'used vulgar' 'feel like' 'van' 'condition'
 'concrete' 'necessaryaction' 'pushing' 'ran' 'resulted' 'moving'
 'favoritism' 'trouble' 'suicide' 'assisted' 'body armor' 'withdrawn'
 'unprofessional comments' 'operating' 'accusesneglect' 'equipment'
 'preferential' 'parking lot']


Topic Number:  1
Words: ['highway' 'almost' 'speed' 'light' 'suv' 'sirens' 'plate' 'red' 'license'
 'mph' 'lights' 'dark' 'pulled' 'driving' 'siren' 'lane' 'thesaid'
 'driver' 'speeding' 'rolled' 'tint' 'blue' 'badge' 'license plate'
 'proceeded' 'cut' 'exited' 'drive' 'behind' 'window' 'tickets' 'pass'
 'windows' '

In [63]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["stolen"], num_topics=5)
for word, w_score, topic, t_score in list(zip(topic_words, word_scores, topic_scores, topic_nums)):
    print(f"Word: \n{word}")
    print("--------------------")
    print(f"Word Score \n{w_score}")
    print("--------------------")
    print(f"Topic Score: \n{topic}")
    print("--------------------")
    print (f"Topic # \n{t_score}")
    print("--------END---------")

Word: 
['sobriety' 'supplemental report' 'jefferson parish'
 'investigation cancelled' 'carrying' 'unfounded action' 'pm'
 'even though' 'says' 'indicated' 'positive' 'test' 'fast' 'allegedfailed'
 'confiscated' 'preliminary investigation' 'within' 'substance'
 'strip search' 'field' 'obtain copy' 'parade route' 'period' 'six'
 'obtain permission' 'suicide' 'failed submit' 'tested' 'documented'
 'unfounded' 'additionally' 'onerated' 'units' 'ems' 'send' 'encounter'
 'forwarded' 'allegedtook' 'months' 'engaged' 'unrelated' 'hold'
 'allowing' 'nat' 'correspondence' 'advise' 'cocaine' 'initiated vehicle'
 'preliminary' 'smelled']
--------------------
Word Score 
[0.45317554 0.42242622 0.41589126 0.41281715 0.41048944 0.3918479
 0.38723737 0.36638924 0.3524646  0.34724623 0.3384915  0.33764958
 0.3351385  0.32740805 0.31566578 0.31265628 0.3113144  0.30956197
 0.3063449  0.30630034 0.3062438  0.29507065 0.29173648 0.29065275
 0.2846122  0.28356913 0.2819377  0.28132802 0.2791176  0.2758685

In [None]:
# model = Top2Vec.load("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

# umap_args = {
#     "n_neighbors": 15,
#     "n_components": 2, # 5 -> 2 for plotting 
#     "metric": "cosine",
# }
# umap_model = umap.UMAP(**umap_args).fit(model.topic_vectors)
# umap.plot.points(umap_model, labels=model.doc_top_reduced)

In [44]:
from bertopic import BERTopic
import json

In [84]:
cprr = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")
cprr = cprr[~((cprr.allegation_desc.fillna("") == ""))]


In [85]:
cprr.shape

(3633, 9)

In [86]:
docs = cprr.allegation_desc

In [87]:
docs = json.loads(docs.to_json(orient='records'))

In [88]:
docs[0]

'complaint was via webmail. police were called to remove complainant from his home. complainant alleged he told officers his items were vandalized and thrown outside. complainant was told to leave the home. complainant alleged his name was also on the lease and the responding officers did not request any identification from any of the involved parties. two attempts were made to contact the complainant via phone to conduct an audiotaped interview; to determine the specific allegations.'

In [89]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [90]:
topic, probs = topic_model.fit_transform(docs)

In [91]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1218,-1_the_her_to_complainant
1,0,135,0_nopd_the_he_and
2,1,111,1_bwc_activate_force_use
3,2,109,2_son_her_child_daughter
4,3,99,3_2018_allegation_neglect_of
...,...,...,...
69,68,12,68_social_media_posted_comments
70,69,12,69_did_reflect_not_accurately
71,70,12,70_harassing_him_sidewalk_performing
72,71,12,71_pursuit_unauthorized_failing_0142


In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_representative_docs(1)

In [98]:
topic_model.visualize_topics()

In [99]:
topic_model.visualize_barchart()

In [96]:
df_bert = pd.DataFrame({"topic": topic, "documents": docs})

In [None]:
df_bert