In [1]:
################################################### top2vec ######################################################

In [2]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
import umap.umap_ as umap
import umap.plot

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()


In [4]:

def explore_nopd_topics():
    stop = stopwords.words("english")
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv").astype(str)
    df["allegation_desc"] = df["allegation_desc"].apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )

    df.loc[:, "topics"] = " ".join(x for x in df["allegation_desc"].astype(str))
    df = df["topics"][0]
    df = df.split()

    counts = Counter(df)
    most_occur = counts.most_common(50)
    print(most_occur)
    return df


In [5]:
explore_nopd_topics()

[('complainant', 3874), ('officer', 3259), ('stated', 1746), ('accused', 1691), ('officers', 809), ('failed', 641), ('police', 570), ('nan', 547), ('vehicle', 476), ('report', 434), ('alleged', 345), ('‘the', 324), ('involved', 288), ('told', 256), ('incident', 255), ('complaint', 253), ('unprofessional', 249), ('supervisor', 245), ('called', 238), ('also', 224), ('arrested', 221), ('failing', 216), ('allegation:', 216), ('take', 210), ('said', 202), ('call', 199), ('neglect', 197), ('duty', 189), ('unknown', 188), ('subject', 180), ('nopd', 176), ('rude', 173), ('action', 170), ('traffic', 159), ('officer.', 158), ('her.', 147), ('domestic', 147), ('report.', 144), ('investigation', 141), ('incident.', 140), ('would', 135), ('district', 134), ('accident', 132), ("'*", 132), ('issued', 130), ('arrest', 129), ('another', 126), ('alleges', 121), ('vehicle.', 119), ('one', 119)]


['complaint',
 'via',
 'webmail.',
 'police',
 'called',
 'remove',
 'complainant',
 'home.',
 'complainant',
 'alleged',
 'told',
 'officers',
 'items',
 'vandalized',
 'thrown',
 'outside.',
 'complainant',
 'told',
 'leave',
 'home.',
 'complainant',
 'alleged',
 'name',
 'also',
 'lease',
 'responding',
 'officers',
 'request',
 'identification',
 'involved',
 'parties.',
 'two',
 'attempts',
 'made',
 'contact',
 'complainant',
 'via',
 'phone',
 'conduct',
 'audiotaped',
 'interview;',
 'determine',
 'specific',
 'allegations.',
 'complainant',
 'stated',
 'involved',
 'accident.',
 'prior',
 'officers',
 'arriving',
 'scene',
 'unknown',
 'officer',
 'arrived',
 'informed',
 'move',
 'vehicles',
 'traffic.',
 'complainant',
 'feels',
 'officer',
 'let',
 'explain',
 'happened',
 'investigate',
 'accident',
 'properly.',
 'complainant',
 'issued',
 'citation',
 'must',
 'adjudicate',
 'court.',
 'officer',
 'may',
 'contacted',
 'party',
 'complaint',
 'turn',
 'called',
 'threat

In [130]:
# bigrams = [word for word in model.vocab if len(word.split()) == 2]
# print(bigrams[:20])

In [131]:
def preprocess_nopd(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complain?a?n?ts?\.? ?| ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)
    return df


def convert_to_list(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.astype(str).fillna("")
    return df[~((df.allegation_desc == ""))]

In [132]:

def create_model(df):
    docs = [x for x in df["allegation_desc"]]
    # unique = [x for x in df["allegation_topic_uid"]]

    model = Top2Vec(
        docs,
        ngram_vocab=True,
        speed="deep-learn",
        use_embedding_model_tokenizer=True,
        min_count=5,
        # document_ids=unique
    )
    return model

In [134]:
def model():
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")\
    .pipe(convert_to_list)\
    .pipe(preprocess_nopd)\
    .pipe(create_model)
    return df

In [135]:
model = model()

2022-10-22 14:57:15,850 - top2vec - INFO - Pre-processing documents for training
2022-10-22 14:57:16,036 - top2vec - INFO - Creating joint document/word embedding
2022-10-22 14:58:50,674 - top2vec - INFO - Creating lower dimension embedding of documents
2022-10-22 14:58:57,620 - top2vec - INFO - Finding dense areas of documents
2022-10-22 14:58:57,870 - top2vec - INFO - Finding topics


In [136]:
if len(model.get_topic_sizes()) > 1:
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, scores, num in zip(topic_words, word_scores, topic_nums):
        print(num)
        print(f"Words: {words}")

0
Words: ['which caused' 'been able' 'coming from' 'no reason' 'document domestic'
 'sick leave' 'sheriff deputy' 'armor' 'french quarter' 'based on'
 'supplemental report' 'parade route' 'allegesof' 'variouswho'
 'untruthful' 'states that' 'in jefferson' 'unprofessional manner'
 'miscellaneous incident' 'jefferson parish' 'enforcement action'
 'subordinates' 'trial' 'tools' 'thatdid' 'photograph' 'be identified'
 'thedue' 'seriously' 'take any' 'while driving' 'explain what'
 'dismissive' 'correspondence' 'accurate' 'out wanted'
 'aggravated assault' 'new orleans' 'unknownfailed' 'any information'
 'investigations' 'biased' 'any action' 'individuals' 'log'
 'departmental policy' 'task' 'summary' 'timely manner' 'issued summonses']
1
Words: ['french quarter' 'in jefferson' 'be identified' 'jefferson parish'
 'supplemental report' 'based on' 'explain what' 'enforcement action'
 'out wanted' 'states that' 'unprofessional manner' 'while driving'
 'miscellaneous incident' 'red light' 'file

In [137]:
topic_sizes, top_nums = model.get_topic_sizes()
print(topic_sizes)
print(top_nums)

[257 193 192 149 132 129 127 109 109 103 103 103  99  99  97  96  86  85
  84  82  80  79  73  70  69  69  68  68  68  66  63  61  60  58  57  57
  56  55  54  54  54  53  52  47  46  45  44  44  41  35]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49]


In [140]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=2, num_docs=10)

for doc, score, doc_id in list(zip(documents, document_scores, document_ids)):
    print(f"Document: {doc_id}, Score: {score}")
    print("--------------------")
    print(doc)
    print("--------------------")

Document: 2278, Score: 0.7974550127983093
--------------------
was a bully and abused his position.
--------------------
Document: 3678, Score: 0.7839356660842896
--------------------
upset about
--------------------
Document: 2461, Score: 0.7519679665565491
--------------------
did not act impartially at a domestic scene.
--------------------
Document: 3677, Score: 0.7488626837730408
--------------------
upset about
--------------------
Document: 3578, Score: 0.7485863566398621
--------------------
allegesof neglect of duty.
--------------------
Document: 3554, Score: 0.7427622079849243
--------------------
allegesof neglect of duty.
--------------------
Document: 2778, Score: 0.7387575507164001
--------------------
tailgated supervisor on the interstate.
--------------------
Document: 3537, Score: 0.7378426194190979
--------------------
allegesof neglect of duty.
--------------------
Document: 3396, Score: 0.7308454513549805
--------------------
allegesof neglect of duty.
-----------

In [161]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=26, num_docs=68)

ents = {(doc, score) for doc, score in list(zip(documents, document_scores))}
df = pd.DataFrame(ents, columns=["doc", "score"])
print(df)
df.loc[:, "topic"] = "26"
df.to_csv("../data/raw/new_orleans_pd/topics/topic_26.csv", index=False)

                                                  doc     score
0   wasof conducting a deficient supervisory inves...  0.537130
1   argues that the fines for his traffic citation...  0.380044
2   made theand her partner feel like criminals du...  0.274917
3   that thecan’t write an accurate report because...  0.346054
4   of having inappropriate relationship.  ‘*  201...  0.502831
..                                                ...       ...
63                                                     0.132964
64  (assigned to ddact zone) to respond to a call ...  0.330052
65             followed him and brandished a firearm.  0.351662
66  technician failed to handle a call for service...  0.611780
67                                                     0.181981

[68 rows x 2 columns]


In [142]:
# model.save("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

In [143]:
# model.generate_topic_wordcloud(0)

In [144]:
model.topic_words

array([['which caused', 'been able', 'coming from', ..., 'summary',
        'timely manner', 'issued summonses'],
       ['french quarter', 'in jefferson', 'be identified', ...,
        'any action', 'as if', 'theattempted'],
       ['uustained action', 'upset because', 'unprofessional comments',
        ..., 'neck', 'asupervisor', 'protective'],
       ...,
       ['as scheduled', 'scheduled', 'verbal argument', ..., 'could not',
        'taxi', 'detail'],
       ['being rude', 'being unprofessional', 'thewas rude', ...,
        'paragraph neglect', 'touching', 'paragraph'],
       ['of duty', 'call for', 'tested positive', ..., 'appear in',
        'sleeping on', 'arrived at']], dtype='<U27')

In [145]:
model.topic_words[0]

array(['which caused', 'been able', 'coming from', 'no reason',
       'document domestic', 'sick leave', 'sheriff deputy', 'armor',
       'french quarter', 'based on', 'supplemental report',
       'parade route', 'allegesof', 'variouswho', 'untruthful',
       'states that', 'in jefferson', 'unprofessional manner',
       'miscellaneous incident', 'jefferson parish', 'enforcement action',
       'subordinates', 'trial', 'tools', 'thatdid', 'photograph',
       'be identified', 'thedue', 'seriously', 'take any',
       'while driving', 'explain what', 'dismissive', 'correspondence',
       'accurate', 'out wanted', 'aggravated assault', 'new orleans',
       'unknownfailed', 'any information', 'investigations', 'biased',
       'any action', 'individuals', 'log', 'departmental policy', 'task',
       'summary', 'timely manner', 'issued summonses'], dtype='<U27')

In [146]:
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, nums in zip(topic_words, word_scores, topic_nums):
  print("Topic Number: ",nums)
  print(f"Words: {words}")
  print("\n")

Topic Number:  0
Words: ['which caused' 'been able' 'coming from' 'no reason' 'document domestic'
 'sick leave' 'sheriff deputy' 'armor' 'french quarter' 'based on'
 'supplemental report' 'parade route' 'allegesof' 'variouswho'
 'untruthful' 'states that' 'in jefferson' 'unprofessional manner'
 'miscellaneous incident' 'jefferson parish' 'enforcement action'
 'subordinates' 'trial' 'tools' 'thatdid' 'photograph' 'be identified'
 'thedue' 'seriously' 'take any' 'while driving' 'explain what'
 'dismissive' 'correspondence' 'accurate' 'out wanted'
 'aggravated assault' 'new orleans' 'unknownfailed' 'any information'
 'investigations' 'biased' 'any action' 'individuals' 'log'
 'departmental policy' 'task' 'summary' 'timely manner' 'issued summonses']


Topic Number:  1
Words: ['french quarter' 'in jefferson' 'be identified' 'jefferson parish'
 'supplemental report' 'based on' 'explain what' 'enforcement action'
 'out wanted' 'states that' 'unprofessional manner' 'while driving'
 'miscellan

In [147]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["sexual"], num_topics=5)
for word, w_score, topic, t_score in list(zip(topic_words, word_scores, topic_scores, topic_nums)):
    print(f"Word: \n{word}")
    print("--------------------")
    print(f"Word Score \n{w_score}")
    print("--------------------")
    print(f"Topic Score: \n{topic}")
    print("--------------------")
    print (f"Topic # \n{t_score}")
    print("--------END---------")

Word: 
['sex crimes' 'sex' 'crimes' 'tested positive' 'screening'
 'sexual assault' 'years' 'robbed' 'victim' 'thelearned' 'anto' 'act'
 'on social' 'stop sign' 'possibly' 'aware' 'crime lab' 'assault'
 'civil matter' 'lab technician' 'raped' 'thetheof' 'social media'
 'sexual' 'an illegal' 'collect evidence' 'rape' 'allegation neglect'
 'notify' 'substance' 'theresponded' 'prior' 'service' 'hit run'
 'notify pib' 'positive' 'substance abuse' 'once' 'for service'
 'allegation professionalism' 'refusing' 'call for' 'allegation rule'
 'traffic violation' 'formal' 'xonerated' 'command desk' 'arrived on'
 'detaining' 'allegation adherence']
--------------------
Word Score 
[0.776382   0.66883934 0.5946551  0.40445668 0.34332928 0.33284354
 0.31986773 0.30851933 0.3023288  0.29674977 0.2955022  0.29381585
 0.29366586 0.28656074 0.2738184  0.2684891  0.2668148  0.26576513
 0.26553985 0.25958532 0.25789946 0.253091   0.25012043 0.24637146
 0.2461028  0.24577665 0.24562147 0.23582023 0.2295383

In [148]:
# model = Top2Vec.load("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

# umap_args = {
#     "n_neighbors": 15,
#     "n_components": 2, # 5 -> 2 for plotting 
#     "metric": "cosine",
# }
# umap_model = umap.UMAP(**umap_args).fit(model.topic_vectors)
# umap.plot.points(umap_model, labels=model.doc_top_reduced)

In [149]:
############################################################ bert ##############################################################################

In [150]:
from bertopic import BERTopic
import json

In [151]:
cprr = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")

In [152]:
def preprocess_bert(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()

def preprocess_bert(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complain?a?n?ts?\.? ?| ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)\
        .str.replace(r"(\w+)\.(\w+)", r"\1 \2", regex=True)
    return df[~((df.allegation_desc == ""))]


def drop_rows_missing_data(df):
    return df[~((df.allegation_desc.fillna("") == ""))]


In [153]:
cprr = cprr.pipe(drop_rows_missing_data)

In [154]:
cprr.shape

(3633, 9)

In [155]:
docs = cprr.allegation_desc

In [156]:
docs = json.loads(docs.to_json(orient='records'))

In [157]:
docs[0]

'complaint was via webmail. police were called to remove complainant from his home. complainant alleged he told officers his items were vandalized and thrown outside. complainant was told to leave the home. complainant alleged his name was also on the lease and the responding officers did not request any identification from any of the involved parties. two attempts were made to contact the complainant via phone to conduct an audiotaped interview; to determine the specific allegations.'

In [158]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [159]:
topic, probs = topic_model.fit_transform(docs)

KeyboardInterrupt: 

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1184,-1_the_her_to_complainant
1,0,133,0_nopd_the_an_by
2,1,130,1_driver_citation_accident_license
3,2,109,2_bwc_activate_use_force
4,3,102,3_son_daughter_child_her
...,...,...,...
71,70,12,70_social_media_posted_comments
72,71,12,71_failed_submit_mandated_complete
73,72,12,72_pursuit_unauthorized_failing_0142
74,73,11,73_suspended_dwi_breath_smelled


In [None]:
topic_model.get_topic(3)

[('son', 0.038848865224631705),
 ('daughter', 0.02242217886614821),
 ('child', 0.02195652016338895),
 ('her', 0.020927570863147792),
 ('she', 0.01486016357256261),
 ('juvenile', 0.014706670812861965),
 ('mother', 0.014301698831230114),
 ('arrested', 0.013704328951432247),
 ('father', 0.013120305353513068),
 ('stated', 0.012935013067264023)]

In [None]:
topic_model.get_representative_docs(3)

['complainant stated the officers did not assist her with getting custody of her son and w: she stated that she was also told she was going to get arrested if she would not leave very rude,',
 'the complainants stated that their sons were arrested and they complained to them that they were slammed to the ground and dragged repeatedly.',
 'complainant stated her son was falsely charged by the arresting officer. complainant was not on the scene; and others who were told her the incident didn’t go as the police said it did. officer took temporary license plate off the car; left with it; and used profanity on the scene.']

In [None]:
# topic_model.visualize_topics()

In [None]:
# topic_model.visualize_barchart()

In [None]:
df_bert = pd.DataFrame({"topic": topic, "documents": docs})

In [None]:
df_bert.to_csv("berty.csv")

In [None]:
############################################################ gensim ##############################################################################

In [None]:
# import numpy as np
# import json
# import glob
# import gensim
# import gensim.corpora as corpora 
# from gensim.utils import simple_preprocess
# from gensim.models import CoherenceModel
# import spacy
# import pandas as pd

# import spacy
# from nltk.corpus import stopwords

# import pyLDAvis
# import pyLDAvis.gensim_models

In [None]:
# nopd = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")

In [None]:
# def preprocess_gensim(df):
#     df = df.astype(str).fillna("").dropna()
#     return df[~((df.allegation_desc.fillna("") == ""))].dropna()

In [None]:
# nopd = nopd.pipe(preprocess_gensim)

In [None]:
# docs = nopd.allegation_desc

In [None]:
# def lemmatization(descs, allowed_pos_tags=["NOUN", "ADJ", "VERB", "ADV"]):
#     nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
#     final_text = []
#     for desc in descs:
#         doc = nlp(desc)
#         new_text = " ".join([token.lemma_ for token in doc if token.pos_ in allowed_pos_tags])
#         final_text.append(new_text)
#     return (final_text)

In [None]:
# lemmatized_texts = lemmatization(docs)

In [None]:
# def gen_words(texts):
#     final = []
#     for text in texts:
#         new = gensim.utils.simple_preprocess(text, deacc=True)
#         final.append(new)
#     return (final)

In [None]:
# data_words = gen_words(lemmatized_texts)

In [None]:
# id2word = corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

In [None]:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=30,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha="auto")

In [None]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# vis