In [1]:
################################################### top2vec ######################################################

In [2]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
import umap.umap_ as umap
import umap.plot

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()


In [4]:

def explore_nopd_topics():
    stop = stopwords.words("english")
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv").astype(str)
    df["allegation_desc"] = df["allegation_desc"].apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )

    df.loc[:, "topics"] = " ".join(x for x in df["allegation_desc"].astype(str))
    df = df["topics"][0]
    df = df.split()

    counts = Counter(df)
    most_occur = counts.most_common(50)
    print(most_occur)
    return df


In [5]:
explore_nopd_topics()

[('complainant', 3874), ('officer', 3259), ('stated', 1746), ('accused', 1691), ('officers', 809), ('failed', 641), ('police', 570), ('nan', 547), ('vehicle', 476), ('report', 434), ('alleged', 345), ('‘the', 324), ('involved', 288), ('told', 256), ('incident', 255), ('complaint', 253), ('unprofessional', 249), ('supervisor', 245), ('called', 238), ('also', 224), ('arrested', 221), ('failing', 216), ('allegation:', 216), ('take', 210), ('said', 202), ('call', 199), ('neglect', 197), ('duty', 189), ('unknown', 188), ('subject', 180), ('nopd', 176), ('rude', 173), ('action', 170), ('traffic', 159), ('officer.', 158), ('her.', 147), ('domestic', 147), ('report.', 144), ('investigation', 141), ('incident.', 140), ('would', 135), ('district', 134), ('accident', 132), ("'*", 132), ('issued', 130), ('arrest', 129), ('another', 126), ('alleges', 121), ('vehicle.', 119), ('one', 119)]


['complaint',
 'via',
 'webmail.',
 'police',
 'called',
 'remove',
 'complainant',
 'home.',
 'complainant',
 'alleged',
 'told',
 'officers',
 'items',
 'vandalized',
 'thrown',
 'outside.',
 'complainant',
 'told',
 'leave',
 'home.',
 'complainant',
 'alleged',
 'name',
 'also',
 'lease',
 'responding',
 'officers',
 'request',
 'identification',
 'involved',
 'parties.',
 'two',
 'attempts',
 'made',
 'contact',
 'complainant',
 'via',
 'phone',
 'conduct',
 'audiotaped',
 'interview;',
 'determine',
 'specific',
 'allegations.',
 'complainant',
 'stated',
 'involved',
 'accident.',
 'prior',
 'officers',
 'arriving',
 'scene',
 'unknown',
 'officer',
 'arrived',
 'informed',
 'move',
 'vehicles',
 'traffic.',
 'complainant',
 'feels',
 'officer',
 'let',
 'explain',
 'happened',
 'investigate',
 'accident',
 'properly.',
 'complainant',
 'issued',
 'citation',
 'must',
 'adjudicate',
 'court.',
 'officer',
 'may',
 'contacted',
 'party',
 'complaint',
 'turn',
 'called',
 'threat

In [6]:
# bigrams = [word for word in model.vocab if len(word.split()) == 2]
# print(bigrams[:20])

In [7]:
def preprocess_nopd(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complain?a?n?ts?\.? ?| ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)
    return df


def convert_to_list(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.astype(str).fillna("")
    return df[~((df.allegation_desc == ""))]

In [8]:

def create_model(df):
    docs = [x for x in df["allegation_desc"]]
    # unique = [x for x in df["allegation_topic_uid"]]

    model = Top2Vec(
        docs,
        ngram_vocab=True,
        speed="deep-learn",
        use_embedding_model_tokenizer=True,
        min_count=5,
        # document_ids=unique
    )
    return model

In [9]:
def model():
    df = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")\
    .pipe(convert_to_list)\
    .pipe(preprocess_nopd)\
    .pipe(create_model)
    return df

In [10]:
model = model()

2022-10-23 18:56:42,705 - top2vec - INFO - Pre-processing documents for training
2022-10-23 18:56:43,001 - top2vec - INFO - Creating joint document/word embedding
2022-10-23 18:58:07,252 - top2vec - INFO - Creating lower dimension embedding of documents
2022-10-23 18:58:34,735 - top2vec - INFO - Finding dense areas of documents
2022-10-23 18:58:35,026 - top2vec - INFO - Finding topics


In [11]:
if len(model.get_topic_sizes()) > 1:
    topic_words, word_scores, topic_nums = model.get_topics()
    for words, scores, num in zip(topic_words, word_scores, topic_nums):
        print(num)
        print(f"Words: {words}")

0
Words: ['french quarter' 'based on' 'states that' 'supplemental report'
 'explain what' 'in jefferson' 'be identified' 'miscellaneous incident'
 'red light' 'file aagainst' 'enforcement action' 'while driving'
 'unprofessional manner' 'aggravated assault' 'jefferson parish'
 'law enforcement' 'timely manner' 'no reason' 'out wanted' 'coming from'
 'new orleans' 'text messages' 'credit' 'been able' 'french'
 'document domestic' 'quarter' 'approached by' 'parade route'
 'which caused' 'lights' 'dating' 'proceeded' 'as though' 'sick leave'
 'as if' 'threatened by' 'pm' 'festival' 'theduring' 'nopd officer'
 'sirens' 'any action' 'wereof' 'sheriff deputy' 'variouswho' 'in verbal'
 'detained' 'illegal' 'physically']
1
Words: ['which caused' 'no reason' 'document domestic' 'been able' 'coming from'
 'sick leave' 'sheriff deputy' 'armor' 'french quarter' 'variouswho'
 'states that' 'parade route' 'based on' 'allegesof' 'supplemental report'
 'untruthful' 'subordinates' 'new orleans' 'trial'

In [12]:
topic_sizes, top_nums = model.get_topic_sizes()
print(topic_sizes)
print(top_nums)

[262 233 214 150 147 139 134 130 122 119 112 106 103 102 101 100  98  97
  95  92  89  88  87  87  82  82  78  75  74  73  73  71  69  68  67  66
  65  64  63  56  47]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40]


In [13]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=2, num_docs=10)

for doc, score, doc_id in list(zip(documents, document_scores, document_ids)):
    print(f"Document: {doc_id}, Score: {score}")
    print("--------------------")
    print(doc)
    print("--------------------")

Document: 2278, Score: 0.7681394219398499
--------------------
was a bully and abused his position.
--------------------
Document: 3678, Score: 0.7641843557357788
--------------------
upset about
--------------------
Document: 2778, Score: 0.7591538429260254
--------------------
tailgated supervisor on the interstate.
--------------------
Document: 3677, Score: 0.7489407658576965
--------------------
upset about
--------------------
Document: 3538, Score: 0.7405471801757812
--------------------
allegesof neglect of duty.
--------------------
Document: 3554, Score: 0.7307953238487244
--------------------
allegesof neglect of duty.
--------------------
Document: 3578, Score: 0.7296826839447021
--------------------
allegesof neglect of duty.
--------------------
Document: 3537, Score: 0.727536678314209
--------------------
allegesof neglect of duty.
--------------------
Document: 3396, Score: 0.7236541509628296
--------------------
allegesof neglect of duty.
--------------------
Document:

In [14]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=26, num_docs=68)

ents = {(doc, score) for doc, score in list(zip(documents, document_scores))}
df = pd.DataFrame(ents, columns=["doc", "score"])
print(df)
df.loc[:, "topic"] = "26"
df.to_csv("../data/raw/new_orleans_pd/topics/topic_26.csv", index=False)

                                                  doc     score
0   her juvenile daughter was battered in a fight ...  0.211629
1   he was arrested for a warrant by an officer. h...  0.183115
2   theshe and her neighbor had been involved in a...  0.507553
3   that he called theto resolve a dispute between...  0.238703
4   theher home had been vandalized. ‘theexplained...  0.214415
..                                                ...       ...
63  he and a neighbor had a dispute and the neighb...  0.499947
64  theshe had gone to the district station to req...  0.433782
65  (a nopd offices)that she saw video/audio foota...  0.173938
66  thewent into the district station to file aaga...  0.362493
67                                                     0.287576

[68 rows x 2 columns]


In [15]:
# model.save("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

In [16]:
# model.generate_topic_wordcloud(0)

In [17]:
model.topic_words

array([['french quarter', 'based on', 'states that', ..., 'detained',
        'illegal', 'physically'],
       ['which caused', 'no reason', 'document domestic', ...,
        'dismissive', 'departmental policy', 'dealing'],
       ['uustained action', 'unprofessional comments', 'upset because',
        ..., 'away from', 'private', 'escorting'],
       ...,
       ['changed', 'domestic disturbance', 'nat', ..., 'action taken',
        'determined', 'harassed by'],
       ['public neglect', 'accusesof', 'neglect of', ...,
        'multiple fatalities', 'rule paragraph', 'rule'],
       ['as scheduled', 'scheduled', 'verbal argument', ...,
        'departmental policy', 'supervisory approval', 'when required']],
      dtype='<U27')

In [18]:
model.topic_words[0]

array(['french quarter', 'based on', 'states that', 'supplemental report',
       'explain what', 'in jefferson', 'be identified',
       'miscellaneous incident', 'red light', 'file aagainst',
       'enforcement action', 'while driving', 'unprofessional manner',
       'aggravated assault', 'jefferson parish', 'law enforcement',
       'timely manner', 'no reason', 'out wanted', 'coming from',
       'new orleans', 'text messages', 'credit', 'been able', 'french',
       'document domestic', 'quarter', 'approached by', 'parade route',
       'which caused', 'lights', 'dating', 'proceeded', 'as though',
       'sick leave', 'as if', 'threatened by', 'pm', 'festival',
       'theduring', 'nopd officer', 'sirens', 'any action', 'wereof',
       'sheriff deputy', 'variouswho', 'in verbal', 'detained', 'illegal',
       'physically'], dtype='<U27')

In [19]:
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, nums in zip(topic_words, word_scores, topic_nums):
  print("Topic Number: ",nums)
  print(f"Words: {words}")
  print("\n")

Topic Number:  0
Words: ['french quarter' 'based on' 'states that' 'supplemental report'
 'explain what' 'in jefferson' 'be identified' 'miscellaneous incident'
 'red light' 'file aagainst' 'enforcement action' 'while driving'
 'unprofessional manner' 'aggravated assault' 'jefferson parish'
 'law enforcement' 'timely manner' 'no reason' 'out wanted' 'coming from'
 'new orleans' 'text messages' 'credit' 'been able' 'french'
 'document domestic' 'quarter' 'approached by' 'parade route'
 'which caused' 'lights' 'dating' 'proceeded' 'as though' 'sick leave'
 'as if' 'threatened by' 'pm' 'festival' 'theduring' 'nopd officer'
 'sirens' 'any action' 'wereof' 'sheriff deputy' 'variouswho' 'in verbal'
 'detained' 'illegal' 'physically']


Topic Number:  1
Words: ['which caused' 'no reason' 'document domestic' 'been able' 'coming from'
 'sick leave' 'sheriff deputy' 'armor' 'french quarter' 'variouswho'
 'states that' 'parade route' 'based on' 'allegesof' 'supplemental report'
 'untruthful' 'sub

In [20]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["sexual"], num_topics=5)
for word, w_score, topic, t_score in list(zip(topic_words, word_scores, topic_scores, topic_nums)):
    print(f"Word: \n{word}")
    print("--------------------")
    print(f"Word Score \n{w_score}")
    print("--------------------")
    print(f"Topic Score: \n{topic}")
    print("--------------------")
    print (f"Topic # \n{t_score}")
    print("--------END---------")

Word: 
['vulgar language' 'vulgar' 'profane language' 'body armor' 'phone calls'
 'correspondence' 'unprofessional comments' 'profane' 'andid not'
 'language' 'thedepartment' 'unprofessional towards' 'sexual assault'
 'another employee' 'walked into' 'as instructed' 'uustained action'
 'protective order' 'accusesof' 'off duty' 'photograph'
 'felt disrespected' 'comment' 'email' 'while handling' 'sexual' 'of duty'
 'response time' 'individual who' 'themade' 'used' 'subordinates'
 'when required' 'an individual' 'obscene' 'social media' 'inappropriate'
 'technician' 'allegesof' 'conduct paragraph' 'do anything' 'sending'
 'theused' 'paragraph neglect' 'used profanity' 'paid detail'
 'physical altercation' 'rule paragraph' 'as required' 'employee']
--------------------
Word Score 
[0.7589655  0.53663856 0.49623263 0.4779454  0.43670166 0.42055762
 0.41353524 0.39617273 0.394975   0.38783586 0.36175662 0.3516775
 0.34962255 0.34501922 0.343054   0.33646995 0.33302546 0.32854888
 0.31524238

In [21]:
# model = Top2Vec.load("../data/topic_modelling/model/new_orleans_pd_pib_2014_2019")

# umap_args = {
#     "n_neighbors": 15,
#     "n_components": 2, # 5 -> 2 for plotting 
#     "metric": "cosine",
# }
# umap_model = umap.UMAP(**umap_args).fit(model.topic_vectors)
# umap.plot.points(umap_model, labels=model.doc_top_reduced)

In [22]:
############################################################ bert ##############################################################################

In [23]:
from bertopic import BERTopic
import json

In [24]:
cprr = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")

In [25]:
def preprocess_bert(df):
    df = df.astype(str).fillna("").dropna()

    stop = stopwords.words("english")
    df.loc[:, "allegation_desc"] = df.allegation_desc.apply(
        lambda x: " ".join([word for word in x.split() if word not in (stop)])
    )
    return df[~((df.allegation_desc.fillna("") == ""))].dropna()

def preprocess_bert(df):
    df.loc[:, "allegation_desc"] = df.allegation_desc.fillna("")\
        .str.replace(r"( ?officers? | ?complain?a?n?ts?\.? ?| ?stated ?| ?accused ?| ?nan ?| ?police ?)", "", regex=True)\
        .str.replace(r"(\w+)\.(\w+)", r"\1 \2", regex=True)
    return df[~((df.allegation_desc == ""))]


def drop_rows_missing_data(df):
    return df[~((df.allegation_desc.fillna("") == ""))]


In [26]:
cprr = cprr.pipe(drop_rows_missing_data)

In [27]:
cprr.shape

(3633, 9)

In [28]:
docs = cprr.allegation_desc

In [29]:
docs = json.loads(docs.to_json(orient='records'))

In [30]:
docs[0]

'complaint was via webmail. police were called to remove complainant from his home. complainant alleged he told officers his items were vandalized and thrown outside. complainant was told to leave the home. complainant alleged his name was also on the lease and the responding officers did not request any identification from any of the involved parties. two attempts were made to contact the complainant via phone to conduct an audiotaped interview; to determine the specific allegations.'

In [31]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [32]:
topic, probs = topic_model.fit_transform(docs)

In [33]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1243,-1_the_her_to_complainant
1,0,134,0_nopd_the_stated_and
2,1,110,1_bwc_activate_use_force
3,2,98,2_2018_allegation_neglect_duty
4,3,95,3_son_child_daughter_her
...,...,...,...
71,70,12,70_social_media_posted_comments
72,71,12,71_failed_submit_date_accused
73,72,12,72_brother_missing_arrested_death
74,73,12,73_pursuit_unauthorized_failing_0142


In [34]:
topic_model.get_topic(3)

[('son', 0.04258868092518672),
 ('child', 0.0223711097316915),
 ('daughter', 0.022305294492480383),
 ('her', 0.020113576607355268),
 ('juvenile', 0.015920889471555705),
 ('she', 0.01474646705432511),
 ('school', 0.014683452943718617),
 ('arrested', 0.014423686489862154),
 ('father', 0.013414463705371948),
 ('stated', 0.012417718656247516)]

In [35]:
topic_model.get_representative_docs(3)

['complainant stated her son was falsely charged by the arresting officer. complainant was not on the scene; and others who were told her the incident didn’t go as the police said it did. officer took temporary license plate off the car; left with it; and used profanity on the scene.',
 'the complainant stated that her son was arrested along with his friend when they got into a physical altercation with an elderly man. the complainant stated that the sergeant came to her home and had a bad attitude and he did not want to hear anything her son had to say and had already decided to take her son to the youth study center.',
 'complainant stated the officers did not assist her with getting custody of her son and w: she stated that she was also told she was going to get arrested if she would not leave very rude,']

In [36]:
# topic_model.visualize_topics()

In [37]:
# topic_model.visualize_barchart()

In [38]:
df_bert = pd.DataFrame({"topic": topic, "documents": docs})

In [40]:
############################################################ gensim ##############################################################################

In [41]:
# import numpy as np
# import json
# import glob
# import gensim
# import gensim.corpora as corpora 
# from gensim.utils import simple_preprocess
# from gensim.models import CoherenceModel
# import spacy
# import pandas as pd

# import spacy
# from nltk.corpus import stopwords

# import pyLDAvis
# import pyLDAvis.gensim_models

In [42]:
# nopd = pd.read_csv("../data/clean/cprr_new_orleans_pd_pib_reports_2014_2020.csv")

In [43]:
# def preprocess_gensim(df):
#     df = df.astype(str).fillna("").dropna()
#     return df[~((df.allegation_desc.fillna("") == ""))].dropna()

In [44]:
# nopd = nopd.pipe(preprocess_gensim)

In [45]:
# docs = nopd.allegation_desc

In [46]:
# def lemmatization(descs, allowed_pos_tags=["NOUN", "ADJ", "VERB", "ADV"]):
#     nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
#     final_text = []
#     for desc in descs:
#         doc = nlp(desc)
#         new_text = " ".join([token.lemma_ for token in doc if token.pos_ in allowed_pos_tags])
#         final_text.append(new_text)
#     return (final_text)

In [47]:
# lemmatized_texts = lemmatization(docs)

In [48]:
# def gen_words(texts):
#     final = []
#     for text in texts:
#         new = gensim.utils.simple_preprocess(text, deacc=True)
#         final.append(new)
#     return (final)

In [49]:
# data_words = gen_words(lemmatized_texts)

In [50]:
# id2word = corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

In [51]:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=30,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha="auto")

In [52]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# vis