In [57]:
# source: https://softhints.com/merge-multiple-json-files-pandas-dataframe/
import numpy as np
import pandas as pd
import glob, os, json

# json_dir = './data'
json_dir = r'E:\STUDY\CKIDS\Analyse\data\data'
dir_list = glob.glob(os.path.join(json_dir, 'search_*'))

In [58]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder


# Define filter function
def filter_text(text):
    # Define keywords related to LA crime victims
    keywords = ['kill', 'shoot', 'R.I.P.', 'homicide', 'gun', 'violence', 'rip', 'rest in peace', 'murder']
    
    # Check if the text contains at least one keyword
    if any(keyword in text.lower() for keyword in keywords):
        return True
    else:
        return False

# Load stop words
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

# Process text
data_list = []
text_list = []
for dir in dir_list:
    file_list = glob.glob(os.path.join(dir, '*.json'))
    result_count = 0
    temp = []
    name = ""
    for file in file_list:
        f = open(file, 'r', encoding='utf-8') # file is a path to a json file
        js = json.load(f)
        file_name = os.path.basename(file)
        dir_name = os.path.basename(os.path.dirname(file))
        name = dir_name.split('search_')[-1]
        result_count = result_count + js['meta']['result_count']
        temp = np.concatenate((temp, js['data']))
        
        for d in js['data']:
            # Remove URLs and special characters
            text = re.sub(r'http\S+', '', d['text'])  # Remove URLs
            text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
            
            # Remove stop words
            tokens = word_tokenize(text)
            tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
            text = ' '.join(tokens)
            
            # Extract bigrams
            bigram_measures = BigramAssocMeasures()
            finder = BigramCollocationFinder.from_words(tokens)
            finder.apply_freq_filter(3)
            bigrams = finder.nbest(bigram_measures.pmi, 10)
            bigrams = ['_'.join(bigram) for bigram in bigrams]
            text = text + ' ' + ' '.join(bigrams)
            
            # Filter text
            if filter_text(text):
                text_list.append(text)
        
        f.close()
    data_list.append({'name': name, 'result_count': result_count, 'data': temp})

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [59]:
data = pd.DataFrame(data_list)
print(data.head())
print(f'data: {data.shape}')

                      name  result_count  \
0                                      0   
1           Aaron Brockway             2   
2    Aaron Carlos Martinez             2   
3  Aaron Christopher Scott             1   
4                                      0   

                                                data  
0                                                 []  
1  [{'text': 'Rip. Aaron Brockway üôè', 'author_id'...  
2  [{'id': '674337319760093184', 'edit_history_tw...  
3  [{'created_at': '2019-10-18T19:56:32.000Z', 't...  
4                                                 []  
data: (5333, 3)


In [60]:
# text_list is a list of strings containing the text of every tweet in the data
text_list[:5]

['rip aaron brockway ',
 'rip aaron brockway im lighting one ',
 'check donated son aaron harts final exspensive murderd walking 7eleven ',
 'aaron james pritchard 21 homicide report ',
 'abraham diaz 42yearold man bakersfield identified kern county coroners office man killed outside mariscos click read ']

In [61]:
text_list = [str(item) for item in text_list]

In [65]:
# source: https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8#:~:text=BERTopic%20is%20a%20topic%20modeling,words%20in%20the%20topic%20descriptions.
from bertopic import BERTopic

# testing: training model with first 100; training with the whole list takes a long time
model = BERTopic(language="multilingual")
topics, probabilities = model.fit_transform(text_list[:10000])
model.get_topic_freq().head()

# ignore topic -1 (no topic assigned)


Unnamed: 0,Topic,Count
0,-1,1222
1,0,938
2,1,626
3,2,271
4,3,258


In [66]:
# testing: results
model.get_topic(0)

[('blavity', 0.05243440724107425),
 ('knows', 0.052208869908204045),
 ('15yearold', 0.051643689772004016),
 ('darius', 0.05101631735710126),
 ('smith', 0.050195785251330435),
 ('border', 0.050119048635789136),
 ('patrol', 0.050080770468818786),
 ('officer', 0.04995108410651453),
 ('one', 0.04832585663594681),
 ('killed', 0.022935866450412592)]