In [None]:
# Step 1: Load distictview data and clean it + remove invalid data
# Step 2: Perform first pass keyword filteration and generate context windows
# Step 3: Define relevancy tagging prompt and use Deepseek to tag context window sections

In [50]:
# libraries
import pandas as pd
import re
import math
import os
from tqdm import tqdm
import itertools
import ollama
import re
from ast import literal_eval
import time
import concurrent.futures



# --------------------------------------------------
# 1) load distictview jsonl data 

JSONL_FILE = "districtview2025.topics.jsonl"
TRANSCRIPT_COLUMN = 'caption_sentences' # <-- this should be sentence segmented transcripts 
DEEPSEEK_RESULTS = 'deepseek_kw_chunks_2025.csv' # filename to save deepseek relevancy tagging

# keep all columns except redundant ones (core cols are leaid, vid id, sentence segmented transcripts, mined topics, & location info)
DROP_COLUMNS = ['caption_text','channel_title','channel_id','caption_clean_rpunct']

# keywords for initial relevancy sweep related to topic - from literature review
topic = 'school closures'
INITIAL_KEYWORDS = ['closure', 'closing', 'merging','merge', 'consolidate','consolidation'
                    'reassignment','utilization','building repurposing', 'right sizing']

df = pd.read_json(JSONL_FILE, lines=True).drop(columns=DROP_COLUMNS)
display(df.head(2)) # 5304 rows


# ----------------------------------------------------

# DATA PREP FUNCTIONS

def clean_brackets_and_music(transcript):
    '''remove tags denoted by square brackets and music captions'''

    # remove any strings in square brackets
    pattern = r"\[.*?\]"  
    filtered_list = [re.sub(r"[\[\]]", "", s) for s in transcript if not re.search(pattern, s) and "music]" not in s]

    # remove empty strings
    filtered_list = [element for element in filtered_list if element != '']
    return filtered_list



def remove_invalid_meetings(df):
    '''remove transcripts which are Nan or None, empty strings, or < 5 minutes worth of words = 120 * 5 = 600 words '''

    # remove invalid meetings
    valid_df = df.loc[(df[TRANSCRIPT_COLUMN].str.lower() != 'deleted video') &
                         (~df[TRANSCRIPT_COLUMN].isin([[],'', None, math.nan]))] 

    # remove meetings with content less than 5 min
    valid_df['transcript_single_string'] = valid_df[TRANSCRIPT_COLUMN].apply(lambda x: ' '.join(x))
    valid_df_has_content = valid_df.loc[(valid_df['transcript_single_string'].str.split().str.len() >= 600)]

    print('original count: ', len(df),'filtered count: ',len(valid_df_has_content))
    
    return valid_df_has_content.drop(columns = ['transcript_single_string'])



# KEYWORD FILTERATION FUNCTIONS

def get_kw_list(transcript):
    '''gets initial keywords present in this transcript'''
    transcript_as_str = ' '.join(transcript).lower()
    kw_list= []
    for kw in INITIAL_KEYWORDS:
        if kw in transcript_as_str:
            kw_list.append(kw)
    return kw_list

def filter_to_kw_meetings(df):
    '''keep only meetings with at least 1 keyword'''
    
    df['initial_keywords'] = df[TRANSCRIPT_COLUMN].apply(get_kw_list)

    # onyl keep meetings that contain a kw
    kw_df = df[df['initial_keywords'].apply(lambda x: len(x) > 0)]
    print(f'original meeting count: {len(df)}, kw only meeting count: {len(kw_df)}')
    return kw_df


# --------------------------------------------------

# 2) remove music tags and remove invalid meetings
# invalid meetings defined as transcripts which are: None, empty '', or less than 5 minutes worth of words = 120 * 5 = 600 words
# (https://pmc.ncbi.nlm.nih.gov/articles/PMC2649675/#:~:text=Estimates%20of%20normal%20speaking%20rate,Studdert%2DKennedy%2C%201967)

df[TRANSCRIPT_COLUMN] = df[TRANSCRIPT_COLUMN].apply(clean_brackets_and_music)

valid_df = remove_invalid_meetings(df) # 5106 / 5304 left
display(valid_df)

# --------------------------------------------------

# 3) 1st pass - manual filter based on presence of general topic keywords

# If any keyword is found, the meeting is kept
# cast very wide net @ this stage since the purpose is to reduce the meetings 
# prior to DeepSeek + GPT annotation for resource purposes
# keywords from Lit Review


valid_df['initial_keywords'] = valid_df[TRANSCRIPT_COLUMN].apply(get_kw_list)

meetings_with_kw_df = filter_to_kw_meetings(valid_df)
display(meetings_with_kw_df)

Unnamed: 0,centroid_lat,centroid_lon,leaid,place_name,state_name,video_id,caption_sentences,caption_sentence_topics
0,34.195757,-118.095851,629940,Pasadena Unified sb,California,Qiv4tMRrUug,"[[, music] n [, music] n [, music] La, [, musi...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2..."
1,34.195757,-118.095851,629940,Pasadena Unified sb,California,jV5xKMXARjQ,"[La N N N N here., Okay we are recessing to cl...","[-1, -1, 64, -1, -1, -1, -1, -1, 2, 2, 63, 9, ..."


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['transcript_single_string'] = valid_df[TRANSCRIPT_COLUMN].apply(lambda x: ' '.join(x))


original count:  5304 filtered count:  5106


Unnamed: 0,centroid_lat,centroid_lon,leaid,place_name,state_name,video_id,caption_sentences,caption_sentence_topics
0,34.195757,-118.095851,629940,Pasadena Unified sb,California,Qiv4tMRrUug,"[laughter n , We're going to start the meeting...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2..."
1,34.195757,-118.095851,629940,Pasadena Unified sb,California,jV5xKMXARjQ,"[La N N N N here., Okay we are recessing to cl...","[-1, -1, 64, -1, -1, -1, -1, -1, 2, 2, 63, 9, ..."
2,34.195757,-118.095851,629940,Pasadena Unified sb,California,megnv35esXQ,"[We're going to start the meeting right now., ...","[27, 27, 73, 27, -1, -1, -1, 27, 27, 27, 27, -..."
3,34.195757,-118.095851,629940,Pasadena Unified sb,California,owb9aYdkiFI,"[The , For, Will you lead us in the pledge?, P...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2..."
4,38.396716,-77.462184,5103660,Stafford County Public Schools sb,Virginia,3preQmqxcFA,"[Dr. Chase., Um, miss is Dr. Chase, I am onlin...","[51, -1, 27, 27, -1, -1, 65, -1, -1, -1, -1, -..."
...,...,...,...,...,...,...,...,...
5299,27.376443,-80.483779,1201770,St. Lucie sb,Florida,YWY08C9Lbs8,[Good morning All it is N9 o'clock and time to...,"[-1, 64, 2, 2, -1, -1, -1, -1, -1, 64, -1, -1,..."
5300,27.376443,-80.483779,1201770,St. Lucie sb,Florida,ZRqKfIuuEQY,"[Good evening everyone!, We will now get ready...","[-1, -1, 73, -1, -1, -1, -1, 2, -1, -1, -1, -1..."
5301,27.376443,-80.483779,1201770,St. Lucie sb,Florida,iJ1DQxcjosQ,"[Welcome The November 19, 2024 organizational ...","[73, -1, -1, 2, -1, 64, -1, -1, -1, -1, -1, -1..."
5302,27.376443,-80.483779,1201770,St. Lucie sb,Florida,oJ9eKsNlT5c,[Good morning it is 900 a.m and I call the spe...,"[-1, 2, 2, 34, 65, -1, -1, 81, 81, 81, 81, -1,..."


original meeting count: 5106, kw only meeting count: 3332


Unnamed: 0,centroid_lat,centroid_lon,leaid,place_name,state_name,video_id,caption_sentences,caption_sentence_topics,initial_keywords
0,34.195757,-118.095851,629940,Pasadena Unified sb,California,Qiv4tMRrUug,"[laughter n , We're going to start the meeting...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...","[closure, closing, merging, merge]"
1,34.195757,-118.095851,629940,Pasadena Unified sb,California,jV5xKMXARjQ,"[La N N N N here., Okay we are recessing to cl...","[-1, -1, 64, -1, -1, -1, -1, -1, 2, 2, 63, 9, ...",[merge]
2,34.195757,-118.095851,629940,Pasadena Unified sb,California,megnv35esXQ,"[We're going to start the meeting right now., ...","[27, 27, 73, 27, -1, -1, -1, 27, 27, 27, 27, -...","[closure, closing, merging, merge]"
3,34.195757,-118.095851,629940,Pasadena Unified sb,California,owb9aYdkiFI,"[The , For, Will you lead us in the pledge?, P...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...","[closure, merge]"
4,38.396716,-77.462184,5103660,Stafford County Public Schools sb,Virginia,3preQmqxcFA,"[Dr. Chase., Um, miss is Dr. Chase, I am onlin...","[51, -1, 27, 27, -1, -1, 65, -1, -1, -1, -1, -...",[closing]
...,...,...,...,...,...,...,...,...,...
5298,27.376443,-80.483779,1201770,St. Lucie sb,Florida,NpWXeUe_4GE,[Good morning Morning Good Morning Good mornin...,"[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...",[closing]
5299,27.376443,-80.483779,1201770,St. Lucie sb,Florida,YWY08C9Lbs8,[Good morning All it is N9 o'clock and time to...,"[-1, 64, 2, 2, -1, -1, -1, -1, -1, 64, -1, -1,...",[closing]
5300,27.376443,-80.483779,1201770,St. Lucie sb,Florida,ZRqKfIuuEQY,"[Good evening everyone!, We will now get ready...","[-1, -1, 73, -1, -1, -1, -1, 2, -1, -1, -1, -1...",[merge]
5301,27.376443,-80.483779,1201770,St. Lucie sb,Florida,iJ1DQxcjosQ,"[Welcome The November 19, 2024 organizational ...","[73, -1, -1, 2, -1, 64, -1, -1, -1, -1, -1, -1...","[closure, merging]"


In [52]:

# --------------------------------------------------

# CONTEXT WINDOW FUNCTIONS 

def find_sublist_index(sentence_list, keyword_list):
    '''takes list of sentences and returns list of indices that contain kw'''
    key_idx_list = []

    # for each sentence, check if a kw appears
    for i, sentence in enumerate(sentence_list):
        if any(keyword in sentence for keyword in keyword_list):
            key_idx_list.append(i)
    unique_indices_lst = list(set(key_idx_list))
    return unique_indices_lst


def merge_ranges(ranges, gap = 1):
    '''combines consecutive windows allowing for space inbetween'''
    if len(ranges) == 0 :
        return []

    # sort by start idx
    ranges.sort()

    merged = [ranges[0]]

    for start, end in ranges[1:]:
        prev_start, prev_end = merged[-1]

        # check if current range is within the allowed gap
        if start <= prev_end + gap:
            merged[-1] = (prev_start, max(prev_end, end))  # if yes then update last range
        else:
            merged.append((start, end))  # otherwise add as separate range

    return merged


def get_ranges_for_windows(sentence_list, kw_index_list, window_size):
    '''returns list of sentences with window_size padded on each side of index'''
    init_ranges = []
    # for each keyword noted to be present in the transcript, get context window ranges
    for i in kw_index_list:

        # don't go out of index
        start = max(0, i - window_size)
        end = min(len(sentence_list), i + window_size + 1)

        if start ==0:
            end = min(len(sentence_list), 2 * window_size + 1)

        init_ranges.append((start, end))


    # combine overlapping or consecutive ranges
    final_window_ranges = merge_ranges(init_ranges, gap = 0 )

    return final_window_ranges


def get_simple_windows_by_range(sentence_list,idx_ranges_for_windows):
    '''get chunks by given index ranges.'''

    text_windows = []
    for start, end in idx_ranges_for_windows:
        window_as_list = (sentence_list[start:end])

        #join sentences in window as single string for llm input
        window_single_string = ' '.join(window_as_list)
        text_windows.append(window_single_string)

    return text_windows


def get_windows(sentence_list,keyword_list,window_size=20):
    kw_index_list = find_sublist_index(sentence_list, keyword_list)

    idx_ranges_for_windows = get_ranges_for_windows(sentence_list, kw_index_list, window_size)
    windows = get_simple_windows_by_range(sentence_list, idx_ranges_for_windows)
    return windows


def get_context_windows(df):

    # get the text windows where kw is present
    all_transcripts_windows = []
    for index, row in df.iterrows():
        #note window size = 15 so 15 sentence on both sides is default, then consecutive windows are merged if found
        result = get_windows(sentence_list=row[TRANSCRIPT_COLUMN], keyword_list=row['initial_keywords'], window_size=15)
        all_transcripts_windows.append(result)

    df['kw_text_chunks'] = all_transcripts_windows
    kw_chunks_df = df.loc[df['kw_text_chunks'].apply(lambda x: len(x) > 0)] # remove chunks not containing any content

    return kw_chunks_df


# --------------------------------------------------

# 4) Get context window sections of meetings from kw locations

kw_chunks_df = get_context_windows(meetings_with_kw_df)
kw_chunks_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['kw_text_chunks'] = all_transcripts_windows


Unnamed: 0,centroid_lat,centroid_lon,leaid,place_name,state_name,video_id,caption_sentences,caption_sentence_topics,initial_keywords,kw_text_chunks
0,34.195757,-118.095851,629940,Pasadena Unified sb,California,Qiv4tMRrUug,"[laughter n , We're going to start the meeting...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...","[closure, closing, merging, merge]","[I Um, this is the readout language for Clos S..."
1,34.195757,-118.095851,629940,Pasadena Unified sb,California,jV5xKMXARjQ,"[La N N N N here., Okay we are recessing to cl...","[-1, -1, 64, -1, -1, -1, -1, -1, 2, 2, 63, 9, ...",[merge],"[Many of our students, educators and staff are..."
2,34.195757,-118.095851,629940,Pasadena Unified sb,California,megnv35esXQ,"[We're going to start the meeting right now., ...","[27, 27, 73, 27, -1, -1, -1, 27, 27, 27, 27, -...","[closure, closing, merging, merge]",[Public Employee Release Government Code 54957...
3,34.195757,-118.095851,629940,Pasadena Unified sb,California,owb9aYdkiFI,"[The , For, Will you lead us in the pledge?, P...","[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...","[closure, merge]",[I heard a quote recently that really struck w...
4,38.396716,-77.462184,5103660,Stafford County Public Schools sb,Virginia,3preQmqxcFA,"[Dr. Chase., Um, miss is Dr. Chase, I am onlin...","[51, -1, 27, 27, -1, -1, 65, -1, -1, -1, -1, -...",[closing],[We must work together to build the foundation...
...,...,...,...,...,...,...,...,...,...,...
5298,27.376443,-80.483779,1201770,St. Lucie sb,Florida,NpWXeUe_4GE,[Good morning Morning Good Morning Good mornin...,"[27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...",[closing],"[That's right, Three people build everything. ..."
5299,27.376443,-80.483779,1201770,St. Lucie sb,Florida,YWY08C9Lbs8,[Good morning All it is N9 o'clock and time to...,"[-1, 64, 2, 2, -1, -1, -1, -1, -1, 64, -1, -1,...",[closing],[then fourth grade reading and math. so you ca...
5300,27.376443,-80.483779,1201770,St. Lucie sb,Florida,ZRqKfIuuEQY,"[Good evening everyone!, We will now get ready...","[-1, -1, 73, -1, -1, -1, -1, 2, -1, -1, -1, -1...",[merge],[Congratulations can you please come before us...
5301,27.376443,-80.483779,1201770,St. Lucie sb,Florida,iJ1DQxcjosQ,"[Welcome The November 19, 2024 organizational ...","[73, -1, -1, 2, -1, 64, -1, -1, -1, -1, -1, -1...","[closure, merging]","[, I'm sorry M Mam Chair Dr. Mills. Um um I th..."


In [None]:

# --------------------------------------------------

# RELEVANCY TAGGING FUNCTIONS

# specfic to school closures topic -- important to tailor a prompt specific to the topic of interest in the future
def check_relevancy(transcript_section,ds_model='deepseek-r1:7b'):

    '''returns string category:
        - CovidRelatedSchoolClosure
        - LongTermSchoolClosure
        - NonCovidTemporarySchoolClosure
        - NoneOrUnrelated
    '''

    prompt = f'''
    Classify the following TRANSCRIPT_INPUT into ONE of the categories listed in CATEGORIES that best fits the TRANSCRIPT_INPUT context.

    ### TRANSCRIPT_INPUT:
    ---
    {transcript_section}

    ### CATEGORIES
    ---
    Choose the best single answer from the following 4 categories that best fits the input:
    ----
    - "SchoolClosureDueToHealthIssue": select this category if the discussion around school closures is specific to covid, flu, or pandemic concerns,
    - "LongTermSchoolClosureOrReassignment": select this category if the discussion around school closures is specific to long term changes such as merging schools, shutting schools down permanently, and student reassignment.
    - "MinorShortTermSchoolClosure":  select this category if the discussion around school closures is specific to temporary school closures besides covid or pandemic concerns that have no long term impacts on student or community population such as weather related closures,
    - "NoneOfTheAbove":  select this category if the discussion is not relevant regarding information about school closures and does not fall into any of the other 3 categories.

    ### RULES
    1. Do not write an introduction or summary.
    2. Respond only with valid string following the OUTPUT_FORMAT 
    3. Do not use any other categories besides the 4 categories listed in CATEGORIES
    4. Your response should have three asterisks before and after the string name of the selected category to denote the final answer. 


    ### OUTPUT_FORMAT
    "***CATEGORY***"
    '''

    response = ollama.chat(model = ds_model, messages=[
        {
            'role': 'user',
            'content': prompt,
        },
    ])

    full_deepseek_response = (response['message']['content'])

    # extract the last answer
    pattern = r'\*\*\*(.*?)\*\*\*'
    matches = re.findall(pattern, full_deepseek_response)

    if matches:
        return matches[0]
    else:
        return f"INCORRECT RESPONSE RETURNED: {full_deepseek_response}"


def check_relevancy_with_timeout(section, timeout=300): # timeout a single section after 5 minutes (more than enough time)
    '''interrupts tagging if taking too long'''
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(check_relevancy, section)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            return 'invalid_response_TIMEDOUT'


def tag_relevant_category(sections_list):
    print('Number of sections for this transcript: ',len(sections_list))

    # tagged_sections = [check_relevancy(section) for section in sections_list]
    tagged_sections = [check_relevancy_with_timeout(section) for section in sections_list]

    return tagged_sections


def get_indices_relevant(section_categories_lst, keep_list):
    '''returns list of section indices found relevant to school closure topic'''

    relevant_sections = []
    for i, val in enumerate(section_categories_lst):
        if val in keep_list:
            relevant_sections.append(i)
    return relevant_sections


def get_sections_relevant_from_indices(indx_list,sections_list):
    filtered_sections= [sections_list[i] for i in indx_list]
    return filtered_sections


def run_deepseek_filtering(window_df):
    '''iterate through context windows and tag with deepseek relevancy and save relevant sections under kept_sections column'''
    
    start = time.time()
    print('start time:', start)
    print('rows in this df: ',len(window_df))

    # tag with relevancy category
    window_df['relevancy_category'] = window_df['kw_text_chunks'].apply(tag_relevant_category)
    
    print('completed making relevancy_category column')
    #  filter to sections that are relevant

    window_df['indices'] =  window_df['relevancy_category'].apply(lambda x: get_indices_relevant(x, ['LongTermSchoolClosureOrReassignment']))
    print('completed making indices column')

    window_df['kept_sections'] = window_df.apply(lambda row: get_sections_relevant_from_indices(indx_list=row['indices'], sections_list=row['kw_text_chunks']), axis=1)
    print('completed making kept_sections column')

    end = time.time()

    total_time = end - start
    print(f'total time: {total_time} seconds')

    return window_df

deepseek_result_df = run_deepseek_filtering(window_df = kw_chunks_df)
deepseek_result_df.to_csv(DEEPSEEK_RESULTS)
deepseek_result_df

start time: 1743377627.2288058
rows in this df:  3293
Number of sections for this transcript:  12
Number of sections for this transcript:  3
Number of sections for this transcript:  12
Number of sections for this transcript:  9
Number of sections for this transcript:  1
Number of sections for this transcript:  3
Number of sections for this transcript:  3
Number of sections for this transcript:  8
Number of sections for this transcript:  3
Number of sections for this transcript:  1
Number of sections for this transcript:  3
Number of sections for this transcript:  1
Number of sections for this transcript:  1
Number of sections for this transcript:  1
Number of sections for this transcript:  1
Number of sections for this transcript:  1
Number of sections for this transcript:  4
Number of sections for this transcript:  2
Number of sections for this transcript:  1
Number of sections for this transcript:  2
Number of sections for this transcript:  4
Number of sections for this transcript:  