In [None]:
# Step 1: Prepares and cleans annotated section data 
# Step 2: Aggregates problem solution data by meeting and district
# Step 3: Defines prompt for and builds problem data dictionary with GPT4.1
# Step 4: Tags problem texts with data dictionary

In [10]:
# libraries
import json
import os
from collections import defaultdict

import pandas as pd
from openai import OpenAI


#-------------------------------------------------------------

# vars, llm client set up, and loading the data

# Read the API key from the OPENAI_API_KEY environment variable so the secret is never
# pasted into (and saved with) the notebook; falls back to the original empty placeholder.
KEY = os.environ.get("OPENAI_API_KEY", "")
client = OpenAI(api_key=KEY)
DEEPSEEK_RESULTS = "deepseek_results_24_25_only.csv" # the deepseek CSV relevancy results with dates
GPT_JSON_OUTPUT = "PROBLEM_SOLUTION_GPT41_JSON_RESULTS/GPT4o_output_deepseek_results_24_25_only.csv.json" # the json output file from step1_problem_solution_gpt_annotation.ipynb
ALL_PROBLEMS_JSON = 'district_problems.json' # the json file path to save raw list of all district problems - used to get top n problems

# explicit utf-8 so the read does not depend on the platform default encoding
# (the problem dictionary JSON is also read back as utf-8 later in this notebook)
with open(GPT_JSON_OUTPUT, "r", encoding="utf-8") as f:
    data = json.load(f)

topic = 'school closures' # the topic to be mined; interpolated into both LLM prompts


#-------------------------------------------------------------
# clean the data and prep for meeting and district aggregation

# flatten the per-section lists into one list of annotations, skipping sections
# which didn't contain problem solution information - aka empty lists
relevant_data = [item for sublist in data if sublist for item in sublist]
print('\nnumber of relevant transcript sections to topic\n',len(relevant_data))
relevant_df = pd.DataFrame(relevant_data)

#-------------------------------------------------------------
# save video metadata info from district view & connect w/ llm annotation
metadata_df = pd.read_csv(DEEPSEEK_RESULTS)
# left join keeps every annotated section even when its video_id has no metadata row
df = pd.merge(relevant_df,metadata_df, on = 'video_id', how = 'left')
# drop leftover index columns (named 'Unnamed: ...') that came along with the CSV
df = df.drop(columns=[col for col in df.columns if 'Unnamed' in col])

print('\nsection data after merged with video metadata:\n')
display(df)


#-------------------------------------------------------------

# AGGREGATION FUNCTIONS 


# aggregate by video aka by meeting
def merge_dictionaries(dicts_lst):
    '''
    Params: 
    - dicts_lst (list): list of dictionaries
    
    Returns: dictionary whose values are lists
    
    Description: 
    - folds every dictionary in the list into one dictionary keyed by the union of their keys
    - list values are concatenated onto the key's list, any other value is appended as one element
    - keys keep the order in which they are first seen
    - used in consolidate_group to aggregate by key
    '''

    combined = {}

    for single_dict in dicts_lst:
        for key, value in single_dict.items():
            # create the key's bucket on first sight, then grow it
            bucket = combined.setdefault(key, [])

            if isinstance(value, list):
                bucket.extend(value)
            else:
                bucket.append(value)

    return combined



def consolidate_group(group_df, keep_cols = ["video_id","leaid","problem_solution_dictionary"]):
    '''
    Params: 
    - group_df (pandas dataframe): dataframe of single group from pandas groupby
    - keep_cols: list of columns to keep in the returned dataframe,
    defaulted to["video_id","leaid","problem_solution_dictionary"]
    
    Returns: pandas df
    
    Description: 
    - collapses the rows of one group into a single row df
    - a one-row group is returned as is (restricted to keep_cols)
    - otherwise the first row is kept and its problem_solution_dictionary is
    replaced by the merge of every row's dictionary (see merge_dictionaries)
    '''

    trimmed = group_df[keep_cols]

    # a lone row has nothing to merge with
    if len(trimmed) == 1:
        return trimmed

    # first row stands in for the whole group; only its dictionary cell changes
    merged_dict = merge_dictionaries(trimmed['problem_solution_dictionary'].tolist())
    representative = trimmed.iloc[[0]].copy()
    representative.at[representative.index[0], 'problem_solution_dictionary'] = merged_dict

    return representative


def aggregate_by_col(df,col = "video_id", keep_cols = ["video_id","leaid","problem_solution_dictionary"]):
    '''
    Params: 
    - df (pandas dataframe): pandas dataframe to group
    - col (str): column name to group by, defaulted to "video_id"
    - keep_cols (list of str): list of column to keep in the returned dataframe,
    defaulted to["video_id","leaid","problem_solution_dictionary"]
    
    Returns: pandas dataframe
    
    Description: 
    - groups by specified col & returns pandas df where rows represent consolidated grouped data
    - each group is collapsed to one row by consolidate_group
    - when there are no groups at all (e.g. df is empty) an empty dataframe
    with the keep_cols columns is returned instead of raising
    '''

    consolidated_rows = [
        consolidate_group(group, keep_cols = keep_cols)
        for _, group in df.groupby(col)
    ]

    # pd.concat raises ValueError on an empty list, so hand back an empty frame
    # that still carries the expected columns
    if not consolidated_rows:
        return df[keep_cols].iloc[0:0].reset_index(drop=True)

    # fresh 0..n-1 index so downstream positional/label lookups agree
    return pd.concat(consolidated_rows).reset_index(drop=True)



# PROBLEM ANNOTATION FUNCTIONS 

def get_problem_solution_topics(n = 5,model="gpt-4.1", problems_list = None): # model="gpt-4o"
    '''
    Params:
    - n (int): Num of problems to find most represented, defaulted to 5
    - model (gpt model): defaulted to gpt-4.1
    - problems_list (list of str): list of all district problems,
    defaulted to None which is treated as an empty list

    Returns: None

    Description:
    - defines prompt to extract top n most represented topics from the given list
    - feeds prompt with appended problem list to model specified
    - makes and saves json with problem topics and descriptions to ALL_PROBLEMS_JSON
    - if the model reply cannot be parsed as JSON, a dictionary with a single
    "error" key holding the raw reply is saved instead
    - relies on the notebook level client, topic and ALL_PROBLEMS_JSON variables
    '''

    # None stands in for the old mutable [] default; an empty list renders the same in the prompt
    if problems_list is None:
        problems_list = []

    # make prompt to extract top n problems from the given district list

    prompt = f"""Below is a list of problems identified from school board meetings across different districts related to {topic}.
The problems may be phrased differently, but could refer to the same underlying subject matter. 
Your task is to group such similar problems together, identify the top {n} most discussed problem themes, and to 
return a dictionary where keys are taglines summarizing the top {n} themes with their associated values being descriptive but concise summaries of each respective problem topic. 
The dictionary will be used in the future for categorization and tagging of district problem statements, therefore it is important to focus on semantic and topic similarity rather than exact wording. 
IMPORTANT: (1) Do not include any text, explanations, or preambles outside the comma separated problem topic output. 
(2) Only use double quotes around keys and values of the dictionary. (3) Do not wrap your response in ```json or ``` markdown code blocks.

###PROBLEM LIST:
{problems_list}

###OUTPUT FORMAT: {{"string": "string", "string": "string", ... }}
"""

    response = client.chat.completions.create(
        model = model,
        messages=[
            {"role": "user",
             "content": prompt}
        ])

    response_content = response.choices[0].message.content

    try:
        result_json = json.loads(response_content)

    except (json.JSONDecodeError, TypeError):
        # TypeError: json.loads rejects None, which is what a reply without text content
        # would give (the API types message content as optional)
        print("Failed to parse JSON:", response_content)
        result_json = {"error": f"Invalid JSON response - RAW RESPONSE: **{response_content}**"}

    # save gpt4 output (utf-8 to match how the notebook reads the file back)
    with open(ALL_PROBLEMS_JSON, "w", encoding="utf-8") as file:
        json.dump(result_json, file, indent=4)


    # show in terminal
    print(json.dumps(result_json, indent=2))




def get_problem_tag(prob_str, model="gpt-4.1", ps_dictionary = {}):
    '''
    Params:
    - prob_str (str): text string about district problem to be classified
    - model (gpt model): defaulted to gpt-4.1
    - ps_dictionary (dict): dictionary of problem categories and descriptions

    Returns: the text content of the model reply, unmodified

    Description:
    - builds a prompt asking the model to pick the single best category from
    ps_dictionary for the given problem string
    - the prompt tells the model to answer "other" for problems that do not
    clearly match any category in the data dictionary
    - sends the prompt as one user message to the model specified and
    returns the category string it replies with
    - relies on the notebook level client and topic variables
    '''

    tagging_instructions = f"""
You are given a data dictionary related to {topic} concerns that defines a set of problem categories, each with a corresponding description. 
Your task is to assign the INPUT string describing a problem to the most appropriate category based on these descriptions.

### Data Dictionary (categories and descriptions):
{ps_dictionary}

### Instructions:
1. Carefully read the descriptions of all categories.
2. Analyze the input string's content to determine the underlying meaning or intent.  
3. Match the string to the single best-fitting category based on the descriptions.  
4. If the input does not clearly match any category, return "other".
5. Return only the matched key from the data dictionary or "other" if no match is appropriate.
6. Do not include any text, explanations, or preambles outside the category string. 

### INPUT:
"""

    # the problem text follows the instruction block, separated by a newline
    full_prompt = "{}\n{}".format(tagging_instructions, prob_str)

    user_message = {"role": "user", "content": full_prompt}
    completion = client.chat.completions.create(
        model = model,
        messages=[user_message],
    )

    return completion.choices[0].message.content




#-------------------------------------------------------------
# Get meeting aggregated dataframe

meeting_df = aggregate_by_col(df, col = "video_id") # one row per meeting (video)
print('\nview meeting dataframe:\n')
display(meeting_df) # eyeball the aggregated frame

# peek at the merged dictionary of the first meeting
print(
    '\ncheck meeting aggregated dataframe list by looking at first updated problem solution dictionary\n',
    meeting_df.iloc[[0]]['problem_solution_dictionary'][0],
)


#-------------------------------------------------------------
# Get district aggregated dataframe

# second pass: collapse the meeting rows into one row per district (leaid)
district_df = aggregate_by_col(
    meeting_df,
    col = "leaid",
    keep_cols = ["leaid","problem_solution_dictionary"],
)
print('\nview district dataframe:\n')
display(district_df) # eyeball the aggregated frame

# peek at the merged dictionary of the first district
print(
    '\ncheck district aggregated dataframe list by looking at first updated problem solution dictionary\n',
    district_df.iloc[[0]]['problem_solution_dictionary'][0],
)



#-------------------------------------------------------------
# Get raw list of all problems across all district
# --> used to find top n problems represented across all districts 
#     for problem matrices

prob_solutions_dict_lst = district_df['problem_solution_dictionary'].tolist()

# the dictionary keys are the problem statements; the values (solutions) are not needed here
PROBLEMS_LIST = [
    problem
    for district_dict in prob_solutions_dict_lst
    for problem in district_dict.keys()
]

print('check problems list by looking at first 5\n',
      PROBLEMS_LIST[:5])



#-------------------------------------------------------------
# Use chatgpt to get the top n problems most represented 
# by the complete list of district problems

top_n=6
# writes the top 6 most relevant problems to ALL_PROBLEMS_JSON
get_problem_solution_topics(n=top_n, problems_list=PROBLEMS_LIST)


# read the saved problem data dictionary back in and check it
with open(ALL_PROBLEMS_JSON, "r", encoding="utf-8") as f:
    PS_DICTIONARY = json.loads(f.read())

print(f'\nTop {top_n} problems:\n',PS_DICTIONARY)


# test problem tagging using the problem solution dictionary just created
district_prob = 'Inability to effectively communicate with parents due to outdated contact information'

get_problem_tag(prob_str = district_prob, ps_dictionary = PS_DICTIONARY)

#-------------------------------------------------------------




number of relevant transcript sections to topic
 310

section data after merged with video metadata:



Unnamed: 0,video_id,problem_solution_dictionary,centroid_lat,centroid_lon,leaid,place_name,state_name,caption_sentences,caption_sentence_topics,initial_keywords,kw_text_chunks,relevancy_category,indices,kept_sections,meeting_date
0,jV5xKMXARjQ,"{'Displacement and instability for students, e...",34.195757,-118.095851,629940,Pasadena Unified sb,California,"['La N N N N here.', 'Okay we are recessing to...","[-1, -1, 64, -1, -1, -1, -1, -1, 2, 2, 63, 9, ...",['merge'],"[""Many of our students, educators and staff ar...","['LongTermSchoolClosureOrReassignment', 'NoneO...",[0],"[""Many of our students, educators and staff ar...",2025-03-13
1,megnv35esXQ,{'Loss of RTI (Response to Intervention) and w...,34.195757,-118.095851,629940,Pasadena Unified sb,California,"[""We're going to start the meeting right now.""...","[27, 27, 73, 27, -1, -1, -1, 27, 27, 27, 27, -...","['closure', 'closing', 'merging', 'merge']","[""Public Employee Release Government Code 5495...","['NoneOfTheAbove', 'NoneOfTheAbove', 'LongTerm...",[2],"[""They wouldn't have their trust yet. And that...",2025-02-27
2,yUsla-dsyt0,{'Potential closure of the outdoor school prog...,39.562855,-77.022521,2400210,Carroll County Public Schools sb,Maryland,['Budget to the Board of Education at our meet...,"[-1, -1, -1, -1, -1, -1, 65, -1, -1, -1, -1, 9...","['closing', 'merging']","[""my oldest child attended outdoor school and ...","['NoneOfTheAbove', 'NoneOfTheAbove', 'LongTerm...",[2],"[""Please don't take the easy way out and kill ...",2025-03-26
3,WWEo6dW1Kqs,{'Difficulty contacting school staff directly ...,34.988357,-80.530740,3704620,Union County Public Schools sb,North Carolina,"['Good Morning!', 'It is February 26, 8:01 Am....","[-1, -1, 73, -1, 70, -1, -1, -1, 49, 49, 49, 4...",['closure'],"[""Um, you can just pick up the phone and call ...",['LongTermSchoolClosureOrReassignment'],[0],"[""Um, you can just pick up the phone and call ...",2025-02-26
4,nOWzl8DJdWQ,{'Repeated failure to secure funding for new s...,35.274862,-83.134278,3702340,Jackson County Public Schools sb,North Carolina,"['E E E E E E E E E E E E E E E E E E Sorry.',...","[64, 64, -1, -1, 70, 70, 70, -1, -1, -1, -1, -...","['closure', 'merging', 'merge', 'consolidate']","[""When the state says thou shalt pay everybody...","['NoneOfTheAbove', 'NoneOfTheAbove', 'LongTerm...",[2],"[""Um, this is the fourth cycle we have applied...",2024-08-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,iYGrWwvYods,{'Summit Elementary projected to be over-enrol...,38.037491,-78.485500,5100780,Charlottesville Cty Public Schools sb,Virginia,['You see us as standing around copers bemoani...,"[-1, -1, -1, 25, -1, -1, -1, -1, -1, -1, 27, -...","['closure', 'merging', 'utilization']","[""We will have another opportunity at the conc...","['NoneOfTheAbove', 'LongTermSchoolClosureOrRea...","[1, 3, 9]","[""All right? Well, thanks for having me tonigh...",2025-03-06
306,iYGrWwvYods,{'Difficulty achieving balanced student utiliz...,38.037491,-78.485500,5100780,Charlottesville Cty Public Schools sb,Virginia,['You see us as standing around copers bemoani...,"[-1, -1, -1, 25, -1, -1, -1, -1, -1, -1, 27, -...","['closure', 'merging', 'utilization']","[""We will have another opportunity at the conc...","['NoneOfTheAbove', 'LongTermSchoolClosureOrRea...","[1, 3, 9]","[""All right? Well, thanks for having me tonigh...",2025-03-06
307,GDkHaRWby9k,{'Lack of effective notification and communica...,29.540665,-98.346056,4824990,Judson Isd sb,Texas,"[""E E E E E E E E E E E E E E E E E E E E E He...","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...","['closure', 'closing', 'merge']","[""Statements and questions from the audience w...","['LongTermSchoolClosureOrReassignment', 'NoneO...",[0],"[""Statements and questions from the audience w...",2025-02-20
308,9_HN8AwFqrk,{'Lack of transparency and clarity in the stat...,40.800128,-72.797925,3606840,Center Moriches Union Free School District sb,New York,"[""Hello everybody Welcome to tonight's meeting...","[-1, -1, -1, -1, -1, -1, -1, -1, 84, 20, 20, -...","['closing', 'merge']","[""Hello everybody Welcome to tonight's meeting...","['LongTermSchoolClosureOrReassignment', 'NoneO...",[0],"[""Hello everybody Welcome to tonight's meeting...",2024-11-20



view meeting dataframe:



Unnamed: 0,video_id,leaid,problem_solution_dictionary
0,-uQwG7aDRn8,5103240,{'School consolidation cannot be implemented i...
1,019bEg34Ocs,612120,{'Difficulty retaining quality coaches for ath...
2,028ZucUjcFE,636800,{'Closure of James Monroe Middle School leadin...
3,0SwFGXnr4sg,3603840,{'Adding additional school closure holidays wi...
4,0mil-QvH3aM,3701800,{'Long bus rides and increased transportation ...
...,...,...,...
194,y_xtrqFio4I,691024,{'Confusion and concern regarding the classifi...
195,ysIJwsO9UB4,3411880,{'Sixth grade students facing disruption by be...
196,zCi50vxmH2E,900450,{'Community perception that school closures me...
197,zUrnjR2K-U4,633840,{'Low enrollment in the Parent Participation P...



check meeting aggregated dataframe list by looking at first updated problem solution dictionary
 {'School consolidation cannot be implemented in time to achieve savings for the next budget year due to required extensive community engagement and planning': ['Conduct extensive community engagement as part of the planning process before considering consolidation for future budget cycles', 'Gather and provide data on population and vacancies to inform potential consolidation discussions'], 'Overpopulation and rapid development are straining capacity in South side Richmond schools, leading to heavy reliance on temporary trailers': ['Consider building a new school and utilizing district-owned properties to address overpopulation', 'Keep alternative site discussions active, such as those related to Ruy Road, for future planning'], 'Uncertainty about the effectiveness of current partnerships and allocation of additional funds (e.g., Pen Foster with Richmond Success Academy)': ['Delay or remov

Unnamed: 0,leaid,problem_solution_dictionary
0,100240,{'Declining school enrollment making it financ...
1,102820,{'Aging school facilities built as early as th...
2,200180,{'Current school buildings are undersized and ...
3,401460,{'Lack of public awareness about programs and ...
4,403040,{'Need to reduce staffing due to budget constr...
...,...,...
129,5103840,{'Loss of in-person instructional time for stu...
130,5303750,{'Overcrowding in existing high schools due to...
131,5307770,{'Rising complexity and cost of special educat...
132,5400330,{'Unsafe and hostile school environment due to...



check district aggregated dataframe list by looking at first updated problem solution dictionary
 {'Declining school enrollment making it financially unsustainable to keep the school open': ['Hold a community meeting to explain situation and potential outcomes, including transportation logistics', "Decide on the school's future by February to provide clarity for planning", 'Collect accurate data from families on their intentions regarding staying or leaving the school'], 'Uncertainty about school closure leading to staff anxiety and potential resignations': ['Commit to a clear decision timeline (by February) to reduce uncertainty for staff and allow for planning', 'Communicate regularly with staff about the status and process to keep them informed'], 'Difficulties reaching families to gather input or update contact information': ['Immediately attempt to contact families whose information is outdated or unreachable to gather necessary feedback', 'Send out letters and other communicatio

'Community Trust, Engagement, and Communication Gaps'

In [11]:
# now that we have problem data dictionary, problems and corresponding solutions dict for each district, 
# need to tag each of the keys (the problem phrases) in each of the problem solution dicts of each district

# --------------
# do case example with first 5 rows aka 5 districts
district_df_test = district_df.head(5)

district_lst = district_df_test['leaid'].to_list()
district_prob_dicts_lst = district_df_test['problem_solution_dictionary'].to_list()


# Collect one record per (district, problem) pair and build the frame once at the end,
# rather than enlarging a DataFrame one row at a time with .loc[len(df)].
tagged_rows = []

for leaid, district_dict in zip(district_lst,district_prob_dicts_lst):

    # each key is a problem phrase to tag; its value is that problem's solutions list
    for this_prob_str, solutions_list in district_dict.items():
        tag = get_problem_tag(prob_str = this_prob_str, ps_dictionary = PS_DICTIONARY)
        tagged_rows.append({'leaid': leaid, 'problem_tag': tag, 'solutions_list': solutions_list})

# explicit columns keep the header even when no problems were tagged
tagged_district_df = pd.DataFrame(tagged_rows, columns=['leaid','problem_tag','solutions_list'])

display(tagged_district_df)

# check tagging of unrelated strings
print(get_problem_tag(prob_str = 'children having arguments over bring your pet to school day', ps_dictionary = PS_DICTIONARY))

# TO DO: map leaid info back to metadata
    
    


Unnamed: 0,leaid,problem_tag,solutions_list
0,100240,Declining Enrollment and Financial Pressures,[Hold a community meeting to explain situation...
1,100240,Operational and Logistical Challenges,[Commit to a clear decision timeline (by Febru...
2,100240,"Community Trust, Engagement, and Communication...",[Immediately attempt to contact families whose...
3,100240,"Community Trust, Engagement, and Communication...",[Host community meetings to hear and address p...
4,100240,Operational and Logistical Challenges,[Collect preference data from families on desi...
...,...,...,...
99,200180,Facilities and Infrastructure Issues,[Encourage partnerships and shared use of scho...
100,401460,"Community Trust, Engagement, and Communication...",[Increase public relations efforts to inform t...
101,401460,Operational and Logistical Challenges,[Participate in external studies (such as thos...
102,403040,Declining Enrollment and Financial Pressures,[Offer staff whose positions are being elimina...


other


In [None]:
# Save the 5-district tagging example to CSV (index=False leaves out the row index).
# NOTE(review): tagged_district_df is reassigned by the later 50-district cell, so this
# cell must run before that one or the file will hold the 50-district rows instead.
tagged_district_df.to_csv('prob_solution_5_district_example.csv', index=False)    


In [13]:

# --------------
# do larger case example with first 50 districts
district_df_test = district_df.head(50)

district_lst = district_df_test['leaid'].to_list()
district_prob_dicts_lst = district_df_test['problem_solution_dictionary'].to_list()


# Collect one record per (district, problem) pair and build the frame once at the end,
# rather than enlarging a DataFrame one row at a time with .loc[len(df)].
tagged_rows = []

for leaid, district_dict in zip(district_lst,district_prob_dicts_lst):

    # each key is a problem phrase to tag; its value is that problem's solutions list
    for this_prob_str, solutions_list in district_dict.items():
        tag = get_problem_tag(prob_str = this_prob_str, ps_dictionary = PS_DICTIONARY)
        tagged_rows.append({'leaid': leaid, 'problem_tag': tag, 'solutions_list': solutions_list})

# explicit columns keep the header even when no problems were tagged
tagged_district_df = pd.DataFrame(tagged_rows, columns=['leaid','problem_tag','solutions_list'])

display(tagged_district_df)

# check tagging of unrelated strings
print(get_problem_tag(prob_str = 'children having arguments over bring your pet to school day', ps_dictionary = PS_DICTIONARY))

# TO DO: map leaid info back to metadata
    
    


Unnamed: 0,leaid,problem_tag,solutions_list
0,100240,Declining Enrollment and Financial Pressures,[Hold a community meeting to explain situation...
1,100240,Operational and Logistical Challenges,[Commit to a clear decision timeline (by Febru...
2,100240,"Community Trust, Engagement, and Communication...",[Immediately attempt to contact families whose...
3,100240,"Community Trust, Engagement, and Communication...",[Host community meetings to hear and address p...
4,100240,Operational and Logistical Challenges,[Collect preference data from families on desi...
...,...,...,...
471,1601800,"Community Trust, Engagement, and Communication...",[Develop and communicate clear financial and e...
472,1601800,Loss of School and Community Identity,[Conduct up-to-date community surveys to asses...
473,1601800,Declining Enrollment and Financial Pressures,[Evaluate fiscal responsibility by reviewing s...
474,1601800,Operational and Logistical Challenges,"[Develop contingency plans for facility use, i..."


other


In [14]:
# Save the 50-district tagging example to CSV (index=False leaves out the row index).
tagged_district_df.to_csv('prob_solution_50_district_example.csv', index=False)    
