## Variables and Imports

In [2]:
import os
import pandas as pd
import streamlit as st


# --- Credentials --------------------------------------------------------------
# Secrets are read from Streamlit secrets / the process environment; none are
# stored in the notebook itself.
OPENAI_API_KEY = st.secrets.get("OPENAI_API_KEY")
if OPENAI_API_KEY:
    # Nothing else in this notebook passes OPENAI_API_KEY along explicitly, so
    # expose it through the environment (presumably where the OpenAI client
    # looks when no key is passed - confirm against the LangChain docs).
    # setdefault keeps a key that was already exported in the shell.
    os.environ.setdefault("OPENAI_API_KEY", OPENAI_API_KEY)

# --- LangSmith tracing --------------------------------------------------------
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# SECURITY: the LangSmith key used to be hardcoded on this line. It is now taken
# from the environment or from Streamlit secrets; the key that was committed
# earlier should be revoked and re-issued.
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY") or st.secrets.get("LANGCHAIN_API_KEY")
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # Without a key, tracing is switched off instead of being left half-configured.
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not configured; LangSmith tracing is disabled.")
# os.environ ["LANGCHAIN_PROJECT"] = "My Project Name" # Optional: "default" is used if not set

# --- Paths and identifiers ----------------------------------------------------
DATA_PATH = "data"                      # drop folder for newly received files
TRANSCRIPT_PATH = "data/transcripts"    # transcripts are moved here once imported
ANNOTATIONS_PATH = "data/annotations"   # generated snapshot files are written here
ASSISTANT_ID = "asst_XfSqQlAPl7opg5fsMCjE9lfL"
DATA_FILES = os.listdir(DATA_PATH)      # listing of DATA_PATH taken when this cell runs



### Import new transcripts from data directory. 


In [10]:
from langchain_community.document_loaders import Docx2txtLoader

new_transcripts = []

def load_transcripts_from_folder(folder_path):
    """Read the text of every ``.docx`` file found directly in ``folder_path``.

    Entries whose name does not end in ``.docx`` are ignored. Each matching
    file is opened with ``Docx2txtLoader`` and the ``page_content`` of the
    first document it loads is used as the transcript text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"file": <file name>, "transcript": <text>}`` dicts, in the
        order ``os.listdir`` reported the files.
    """
    docx_names = [name for name in os.listdir(folder_path) if name.endswith('.docx')]

    records = []
    for name in docx_names:
        loader = Docx2txtLoader(os.path.join(folder_path, name))
        first_document = loader.load()[0]
        records.append({"file": name, "transcript": first_document.page_content})
    return records

# Build a DataFrame (columns: file, transcript) from the .docx files that are
# currently sitting in the drop folder.
new_transcripts_df = pd.DataFrame(load_transcripts_from_folder(DATA_PATH), columns=["file", "transcript"])

# Move the imported files into the transcripts folder so they are not picked up
# as "new" again on the next run. The folder is created first because
# os.rename fails when the destination directory does not exist.
os.makedirs(TRANSCRIPT_PATH, exist_ok=True)
for file in new_transcripts_df['file']:
    os.rename(os.path.join(DATA_PATH, file), os.path.join(TRANSCRIPT_PATH, file))

# Preview goes last: only the final expression of a cell is displayed.
new_transcripts_df.head(5)



### Merge all transcripts with survey data and create a dataframe with all data.

In [14]:

import re
# Survey export that merge_transcripts_with_surveys looks respondents up in.
survey_data = pd.read_csv(filepath_or_buffer="data/survey_data.csv")

# Output column -> survey_data column whose value is copied into it when a
# respondent is matched.
# NOTE(review): 'Years in Eduacation' is misspelled; the label is left as-is
# because it ends up as a CSV header that other readers may already depend on.
SURVEY_COLUMN_MAP = {
    'ResponseId': 'ResponseId',
    'FirstName': 'RecipientFirstName',
    'LastName': 'RecipientLastName',
    'Email': 'RecipientEmail',
    'AgeGroup': 'Q15',
    'InstitutionName': 'InstitutionName',
    'District': 'ParentName',
    'City': 'MailingCity',
    'State': 'MailingState',
    'Role': 'Q2',
    'Subjects': 'Q3',
    'Courses': 'Q4',
    'TopOfMind': 'Q5',
    'Carolina Familiarity': 'Q6_1',
    'Fisher Familiarity': 'Q6_2',
    'Flinn Scientific Familiarity': 'Q6_3',
    'PLTW Familiarity': 'Q6_4',
    'Sargent Welch Familiarity': 'Q6_5',
    'Thomas Scientific Familiarity': 'Q6_6',
    'Wards/VWR Familiarity': 'Q6_7',
    'BioRad Familiarity': 'Q6_8',
    'BioCorp Familiarity': 'Q6_9',
    'Amazon Familiarity': 'Q6_10',
    'Nasco Familiarity': 'Q6_11',
    'Frey/School Specialty Familiarity': 'Q6_12',
    'Primary Vendor': 'Q7',
    'Top Vendor Qualities': 'Q8',
    'Years in Eduacation': 'Q14',
}


def merge_transcripts_with_surveys(transcripts_df, survey_data):
    """Attach survey answers to each transcript by matching the file name to a respondent.

    The first two name-sized tokens (3+ characters) of the file name, with the
    extension removed, are taken as first and last name. A survey row matches
    when ``RecipientFirstName`` contains the first token and
    ``RecipientLastName`` contains the last token (case-insensitive, literal
    substring). If several rows match, the first one is used.

    Args:
        transcripts_df: DataFrame with a ``file`` column; it is not modified.
        survey_data: DataFrame holding the survey columns listed in
            ``SURVEY_COLUMN_MAP`` (as values).

    Returns:
        A copy of ``transcripts_df`` with the ``SURVEY_COLUMN_MAP`` keys filled
        in for matched rows. Rows that cannot be matched are reported with
        ``print`` and left without survey values.
    """
    dataframe = transcripts_df.copy()
    for index, row in transcripts_df.iterrows():
        file_name = row['file']

        # Drop the extension first so "docx" is never mistaken for a name.
        stem = os.path.splitext(file_name)[0]

        # Tokens are runs of letters/digits, hyphens and apostrophes, so
        # whitespace, underscores and other punctuation act as separators
        # while names such as "Anne-Marie" or "O'Neil" stay in one piece.
        # ([^\W_] is "\w without the underscore".)
        keywords = [
            keyword
            for keyword in re.findall(r"(?:[^\W_]|['-])+", stem)
            if len(keyword) >= 3
        ]
        if len(keywords) < 2:
            print(f"Could not extract a first and last name from file name {file_name!r}")
            continue
        first_name, last_name = keywords[:2]

        # regex=False: the tokens are plain text, not patterns.
        # na=False: a survey row with an empty name cell counts as "no match"
        # instead of putting NaN into the boolean mask.
        first_matches = survey_data['RecipientFirstName'].str.contains(
            first_name, case=False, regex=False, na=False
        )
        last_matches = survey_data['RecipientLastName'].str.contains(
            last_name, case=False, regex=False, na=False
        )
        filtered_data = survey_data[first_matches & last_matches]

        if filtered_data.empty:
            print(f"Could not find survey data for {first_name} {last_name}")
            continue

        # Copy the first matching respondent's answers onto this transcript row.
        for target_column, survey_column in SURVEY_COLUMN_MAP.items():
            dataframe.loc[index, target_column] = filtered_data[survey_column].values[0]
    return dataframe

# Attach survey answers to the transcripts that were just imported.
new_transcripts_merged_df = merge_transcripts_with_surveys(
    transcripts_df=new_transcripts_df,
    survey_data=survey_data,
)

# Persist the merged rows (index column omitted).
new_transcripts_merged_df.to_csv(path_or_buf="data/new_transcripts.csv", index=False)


In [15]:
# Re-read the survey export so this cell works without the previous one.
survey_data = pd.read_csv(filepath_or_buffer="data/survey_data.csv")

# Every transcript already filed under TRANSCRIPT_PATH, as a two-column frame.
transcripts = load_transcripts_from_folder(folder_path=TRANSCRIPT_PATH)
transcripts_df = pd.DataFrame(data=transcripts, columns=["file", "transcript"])

# Attach the survey answers to each transcript.
all_transcripts = merge_transcripts_with_surveys(
    transcripts_df=transcripts_df,
    survey_data=survey_data,
)

# Write the combined table to disk (index column omitted).
all_transcripts.to_csv(path_or_buf="data/merged_transcripts.csv", index=False)


## Process new interview snapshots

Set up chain

In [6]:
from langchain.chains.openai_functions import (
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
# System prompt for the snapshot chain. It spells out the markdown layout the
# model is asked to return; square-bracketed parts are placeholders for the
# model to fill in.
# NOTE(review): this string is passed to ChatPromptTemplate below, which
# presumably applies the same {name} substitution the human message relies on -
# confirm before adding literal curly braces to the text.
instructions = """As the Interview Snapshot Compiler, my role is to assist Carolina Biological Supply Company in their market research by creating interview snapshots from primary sources. I output markdown in the following format: 
## [Interviewee Name], [Institution Name], [City, State]

Category: [Carolina loyalist | Flinn loyalist | Carolina + Flinn | Other]  
Generation: [Generation name and year range]

### Quick Facts
- **Position:** [Position]
- **Teaching Areas:** [Teaching Areas]
- **Background:** [Background]
- **School Type:** [School Type]
- **Purchasing Role:** [Purchasing Role]
- **Unique Fact:** [Unique Fact]

### Memorable Quote
- "[Memorable quote]" [timestamp] ([brief context])
- "[Memorable quote]" [timestamp] ([brief context])
- "[Memorable quote]" [timestamp] ([brief context])

### Buyer’s Journey
- [Brief notes from each step of this buyer's journey such as: Identification of Needs, Research and Consideration, Decision-Making, Vendor Selection, Post-Purchase Evaluation]

### Insights
- [Insight from interview]

### Opportunities
- [Opportunity/need identified]


### Video, Transcript & Survey Responses
- [to be added later]
"""

# Two-message prompt: the instructions above as the system message, followed by
# a human message with {email} and {text} slots that are filled in when the
# chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
      ("system", instructions),
      ("human", "Process transcript for email: {email} transcript:\n {text}"),
    ]
  )

# Chat model used for every snapshot; the temperature is pinned to 0.
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

# Compose prompt and model; process_snapshot calls chain.invoke() with the
# "email" and "text" keys and reads `.content` from the result.
chain = prompt | llm


### Create a list of transcripts to process, skipping any with csv and md files already created.

In [16]:
import os

# Queue of (index, row) tuples that still need a snapshot.
# Rows come from new_transcripts_merged_df: that is the frame carrying the
# 'Email' column added by merge_transcripts_with_surveys, whereas
# new_transcripts_df only has 'file' and 'transcript'.
rows_to_process = []
for index, row in new_transcripts_merged_df.iterrows():
    participant_email = row.get('Email')

    # A transcript with no matched survey row has no e-mail to name its
    # output files after, so it cannot be processed.
    if pd.isna(participant_email):
        print(f"Skipping {row['file']}: no survey e-mail was matched")
        continue

    # A participant counts as already processed only when BOTH files exist.
    # NOTE(review): nothing in this notebook writes the .csv file - confirm
    # where it is produced.
    md_file_path = f"{ANNOTATIONS_PATH}/{participant_email}.md"
    csv_file_path = f"{ANNOTATIONS_PATH}/{participant_email}.csv"
    if os.path.exists(csv_file_path) and os.path.exists(md_file_path):
        continue

    rows_to_process.append((index, row))
        

def process_snapshot(row):
    """Generate the interview snapshot for one transcript and save it as markdown.

    Args:
        row: An ``(index, Series)`` tuple as produced by
            ``DataFrame.iterrows()``; the Series must provide ``'transcript'``
            and ``'Email'``.

    Returns:
        The markdown text returned by the chain (``result.content``).

    Side effects:
        Prints a progress line and writes ``<ANNOTATIONS_PATH>/<email>.md``,
        replacing any existing file of that name.
    """
    record = row[1]
    transcript = record['transcript']
    participant_email = record['Email']
    print(f"Processing transcript for {participant_email}")

    # Run the chain
    result = chain.invoke({"email": participant_email, "text": transcript})
    markdown = result.content

    # Make sure the output folder exists before writing into it.
    os.makedirs(ANNOTATIONS_PATH, exist_ok=True)
    md_file_path = f"{ANNOTATIONS_PATH}/{participant_email}.md"
    # Explicit UTF-8: the model output can contain characters (curly quotes,
    # dashes, ...) that a platform-default encoding may not be able to write.
    with open(md_file_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return markdown
# Working queue for the cells below; it starts out as the very same list
# object as rows_to_process.
snapshots_to_process = rows_to_process
# Show which participants are queued.
emails_to_process = [series['Email'] for _, series in rows_to_process]
emails_to_process

['sean.taylor@sbcusd.k12.ca.us',
 'pqmcgee@madison.k12.wi.us',
 'markniebojeski@paps.net']

### Process the interview snapshots and output md files for each snapshot.

Run this to process snapshots one at a time

In [17]:
# Process the first queued snapshot and take it off the queue.
# This cell is meant to be re-run, so an empty queue is reported instead of
# raising IndexError.
if snapshots_to_process:
    snapshot = process_snapshot(snapshots_to_process[0])
    # Dequeue only after a successful run so a failed transcript stays queued.
    snapshots_to_process = snapshots_to_process[1:]
else:
    snapshot = None
    print("No snapshots left to process")
snapshot

Processing transcript for sean.taylor@sbcusd.k12.ca.us


'## Sean Taylor, Sierra High School, San Bernardino, CA\n\nCategory: Other  \nGeneration: Millennial (1981-1996)\n\n### Quick Facts\n- **Position:** Biology Teacher and Department Chair\n- **Teaching Areas:** Biology, Coding, Forensic Science\n- **Background:** Teaching since 2009, focus on skill development, claim evidence, and reasoning\n- **School Type:** Continuation High School\n- **Purchasing Role:** Department Chair with autonomy in purchasing decisions\n- **Unique Fact:** School has a focus on mental and emotional health, and building relationships is a priority.\n\n### Memorable Quote\n- "Our kids are typically with us for six weeks to try to catch up on credits as opposed to the normal 18." [01:48] (Discussing the unique challenges of teaching at a continuation high school)\n- "I currently teach biology mostly, but I\'ve been doing a little bit of coding, a little bit of forensic science from here and there..." [01:48] (Describing his teaching areas)\n- "I want every kid to b

Run to process the remaining snapshots

In [18]:
# Process everything that is still queued. Each entry is removed once its
# snapshot has been written, so re-running this cell (or resuming after an
# error) does not send the same transcript to the model twice.
while snapshots_to_process:
    process_snapshot(snapshots_to_process[0])
    snapshots_to_process = snapshots_to_process[1:]

Processing transcript for pqmcgee@madison.k12.wi.us
Processing transcript for markniebojeski@paps.net


### Stitch together snapshot markdown files into a single file

In [4]:
# Stitch the individual snapshot markdown files into one document.

SNAPSHOTS_DIR = "data/snapshots/"

# Only .md files are stitched; other directory entries (sub-folders, OS
# metadata files, ...) are ignored rather than opened.
# Files are ordered by their name without the ".md" extension. The names are
# e-mail addresses, which contain dots themselves, so only the final extension
# is stripped for the sort key.
md_files = sorted(
    (name for name in os.listdir(SNAPSHOTS_DIR) if name.endswith(".md")),
    key=lambda name: os.path.splitext(name)[0],
)

# Read each snapshot. The encoding is explicit so the result does not depend
# on the platform default.
mds = []
for md_file in md_files:
    with open(os.path.join(SNAPSHOTS_DIR, md_file), "r", encoding="utf-8") as f:
        mds.append(f.read())

# Separate consecutive snapshots with one blank line.
md = "\n\n".join(mds)

# Write the combined document.
with open("data/snapshots.md", "w", encoding="utf-8") as f:
    f.write(md)


Process interview snapshots

In [None]:
# run manually to process the next transcript until all are processed
def process_next_transcript():
    """Process the transcript at the front of ``rows_to_process`` and dequeue it.

    Returns:
        The snapshot markdown returned by ``process_snapshot``, or ``None``
        when nothing is left to process.
    """
    # rows_to_process is a list, not an iterator, so it is consumed from the
    # front instead of being passed to next().
    if not rows_to_process:
        print("No transcripts left to process")
        return None
    markdown = process_snapshot(rows_to_process[0])
    # Dequeue only after a successful run so a failed transcript stays queued.
    rows_to_process.pop(0)
    return markdown

process_next_transcript()