In [256]:

from curses import meta
import os
import time
import json
import streamlit as st
import pandas as pd
import openai
from langchain.document_loaders import Docx2txtLoader

OPENAI_API_KEY = st.secrets.get("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "ls__b060afcb95f84771b911bb20e5955706"
# os.environ ["LANGCHAIN_PROJECT"] = "My Project Name" # Optional: "default" is used if not set

TRANSCRIPT_PATH = "data/transcripts"
ANNOTATIONS_PATH = "data/annotations"
ASSISTANT_ID = "asst_XfSqQlAPl7opg5fsMCjE9lfL"
transcript_files = os.listdir(TRANSCRIPT_PATH)



### Create a data file with all transcript filenames and raw text.

In [257]:
transcripts = []  # List to store file and transcript information

for file in transcript_files:
    if file.endswith('.docx'):  # Process only .docx files
        doc = Docx2txtLoader(os.path.join(TRANSCRIPT_PATH, file))
        transcript = doc.load()[0].page_content
        transcripts.append({"file": file, "transcript": transcript})

# Create DataFrame from the collected data
transcripts_df = pd.DataFrame(transcripts, columns=["file", "transcript"])
transcripts_df.head(5)

Unnamed: 0,file,transcript
0,Sean Taylor Catapult-X Educator Interview - De...,"This transcript was exported on Dec 18, 2023 -..."
1,Sarah Dorey Webster Catapult-X Educator Interv...,"This transcript was exported on Dec 18, 2023 -..."
2,Neil T. McGovern Catapult-X Educator Interview...,"This transcript was exported on Dec 18, 2023 -..."
3,Amanda Fuller CB Amazon loyalist_Millennial.docx,"This transcript was exported on Dec 18, 2023 -..."
4,Kelda Bailess Catapult-X Educator Interview - ...,"This transcript was exported on Dec 18, 2023 -..."


### Create a table with transcripts and key survey data

In [258]:

import re
survey_data = pd.read_csv("data/survey_data.csv")

for index, row in transcripts_df.iterrows():
    # Extract last name from file name
    file_name = row['file']
    transcript = row['transcript']
    
    # Separate keywords with whitespace characters, hyphens, or underscores
    keywords = re.findall(r"[\w'-]+", file_name)
    
    # filter keywords to first two name-sized words
    keywords = [keyword for keyword in keywords if len(keyword) >= 3]
    first_name = keywords[0]
    last_name = keywords[1]
    
    # find survey data for this person by filtering each name
    filtered_data = survey_data[survey_data['RecipientFirstName'].str.contains(first_name, case=False) & survey_data['RecipientLastName'].str.contains(last_name, case=False)]
    
    if filtered_data.empty:
        print(f"Could not find survey data for {first_name} {last_name}")
        continue

    # add survey data to transcript dataframe: Age Group, Email, Role
    transcripts_df.loc[index, 'ResponseId'] = filtered_data['ResponseId'].values[0]
    transcripts_df.loc[index, 'FirstName'] = filtered_data['RecipientFirstName'].values[0]
    transcripts_df.loc[index, 'LastName'] = filtered_data['RecipientLastName'].values[0]
    transcripts_df.loc[index, 'Email'] = filtered_data['RecipientEmail'].values[0]
    transcripts_df.loc[index, 'AgeGroup'] = filtered_data['Q15'].values[0]
    transcripts_df.loc[index, 'InstitutionName'] = filtered_data['InstitutionName'].values[0]
    transcripts_df.loc[index, 'District'] = filtered_data['ParentName'].values[0]
    transcripts_df.loc[index, 'City'] = filtered_data['MailingCity'].values[0]
    transcripts_df.loc[index, 'State'] = filtered_data['MailingState'].values[0]
    transcripts_df.loc[index, 'Role'] = filtered_data['Q2'].values[0]
    transcripts_df.loc[index, 'Subjects'] = filtered_data['Q3'].values[0]
    transcripts_df.loc[index, 'Courses'] = filtered_data['Q4'].values[0]
    transcripts_df.loc[index, 'TopOfMind'] = filtered_data['Q5'].values[0]
    transcripts_df.loc[index, 'Carolina Familiarity'] = filtered_data['Q6_1'].values[0]
    transcripts_df.loc[index, 'Fisher Familiarity'] = filtered_data['Q6_2'].values[0]
    transcripts_df.loc[index, 'Flinn Scientific Familiarity'] = filtered_data['Q6_3'].values[0]
    transcripts_df.loc[index, 'PLTW Familiarity'] = filtered_data['Q6_4'].values[0]
    transcripts_df.loc[index, 'Sargent Welch Familiarity'] = filtered_data['Q6_5'].values[0]
    transcripts_df.loc[index, 'Thomas Scientific Familiarity'] = filtered_data['Q6_6'].values[0]
    transcripts_df.loc[index, 'Wards/VWR Familiarity'] = filtered_data['Q6_7'].values[0]
    transcripts_df.loc[index, 'BioRad Familiarity'] = filtered_data['Q6_8'].values[0]
    transcripts_df.loc[index, 'BioCorp Familiarity'] = filtered_data['Q6_9'].values[0]
    transcripts_df.loc[index, 'Amazon Familiarity'] = filtered_data['Q6_10'].values[0]
    transcripts_df.loc[index, 'Nasco Familiarity'] = filtered_data['Q6_11'].values[0]
    transcripts_df.loc[index, 'Frey/School Specialty Familiarity'] = filtered_data['Q6_12'].values[0]
    transcripts_df.loc[index, 'Primary Vendor'] = filtered_data['Q7'].values[0]
    transcripts_df.loc[index, 'Top Vendor Qualities'] = filtered_data['Q8'].values[0]
    transcripts_df.loc[index, 'Years in Eduacation'] = filtered_data['Q14'].values[0]
transcripts_df

# save the dataframe to a csv file
transcripts_df.to_csv("data/transcripts.csv", index=False)

# save a dataframe with the transcripts dropped
transcripts_df_trimmed = transcripts_df.drop(columns=['transcript'])
    

### Create a table for relevant interview segments, with timestamps, transcript segment, themes, and survey data.

Create a chain for getting annotations from transcript text

In [259]:
import email
from re import A
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_openai_fn_runnable,
    create_structured_output_chain,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field




class Annotation(BaseModel):
  email: str = Field(..., description="The email of the interviewee.")
  text_segment: str = Field(..., description="A statement by the customer that relates to one of the study themes. Does not apply to statments by Kimbery or Daylene.")
  context: str = Field(..., description="The full context of the text segment, including the interviewer question.")
  themes: List[str] = Field(..., description="The themes that apply to the text segment.")
  brand: str = Field(..., description="Brand that applies to the text segment.")
  time_stamp: str = Field(..., description="The time stamp of the text segment.")  

class AnnotationsList(BaseModel):
    annotations: List[Annotation]


instructions = """As Transcript Pro, analyze educator interview transcripts for market research. Key areas include:

  1. **Brand Perception:** Views on brands like Carolina, Flinn Scientific, Amazon, VWR, Ward's, etc.
  2. **Product Quality:** Discussions about product durability, effectiveness, quality.
  3. **Customer Service:** Experiences with customer service.
  4. **Purchasing Experience:** Ease or difficulty in purchasing.
  5. **Digital Resources:** Use of digital/virtual teaching tools.
  6. **Environmental Sustainability:** Eco-friendly practices in education.
  7. **Educational Policies:** Policy influence on purchases.
  8. **Customer Experience:** Brand experiences, positive or negative.
  9. **Buying Habits:** Timing and methods of buying.
  10. **Purchasing Patterns:** What is bought from various vendors.
  11. **Vendor Comparison:** Comparisons between Carolina Biological and others.
  12. **Budget and Timing:** Budget and purchase timing considerations.
  13. **Generational Insights:** Generational differences in buying.

  Focus on processing and summarizing interviewee statements accurately and objectively, maintaining consistency in coding.
  """

  # Chat Prompt Template from instructions
prompt = ChatPromptTemplate.from_messages(
    [
      ("system", instructions),
      ("human", "Process transcript for email: {email} transcript:\n {text}"),
    ]
  )

runnable = create_structured_output_runnable(AnnotationsList, ChatOpenAI(model="gpt-4-1106-preview", temperature=0), prompt)


### Process the transcripts to annotations

In [260]:
import os
import concurrent.futures

def process_transcript(row):
    # get the transcript text and email
    text = row['transcript']
    participant_email = row['Email']
    
    # check if the CSV file already exists
    csv_file_path = f"{ANNOTATIONS_PATH}/{participant_email}.csv"
    if os.path.exists(csv_file_path):
        print(f"CSV file for {participant_email} already exists. Skipping...")
        return
    else:
        print(f"Processing transcript for {participant_email}")
    # run the transcript through the model
    try:
        response = runnable.invoke({"email": participant_email, "text": text})
        annotations_df = pd.DataFrame.from_records([annotation.dict() for annotation in response.annotations])
        # merge annotations with transcript data
        annotations_df.to_csv(csv_file_path, index=False)
        # save annotations to a csv file
        print(f"Saved annotations for {participant_email}")
    except Exception as e:
        print(f"Error processing transcript for {participant_email}. Trying again...")
        try:
            response = runnable.invoke({"email": participant_email, "text": text})
            annotations_df = pd.DataFrame.from_records([annotation.dict() for annotation in response.annotations])
            # merge annotations with transcript data
            annotations_df.to_csv(csv_file_path, index=False)
            # save annotations to a csv file
            print(f"Saved annotations for {participant_email}")
        except Exception as e:
            print(f"Error processing transcript for {participant_email} again. Skipping...")

# Create a list of rows to process
rows_to_process = transcripts_df.iterrows()

# Process the transcripts using concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(process_transcript, rows_to_process)

print("Done processing transcripts")

CSV file for sean.taylor@sbcusd.k12.ca.us already exists. Skipping...
CSV file for sdoreyweb@wcboe.org already exists. Skipping...
CSV file for nmcgovern@pmsd.org already exists. Skipping...
Processing transcript for amanda.fuller@pikeroadschools.org
Saved annotations for amanda.fuller@pikeroadschools.org
CSV file for kelda.bailess@vwsd.org already exists. Skipping...
Processing transcript for ruberg@eths202.org
Saved annotations for ruberg@eths202.org
CSV file for jmagargal@upperdarbysd.org already exists. Skipping...
CSV file for reade.burke@fortbendisd.com already exists. Skipping...
CSV file for kirsten.mahovlich@clevelandmetroschools.org already exists. Skipping...
CSV file for mburdsall@danville.k12.in.us already exists. Skipping...
CSV file for vfudrini@mhrd.org already exists. Skipping...
CSV file for libby.frost@decaturschools.org already exists. Skipping...
CSV file for pqmcgee@madison.k12.wi.us already exists. Skipping...
CSV file for teresa_a_massey@dekalbschoolsga.org alre

## Combine the annotations and save as annotations.csv

In [261]:

# combine the csv files into one
from operator import le


csv_files = os.listdir("data/annotations/")
annotations = []
for file in csv_files:
    if file.endswith('.csv'):  # Process only .csv files
        annotation = pd.read_csv(os.path.join("data/annotations/", file))
        # merge with survey data
        annotation_merged = pd.merge(annotation, transcripts_df_trimmed, left_on='email', right_on='Email')
        annotations.append(annotation_merged)
annotations
        
annotations_df = pd.concat(annotations)
annotations_df.to_csv("data/annotations.csv", index=False)
annotations_df

Unnamed: 0,email,text_segment,context,theme,brand,time_stamp,file,ResponseId,FirstName,LastName,...,Wards/VWR Familiarity,BioRad Familiarity,BioCorp Familiarity,Amazon Familiarity,Nasco Familiarity,Frey/School Specialty Familiarity,Primary Vendor,Top Vendor Qualities,Years in Eduacation,themes
0,teresa_a_massey@dekalbschoolsga.org,"I buy, I'm the one that even though I may not ...","Teresa Massey (02:38):\n\nOkay, so I buy, I'm ...","['Purchasing Experience', 'Buying Habits']",[],02:38,Teresa Massey Catapult-X Educator Interview - ...,R_3nknLxkysbILp8B,Teresa,Massey,...,Current Vendor,Current Vendor,Aware of (don't use),Aware of (don't use),Current Vendor,Current Vendor,"Carolina Biological,Fisher/Thermo Fisher Scien...","District approved vendor,Reliable",Over 20 years,
1,teresa_a_massey@dekalbschoolsga.org,I really prefer something that they can manipu...,Teresa Massey (03:31):\n\nI really prefer some...,"['Product Quality', 'Digital Resources']",[],03:31,Teresa Massey Catapult-X Educator Interview - ...,R_3nknLxkysbILp8B,Teresa,Massey,...,Current Vendor,Current Vendor,Aware of (don't use),Aware of (don't use),Current Vendor,Current Vendor,"Carolina Biological,Fisher/Thermo Fisher Scien...","District approved vendor,Reliable",Over 20 years,
2,teresa_a_massey@dekalbschoolsga.org,"Well, our timing is never the same from year t...","Teresa Massey (04:30):\n\nWell, our timing is ...",['Budget and Timing'],[],04:30,Teresa Massey Catapult-X Educator Interview - ...,R_3nknLxkysbILp8B,Teresa,Massey,...,Current Vendor,Current Vendor,Aware of (don't use),Aware of (don't use),Current Vendor,Current Vendor,"Carolina Biological,Fisher/Thermo Fisher Scien...","District approved vendor,Reliable",Over 20 years,
3,teresa_a_massey@dekalbschoolsga.org,This past year we decided on what we were goin...,"Teresa Massey (05:21):\n\nIn a way, we kind of...","['Purchasing Patterns', 'Digital Resources']",[],05:21,Teresa Massey Catapult-X Educator Interview - ...,R_3nknLxkysbILp8B,Teresa,Massey,...,Current Vendor,Current Vendor,Aware of (don't use),Aware of (don't use),Current Vendor,Current Vendor,"Carolina Biological,Fisher/Thermo Fisher Scien...","District approved vendor,Reliable",Over 20 years,
4,teresa_a_massey@dekalbschoolsga.org,We use it for a while. It's not new to the sch...,Teresa Massey (06:51):\n\nWe use it for a whil...,['Digital Resources'],['STEMscopes'],06:51,Teresa Massey Catapult-X Educator Interview - ...,R_3nknLxkysbILp8B,Teresa,Massey,...,Current Vendor,Current Vendor,Aware of (don't use),Aware of (don't use),Current Vendor,Current Vendor,"Carolina Biological,Fisher/Thermo Fisher Scien...","District approved vendor,Reliable",Over 20 years,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ruberg@eths202.org,"Honestly, the best experiences I've had are ou...","Kimberly Herder (27:55): So on those catalogs,...",,Carolina,27:55,Greg Ruber Strohm Catapult-X Educator Intervie...,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,"['Customer Experience', 'Vendor Comparison']"
1,ruberg@eths202.org,I purchase lab supplies. Sometimes it's from l...,Kimberly Herder (14:39): Where do you get the ...,,General,14:39,Greg Ruber Strohm Catapult-X Educator Intervie...,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,"['Purchasing Patterns', 'Vendor Comparison']"
2,ruberg@eths202.org,I think I do. I'm very fortunate that I'm at a...,Kimberly Herder (16:26): Do you have a budget?...,,General,16:26,Greg Ruber Strohm Catapult-X Educator Intervie...,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,['Budget and Timing']
3,ruberg@eths202.org,I start that process by asking the students. S...,"Kimberly Herder (06:57): Okay, so you're doing...",,General,06:57,Greg Ruber Strohm Catapult-X Educator Intervie...,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,['Educational Policies']
