In [1]:

# Importing Necessary Libraries
import os
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

# Constants
DATA_PATH = "data"
TRANSCRIPT_PATH = "data/transcripts"
ANNOTATIONS_PATH = "data/annotations"
AZURE_DEPLOYMENT_GPT = "eddo-gpt4"


# Load the merged transcripts from a single CSV under DATA_PATH.
# (A bare `transcripts_df.head()` here would be discarded: a notebook cell only
# renders its last expression, so no preview statement is kept mid-cell.)
transcripts_df = pd.read_csv(os.path.join(DATA_PATH, "merged_transcripts.csv"))

# Load the combined annotations from a single CSV under DATA_PATH.
# index_col=0 uses the file's first column as the DataFrame index.
path = os.path.join(DATA_PATH, "annotations.csv")
annotations_df = pd.read_csv(path, index_col=0)

# Last expression of the cell: rendered as the cell output.
annotations_df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,speaker,theme,context,sentiment_score,brand,identified_purchases,start_time,end_time,email,last_name,...,Wards/VWR Familiarity,BioRad Familiarity,BioCorp Familiarity,Amazon Familiarity,Nasco Familiarity,Frey/School Specialty Familiarity,Primary Vendor,Top Vendor Qualities,Years in Eduacation,interview_id
0,Robert.Lehman,Purchasing Experience,Robert mentions the difficulties of purchasing...,-0.4,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23
1,Robert.Lehman,Educational Policies,Robert discusses how educational policies infl...,-0.3,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23
2,Robert.Lehman,Digital Resources,Robert talks about the shift towards digital t...,0.1,,[],04:49,05:25,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23
3,Robert.Lehman,Budget and Timing,Robert explains the budgeting process within t...,0.0,,[],05:44,06:09,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23
4,Robert.Lehman,Buying Habits,Robert explains his decision to spend out of p...,0.2,,[],03:21,03:44,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,Mr. Ruber-Strohm,Digital Resources,Wants a video included with lab kits to help a...,0.7,,[],49:07,49:07,ruberg@eths202.org,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9
899,Mr. Ruber-Strohm,Customer Service,Would utilize a safety video included in a kit...,0.6,,[],50:13,50:13,ruberg@eths202.org,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9
900,Mr. Ruber-Strohm,Product Quality,"Prefers diversity of results in experiments, a...",0.5,,[],51:20,51:20,ruberg@eths202.org,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9
901,Mr. Ruber-Strohm,Educational Policies,Teacher values real-life science experiences o...,0.8,,[],52:06,53:42,ruberg@eths202.org,Ruber,...,Aware of (don't use),Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9


In [2]:
import re

# Matches a section-header timestamp such as "(06:04):" or "(1:02:03):".
# The captured group is the timestamp text without the parentheses.
_SECTION_TIMESTAMP_RE = re.compile(r'\((\d+(?::\d+){1,2})\):')


def convert_timestamp_to_seconds(timestamp):
    """Convert a timestamp in the format MM:SS or H:MM:SS to total seconds.

    Args:
        timestamp: String of two or three colon-separated integer fields,
            e.g. "06:04" or "1:02:03".

    Returns:
        The total number of seconds as an int.

    Raises:
        ValueError: If a field is not an integer, or if the timestamp does not
            have exactly two or three fields.
    """
    fields = [int(field) for field in timestamp.split(":")]
    if len(fields) not in (2, 3):
        raise ValueError(
            f"Expected a timestamp like MM:SS or H:MM:SS, got {timestamp!r}"
        )

    # Fold left to right so every field is worth 60x the one after it.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + field
    return total_seconds


def extract_snippet(transcript, start_timestamp, end_timestamp):
    """
    Extracts a snippet from the transcript between any two given timestamps.

    The transcript is scanned line by line. Collection starts at the first line
    carrying a "(MM:SS):" / "(H:MM:SS):" timestamp that is at or after
    ``start_timestamp`` and stops just before the first such line whose
    timestamp is strictly after ``end_timestamp``. Lines without a timestamp
    follow the state set by the most recent timestamped line, so the start or
    end timestamp does not need to align exactly with a speaker's timestamp.

    Args:
        transcript: Full transcript text, with lines separated by "\\n".
        start_timestamp: Start of the range, as MM:SS or H:MM:SS.
        end_timestamp: End of the range (inclusive), as MM:SS or H:MM:SS.

    Returns:
        The collected lines joined with newlines and stripped of surrounding
        whitespace; an empty string if no timestamped line is at or after the
        start.

    Raises:
        ValueError: If a timestamp cannot be parsed.
    """
    # Convert timestamps to seconds for comparison
    start_seconds = convert_timestamp_to_seconds(start_timestamp)
    end_seconds = convert_timestamp_to_seconds(end_timestamp)

    collected = []
    include_section = False
    for line in transcript.split('\n'):
        match = _SECTION_TIMESTAMP_RE.search(line)
        if match:
            section_seconds = convert_timestamp_to_seconds(match.group(1))
            if not include_section:
                # Still before the range: start at the first header >= start.
                if section_seconds >= start_seconds:
                    include_section = True
            elif section_seconds > end_seconds:
                # First header past the end closes the snippet.
                break

        # Keep the line (header or body text) while inside the range.
        if include_section:
            collected.append(line)

    return "\n".join(collected).strip()



In [3]:
# for each annotation, extract the snippet from the transcript

def extract_snippet_from_annotation(annotation):
    """Extract the snippet from the transcript for a given annotation.

    Looks up the transcript in the module-level ``transcripts_df`` whose
    "Email" value equals the annotation's "email", then extracts the text
    between the annotation's "start_time" and "end_time".

    Args:
        annotation: Mapping-like row providing "email", "start_time" and
            "end_time".

    Returns:
        The snippet text returned by ``extract_snippet``.

    Raises:
        IndexError: If no transcript row matches the annotation's email.
    """
    email = annotation["email"]

    # Single .loc selection (row mask + column) instead of chained indexing.
    matching_transcripts = transcripts_df.loc[
        transcripts_df["Email"] == email, "transcript"
    ]
    if matching_transcripts.empty:
        # Same exception type a bare `.values[0]` lookup raises on an empty
        # selection, but with a message that names the email.
        raise IndexError(f"No transcript found for email {email!r}")

    # If several rows share the email, the first one is used.
    transcript = matching_transcripts.iloc[0]

    return extract_snippet(transcript, annotation["start_time"], annotation["end_time"])

# Add a "snippet" column in place: axis=1 passes each annotation row to the extractor.
annotations_df["snippet"] = annotations_df.apply(extract_snippet_from_annotation, axis=1)

# show fancy markdown snippets
from IPython.display import Markdown, display
def printmd(string):
    """Render the given string as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)
    
def print_snippet(annotation):
    """Render one annotation and its snippet as a sequence of Markdown blocks.

    Blocks are emitted in this order: email (bold), theme (italic), context
    (italic), sentiment score, the start/end time range (italic), and finally
    the snippet text itself.
    """
    def _markdown_blocks():
        # Lazily yielded so each field is read right before it is rendered.
        yield f"**{annotation['email']}**"
        yield f"*{annotation['theme']}*"
        yield f"*{annotation['context']}*"
        yield f"**Sentiment Score** {annotation['sentiment_score']}"
        yield f"*{annotation['start_time']} - {annotation['end_time']}*"
        yield annotation["snippet"]

    for block in _markdown_blocks():
        printmd(block)
    
# Spot check: render the first annotation row together with its extracted snippet.
print_snippet(annotations_df.iloc[0])

# Last expression of the cell: display the annotations frame, which now has the "snippet" column.
annotations_df

**robert.lehman@pgcps.org**

*Purchasing Experience*

*Robert mentions the difficulties of purchasing from vendors not on the authorized list leading to personal expenses.*

**Sentiment Score** -0.4

*06:04 - 07:03*

Daylene Long (06:04):

Of range are you usually working with?

Robert.Lehman (06:09):

Usually between 1000 and $2,000. It sort of depends upon how the classes get divvied up and if there's something that somebody bought last year and they feel like, well this year I'm in pretty good stead that then they sort of let it go to somebody else. But then the other thing back to your previous question in terms of purchasing is that in terms of purchase orders and authorized vendors is that it's very set cut and dried if person's not on a vendor list. And this little publisher that did the additional supplemental materials for my AP was not on that list. So again, that sort of was the impetus for me to use my own money because I knew I wouldn't get money from some other source to do that. Yeah.

Daylene Long (07:01):

What's the name of that little publisher they're talking about?

Robert.Lehman (07:03):

It's called the Apes. It's got a weird name. It's called the Apes House Capital a PES house. And it's sort of a consortium of environmental teachers who follow along and track along with the regular AP set curriculum that comes out from the college board and gives you some supplemental things that you can do beyond some of the stuff that they have you do. That

Unnamed: 0,speaker,theme,context,sentiment_score,brand,identified_purchases,start_time,end_time,email,last_name,...,BioRad Familiarity,BioCorp Familiarity,Amazon Familiarity,Nasco Familiarity,Frey/School Specialty Familiarity,Primary Vendor,Top Vendor Qualities,Years in Eduacation,interview_id,snippet
0,Robert.Lehman,Purchasing Experience,Robert mentions the difficulties of purchasing...,-0.4,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Daylene Long (06:04):\n\nOf range are you usua...
1,Robert.Lehman,Educational Policies,Robert discusses how educational policies infl...,-0.3,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Daylene Long (06:04):\n\nOf range are you usua...
2,Robert.Lehman,Digital Resources,Robert talks about the shift towards digital t...,0.1,,[],04:49,05:25,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Robert.Lehman (04:49):\n\nAnd now our county h...
3,Robert.Lehman,Budget and Timing,Robert explains the budgeting process within t...,0.0,,[],05:44,06:09,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,"Robert.Lehman (05:44):\n\nIt gets allocated, e..."
4,Robert.Lehman,Buying Habits,Robert explains his decision to spend out of p...,0.2,,[],03:21,03:44,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Robert.Lehman (03:21):\n\nReason that I chose ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,Mr. Ruber-Strohm,Digital Resources,Wants a video included with lab kits to help a...,0.7,,[],49:07,49:07,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (49:07):\n\nBut yeah. And the...
899,Mr. Ruber-Strohm,Customer Service,Would utilize a safety video included in a kit...,0.6,,[],50:13,50:13,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (50:13):\n\nSure. I'd use tha...
900,Mr. Ruber-Strohm,Product Quality,"Prefers diversity of results in experiments, a...",0.5,,[],51:20,51:20,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,"Mr. Ruber-Strohm (51:20):\n\nI mean, obviously..."
901,Mr. Ruber-Strohm,Educational Policies,Teacher values real-life science experiences o...,0.8,,[],52:06,53:42,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (52:06):\n\nSo in real scienc...


In [5]:
# Save the combined annotations to a csv file.
# NOTE(review): to_csv is called without index=False, so the index is written
# as an extra leading column — confirm downstream readers expect that.
path = os.path.join(DATA_PATH, "annotations_snippets.csv")
annotations_df.to_csv(path)

# Save the combined annotations to a JSON file
# (orient="records": one JSON object per row; indent=4 pretty-prints the output).
path = os.path.join(DATA_PATH, "annotations_snippets.json")
annotations_df.to_json(path, orient="records", indent=4)

# Last expression of the cell: display the frame that was just saved.
annotations_df

Unnamed: 0,speaker,theme,context,sentiment_score,brand,identified_purchases,start_time,end_time,email,last_name,...,BioRad Familiarity,BioCorp Familiarity,Amazon Familiarity,Nasco Familiarity,Frey/School Specialty Familiarity,Primary Vendor,Top Vendor Qualities,Years in Eduacation,interview_id,snippet
0,Robert.Lehman,Purchasing Experience,Robert mentions the difficulties of purchasing...,-0.4,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Daylene Long (06:04):\n\nOf range are you usua...
1,Robert.Lehman,Educational Policies,Robert discusses how educational policies infl...,-0.3,,[],06:04,07:03,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Daylene Long (06:04):\n\nOf range are you usua...
2,Robert.Lehman,Digital Resources,Robert talks about the shift towards digital t...,0.1,,[],04:49,05:25,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Robert.Lehman (04:49):\n\nAnd now our county h...
3,Robert.Lehman,Budget and Timing,Robert explains the budgeting process within t...,0.0,,[],05:44,06:09,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,"Robert.Lehman (05:44):\n\nIt gets allocated, e..."
4,Robert.Lehman,Buying Habits,Robert explains his decision to spend out of p...,0.2,,[],03:21,03:44,robert.lehman@pgcps.org,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Robert.Lehman (03:21):\n\nReason that I chose ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,Mr. Ruber-Strohm,Digital Resources,Wants a video included with lab kits to help a...,0.7,,[],49:07,49:07,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (49:07):\n\nBut yeah. And the...
899,Mr. Ruber-Strohm,Customer Service,Would utilize a safety video included in a kit...,0.6,,[],50:13,50:13,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (50:13):\n\nSure. I'd use tha...
900,Mr. Ruber-Strohm,Product Quality,"Prefers diversity of results in experiments, a...",0.5,,[],51:20,51:20,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,"Mr. Ruber-Strohm (51:20):\n\nI mean, obviously..."
901,Mr. Ruber-Strohm,Educational Policies,Teacher values real-life science experiences o...,0.8,,[],52:06,53:42,ruberg@eths202.org,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Mr. Ruber-Strohm (52:06):\n\nSo in real scienc...
