In [6]:
# I need to count how many text files are in podscribe_transcription/hibt_main

import os
import re

# Root of the directory tree that is scanned for transcript files.
rootDir = 'podscribe_transcription/hibt_main'

# Tally every file, at any depth below rootDir, whose name ends in '.txt'.
count = 0
for _folder, _subfolders, names in os.walk(rootDir):
    count += sum(1 for name in names if name.endswith('.txt'))
print(count)

260


In [7]:
# Podscribe does a decent job of labeling ads with the advertiser's name; we'll remove these from the transcripts

def remove_ads(transcript):
    """Keep only the numbered-speaker segments of a transcript.

    A segment is a header line such as ``3 (1m 35s):`` or ``0 (1h 2m 3s):``
    (a speaker number followed by a timestamp whose hour and minute parts are
    optional) together with the single line that immediately follows it.
    Every line that is neither a header nor the line right after one is
    dropped; per the notebook, that is where Podscribe's advertiser-labelled
    ad segments end up.

    Args:
        transcript: Full transcript text, lines separated by ``\\n``.

    Returns:
        The kept lines joined with ``\\n``. Each header/content pair is
        followed by an empty line. A header that is the very last line of the
        input is kept on its own, with no content or separator after it.
    """
    header_pattern = re.compile(r'^\d+ \((\d+h )?(\d+m )?\d+s\):')
    no_more_lines = object()  # sentinel: distinguishes "no next line" from ""

    remaining = iter(transcript.split('\n'))
    kept = []
    for candidate in remaining:
        if header_pattern.match(candidate) is None:
            continue
        kept.append(candidate)
        # Consume the line after the header unconditionally; it is treated as
        # the speaker's content even if it happens to look like a header.
        following = next(remaining, no_more_lines)
        if following is not no_more_lines:
            kept.extend((following, ""))

    return '\n'.join(kept)


In [12]:
import re
from collections import defaultdict

def identify_host(transcript):
    """Guess which numbered speaker is the host of a transcript.

    Two signals are collected per speaker from the lines that follow each
    speaker header (e.g. ``3 (1m 35s):``, ``0 (35s):``, ``2 (1h 2m 3s):``):

    * how many lines contain "welcome to" or "welcome back" (case-insensitive)
    * how many question marks the speaker's lines contain

    The speaker with the most welcome phrases is chosen as host; if nobody
    says a welcome phrase, the speaker with the most question marks is used
    instead. Ties are resolved by ``max``, i.e. the first such speaker seen,
    so a tied result may need manual verification.

    Args:
        transcript: Transcript text, lines separated by ``\\n``.

    Returns:
        A ``(host_speaker, welcome_questions_match)`` tuple.
        ``host_speaker`` is the speaker number as a string, or ``None`` when
        the transcript has no speaker text at all.
        ``welcome_questions_match`` is ``True`` only when a welcome speaker
        exists and is also the speaker with the most question marks.
    """
    # Hours AND minutes are optional so that turns in the first minute
    # ("0 (35s):") are recognised; this matches the pattern used by remove_ads.
    speaker_regex = re.compile(r'^(\d+) \((\d+h )?(\d+m )?\d+s\):')

    welcome_counts = defaultdict(int)
    question_counts = defaultdict(int)

    current_speaker = None
    for line in transcript.split('\n'):
        speaker_match = speaker_regex.match(line)
        if speaker_match:
            current_speaker = speaker_match.group(1)
            continue

        # Text before the first speaker header belongs to nobody.
        if current_speaker is None:
            continue

        lowered = line.lower()
        if 'welcome to' in lowered or 'welcome back' in lowered:
            welcome_counts[current_speaker] += 1

        # Adding 0 still registers the speaker, so every speaker with at
        # least one text line appears in question_counts.
        question_counts[current_speaker] += line.count('?')

    welcome_speaker = max(welcome_counts, key=welcome_counts.get) if welcome_counts else None
    most_questions_speaker = max(question_counts, key=question_counts.get) if question_counts else None

    # Prefer the welcome signal; fall back to the question signal.
    host_speaker = welcome_speaker if welcome_speaker is not None else most_questions_speaker

    # Two missing signals do not count as agreement.
    welcome_questions_match = (
        welcome_speaker is not None and welcome_speaker == most_questions_speaker
    )

    return host_speaker, welcome_questions_match


In [13]:
def insert_marker_before_host(transcript, host_speaker):
    """Insert a ``###`` line before every speaker header that belongs to the host.

    Speaker headers look like ``3 (1m 35s):``, ``0 (35s):`` or
    ``2 (1h 2m 3s):``. All original lines are kept, in order; the marker is
    the only thing added.

    Args:
        transcript: Transcript text, lines separated by ``\\n``.
        host_speaker: Speaker number of the host, as a string (``'3'``) or an
            int (``3``). ``None`` means "no host", in which case no marker is
            inserted.

    Returns:
        The transcript with a ``###`` line before each host header.
    """
    # Hours AND minutes are optional so that host turns in the first minute
    # ("0 (35s):") are marked too; this matches the pattern used by remove_ads.
    speaker_regex = re.compile(r'^(\d+) \((\d+h )?(\d+m )?\d+s\):')

    # The regex captures the speaker number as text, so compare as text.
    host_id = None if host_speaker is None else str(host_speaker)

    modified_lines = []
    for line in transcript.split('\n'):
        speaker_match = speaker_regex.match(line)
        if speaker_match and speaker_match.group(1) == host_id:
            modified_lines.append("###")  # Insert marker
        modified_lines.append(line)

    return '\n'.join(modified_lines)


In [10]:
# Strip the ads from every transcript under hibt_test and write the result,
# under the same file name, to hibt_test_cleaned.
test_dir = 'podscribe_transcription/hibt_test'
cleaned_dir = 'podscribe_transcription/hibt_test_cleaned'

# Make sure the output folder exists so the writes below cannot fail on a
# fresh checkout.
os.makedirs(cleaned_dir, exist_ok=True)

for dirName, subdirList, fileList in os.walk(test_dir):
    for fname in fileList:
        if not fname.endswith('.txt'):
            continue

        # Read and close the source before opening the destination; each file
        # gets its own handle name instead of shadowing one `file` variable.
        with open(os.path.join(dirName, fname), 'r') as source_file:
            transcript = source_file.read()

        cleaned_transcript = remove_ads(transcript)

        with open(os.path.join(cleaned_dir, fname), 'w') as cleaned_file:
            cleaned_file.write(cleaned_transcript)

In [14]:
# Now let's create a default dictionary keyed by the name of each file in hibt_test_cleaned, holding that file's host_speaker and whether welcome_questions_match is True
# We'll have a dictionary structure like {filename: {host: host_speaker, welcome_questions_match: True/False}}

import os
from collections import defaultdict

# Per-file host metadata, keyed by file name:
#   {filename: {"host": <speaker number or None>, "welcome_questions_match": <bool>}}
hibt_test_meta = defaultdict(lambda: {"host": None, "welcome_questions_match": False})

# Directory path
dir_path = "podscribe_transcription/hibt_test_cleaned"

# Loop through each transcript in the directory
for filename in os.listdir(dir_path):
    # os.listdir returns every entry in the folder; only '.txt' transcripts
    # are processed, consistent with the other cells in this notebook.
    if not filename.endswith('.txt'):
        continue

    # Open the file and read the transcript
    with open(os.path.join(dir_path, filename), 'r') as file:
        transcript = file.read()

    # Identify the host and whether the welcome/question signals agree
    host_speaker, welcome_questions_match = identify_host(transcript)

    # Add data to the dictionary
    hibt_test_meta[filename]["host"] = host_speaker
    hibt_test_meta[filename]["welcome_questions_match"] = welcome_questions_match

print(hibt_test_meta)

defaultdict(<function <lambda> at 0x1070cb490>, {'Orangetheory Fitness_ Ellen Latham-transcript.txt': {'host': '0', 'welcome_questions_match': True}, 'Mailchimp_ Ben Chestnut-transcript.txt': {'host': '0', 'welcome_questions_match': True}, 'reCAPTCHA and Duolingo_ Luis von Ahn-transcript.txt': {'host': '4', 'welcome_questions_match': True}, 'Fitbit_ James Park-transcript.txt': {'host': '3', 'welcome_questions_match': True}, 'Ring_ Jamie Siminoff-transcript.txt': {'host': '1', 'welcome_questions_match': True}, 'Patagonia_ Yvon Chouinard-transcript.txt': {'host': '0', 'welcome_questions_match': False}, 'Coinbase_ Brian Armstrong-transcript.txt': {'host': '3', 'welcome_questions_match': True}, 'Etsy_ Rob Kalin-transcript.txt': {'host': '0', 'welcome_questions_match': True}, 'Twitch_ Emmett Shear-transcript.txt': {'host': '1', 'welcome_questions_match': True}, 'Robinhood_ Vlad Tenev-transcript.txt': {'host': '3', 'welcome_questions_match': True}})


In [15]:
# Insert a "###" marker before each host turn in the cleaned transcripts and
# save the results, under the same file names, to hibt_test_marked.
marked_dir = 'podscribe_transcription/hibt_test_marked'

# Create the output folder once, up front, rather than re-checking per file.
os.makedirs(marked_dir, exist_ok=True)

for dirName, subdirList, fileList in os.walk('podscribe_transcription/hibt_test_cleaned'):
    for fname in fileList:
        if not fname.endswith('.txt'):
            continue

        # hibt_test_meta is a defaultdict, so a file it has not seen yields
        # host None (and no markers) instead of raising KeyError.
        host_speaker = hibt_test_meta[fname]['host']

        with open(os.path.join(dirName, fname), 'r') as cleaned_file:
            transcript = cleaned_file.read()

        # Insert marker before host comments
        marked_transcript = insert_marker_before_host(transcript, host_speaker)

        with open(os.path.join(marked_dir, fname), 'w') as marked_file:
            marked_file.write(marked_transcript)

Working with MongoDB

In [24]:
import os
from dotenv import load_dotenv
import pymongo

# Load settings from a local .env file into the process environment so that the
# os.getenv(...) lookups in later cells (mongodb_uri, GOOGLE_API_KEY) can find them.
load_dotenv()

True

In [None]:
# Connect to MongoDB using the URI stored in the `mongodb_uri` environment variable.
# NOTE(review): os.getenv returns None when the variable is unset; MongoClient
# presumably then falls back to its default host instead of raising — confirm
# that a silent fallback is acceptable here.
client = pymongo.MongoClient(os.getenv('mongodb_uri'))

In [28]:
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

In [60]:
# Prompt template for extracting company and guest names from a transcript file name.
# It has two placeholders: {response_template} (the output-format instructions) and
# {file_string} (the file name to parse). The examples are few-shot demonstrations.
comp_guest_prompt = """You are an expert at parsing things out of a file string. 
    You are given a file string that should contain a company name (could be multiple) and a guest name (could be multiple).
    You are to parse these out of the string name.
    ---
    {response_template}
    ---
    Examples:
    filename: "A biometric smart gun with Kai Kloepfer of Biofire-transcript.txt"
    response: ""company": ["Biofire"], "guest": ["Kai Kloepfer"]"

    filename: "ActOne Group_ Janice Bryant Howroyd (2018)-transcript.txt"
    response: ""company": ["ActOne Group"], "guest": ["Janice Bryant Howroyd"]"

    filename: "HIBT/podscribe_transcription/hibt_main/McBride Sisters Wine (Part 1 of 2)_ Robin McBride and Andréa McBride John-transcript.txt"
    response: ""company": ["McBride Sisters Wine"], "guest": ["Robin McBride", "Andréa McBride John"]"

    filename: "HIBT/podscribe_transcription/hibt_main/reCAPTCHA and Duolingo_ Luis von Ahn-transcript.txt"
    response: ""company": ["reCAPTCHA", "Duolingo"], "guest": ["Luis von Ahn"]"
    ---
    Parse the company and guest names from the following file string:
    {file_string}
"""

In [61]:
from pydantic import BaseModel
from typing import List, Optional
import json

# Schema of the structured answer expected from the LLM; it is handed to
# PydanticOutputParser to generate format instructions and validate replies.
# Documented with comments rather than a docstring on purpose: a docstring
# may be picked up by the generated schema and so alter the prompt (verify).
class comp_guest(BaseModel):
    # Company name(s) parsed from the file name; a list because one episode
    # can cover several companies (see the prompt examples).
    company: List[str]
    # Guest name(s) parsed from the file name; a list because an episode can
    # have several guests.
    guest: List[str]

# Parser that produces format instructions for, and validates replies against,
# the comp_guest schema.
parser = PydanticOutputParser(pydantic_object=comp_guest)

# Pre-fill {response_template} with the parser's format instructions so that
# only {file_string} has to be supplied for each call.
# (The previous stand-alone parser.get_format_instructions() call discarded its
# result and had no effect, so it has been dropped.)
prompt_template = PromptTemplate(
    template=comp_guest_prompt,
    input_variables=["file_string"],
    partial_variables={"response_template": parser.get_format_instructions()},
)


In [64]:
# import google.generativeai as genai
# import os

# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# llm = genai.GenerativeModel(model_name = "gemini-pro")

In [65]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=os.getenv("GOOGLE_API_KEY"))

# One dict per transcript file: {"company": [...], "guest": [...], "filename": ...}
parsed_filenames = []

# Directory path
dir_path = "podscribe_transcription/hibt_test_marked"

# Loop through each transcript file name in the directory
for filename in os.listdir(dir_path):
    # Skip anything that is not a transcript so that no LLM call is spent on
    # stray entries; this matches the '.txt' filter used by the other cells.
    if not filename.endswith('.txt'):
        continue

    # Only the file NAME is sent to the model; the file itself is not opened.
    formatted_input = prompt_template.format_prompt(file_string=filename)
    response = llm.invoke(formatted_input.to_string())

    # parser.parse validates the reply against comp_guest; the JSON round-trip
    # turns the validated model into a plain dict.
    parsed_output = json.loads(parser.parse(response.content).json())

    # add the filename to the parsed output
    parsed_output['filename'] = filename
    parsed_filenames.append(parsed_output)

parsed_filenames

## Everything below this was initial tests for the different functions

In [26]:
# Four hand-picked episodes used to try out remove_ads.
test_transcripts = [
    "podscribe_transcription/hibt_main/Coinbase_ Brian Armstrong-transcript.txt",
    "podscribe_transcription/hibt_main/Casper_ Philip Krim-transcript.txt",
    "podscribe_transcription/hibt_main/Strava_ Mark Gainey and Michael Horvath-transcript.txt",
    "podscribe_transcription/hibt_main/Shopify_ Tobias Lütke-transcript.txt",
]

for source_path in test_transcripts:
    with open(source_path, 'r') as source_handle:
        raw_text = source_handle.read()

    ad_free_text = remove_ads(raw_text)

    # Same path, but pointing into the 'hibt_main_cleaned' folder.
    target_path = source_path.replace('hibt_main', 'hibt_main_cleaned')
    with open(target_path, 'w') as target_handle:
        target_handle.write(ad_free_text)

In [30]:
# Cleaned versions of the four test episodes.
test_transcripts = ["podscribe_transcription/hibt_main_cleaned/Coinbase_ Brian Armstrong-transcript.txt", 
                    "podscribe_transcription/hibt_main_cleaned/Casper_ Philip Krim-transcript.txt",
                    "podscribe_transcription/hibt_main_cleaned/Strava_ Mark Gainey and Michael Horvath-transcript.txt",
                    "podscribe_transcription/hibt_main_cleaned/Shopify_ Tobias Lütke-transcript.txt"]

# For each episode, print the detected host, the speaker with the most
# question marks, and the raw per-speaker counts behind both.
for file_path in test_transcripts:
    with open(file_path, 'r') as f:
        transcript_example = f.read()

    # NOTE(review): process_transcript is not defined in any visible cell of this
    # notebook; it looks like an earlier variant of identify_host that also
    # returned the count dictionaries. identify_host returns only two values, so
    # this cell will raise NameError on a fresh kernel unless process_transcript
    # is defined elsewhere — confirm or remove.
    host, most_questions, welcome_counts, question_counts = process_transcript(transcript_example)
    print(file_path)
    print("\t Host Speaker (based on 'welcome' phrases):", host)
    print("\t Speaker with Most Questions:", most_questions)
    print("\t Welcome Counts:", welcome_counts)
    print("\t Question Counts:", question_counts)
    print("\n")

podscribe_transcription/hibt_main_cleaned/Coinbase_ Brian Armstrong-transcript.txt
	 Host Speaker (based on 'welcome' phrases): 3
	 Speaker with Most Questions: 3
	 Welcome Counts: defaultdict(<class 'int'>, {'3': 2})
	 Question Counts: defaultdict(<class 'int'>, {'3': 70, '2': 50, '4': 0, '0': 0})


podscribe_transcription/hibt_main_cleaned/Casper_ Philip Krim-transcript.txt
	 Host Speaker (based on 'welcome' phrases): 2
	 Speaker with Most Questions: 2
	 Welcome Counts: defaultdict(<class 'int'>, {'2': 2})
	 Question Counts: defaultdict(<class 'int'>, {'2': 66, '1': 29, '3': 0, '4': 0, '5': 0, '0': 0, '6': 2})


podscribe_transcription/hibt_main_cleaned/Strava_ Mark Gainey and Michael Horvath-transcript.txt
	 Host Speaker (based on 'welcome' phrases): 6
	 Speaker with Most Questions: 6
	 Welcome Counts: defaultdict(<class 'int'>, {'6': 3})
	 Question Counts: defaultdict(<class 'int'>, {'4': 21, '5': 10, '6': 71, '2': 0})


podscribe_transcription/hibt_main_cleaned/Shopify_ Tobias Lüt

In [33]:
# Read in "podscribe_transcription/hibt_main_cleaned/Coinbase_ Brian Armstrong-transcript.txt"

with open('podscribe_transcription/hibt_main_cleaned/Coinbase_ Brian Armstrong-transcript.txt', 'r') as file:
    transcript_text = file.read()

# Example usage
host_speaker = '3'  # Speaker '3' is the host in the Coinbase episode (see the host-detection output above)

# Put a "###" line before each of the host's turns.
modified_transcript = insert_marker_before_host(transcript_text, host_speaker)

# NOTE(review): the folder name below is spelled 'hitb_main_marked' while every
# other folder in this notebook uses 'hibt_...' — confirm whether this is a typo
# or the real directory name before changing it.
with open('podscribe_transcription/hitb_main_marked/Coinbase_ Brian Armstrong-transcript.txt', 'w') as file:
    file.write(modified_transcript)


In [42]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter that cuts a marked transcript on the "###" marker that
# insert_marker_before_host places before each host turn.
# Sizes are measured in characters because length_function is len.
# As the warnings in the next cell's output show, a piece between two markers
# that is longer than chunk_size is still emitted as one oversized chunk.
text_splitter = CharacterTextSplitter(
    separator="###",  # literal marker string, not a regex (see is_separator_regex)
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)


In [43]:
# Split the marked Coinbase transcript (modified_transcript from the cell above)
# into documents and print each one for inspection.
texts = text_splitter.create_documents([modified_transcript])
for text in texts:
    print(text)

Created a chunk of size 2191, which is longer than the specified 2000
Created a chunk of size 2109, which is longer than the specified 2000
Created a chunk of size 2844, which is longer than the specified 2000
Created a chunk of size 2085, which is longer than the specified 2000
Created a chunk of size 2316, which is longer than the specified 2000
Created a chunk of size 3950, which is longer than the specified 2000


page_content="3 (1m 35s):\nThe stories we collectively believe in have made it possible to invent the idea of a nation or a people or even a faith. So let me ask you a question. What is the value of gold? Why does it cost almost $60 a gram? You might say it's pretty and shiny or that it's difficult to mine or that it's useful in certain industrial applications, but all of these factors apply to many other metals that are much cheaper. The reason we value gold is because we believe in a story about gold, a story that tells us it's valuable, same with diamonds, same with the dollar bills in your wallet.\n\n###\n3 (2m 19s):\nNone of these things have any intrinsic value, but they do have value because we all agree they do, which brings us to today's episode because we're going to be talking about something that is very challenging for many people to believe in, in part, because we can't see it or feel it digital ones and zeros with names like Bitcoin and Ethereum and polka dot and thousan

In [22]:
# Load "podscribe_transcription/hibt_main/Aviator Nation_ Paige Mycoskie-transcript.txt"
# as the transcript example.
with open('podscribe_transcription/hibt_main/Aviator Nation_ Paige Mycoskie-transcript.txt', 'r') as file:
    transcript_example = file.read()

# Apply remove_ads() to the transcript and save the cleaned transcript to the
# podscribe_transcription/hibt_main_cleaned folder.
cleaned_transcript = remove_ads(transcript_example)

# The output keeps the '.txt' extension: every other cell selects transcripts
# with fname.endswith('.txt'), so a file saved without it would be skipped.
with open('podscribe_transcription/hibt_main_cleaned/Aviator Nation_ Paige Mycoskie-transcript.txt', 'w') as file:
    file.write(cleaned_transcript)