# Chat Podcast

Author: Kenneth Leung

## 03A. Pinecone Vectorstore
- Use Pinecone to build vectorstores of transcripts

___
## (1) Install and Import Dependencies

In [None]:
import json
import os
import pandas as pd
import pinecone
import yaml
from dotenv import load_dotenv
from pathlib import Path
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

___
## (2) Configuration Settings

In [4]:
# Config settings
TRANSCRIPT_PATH = '../transcripts'

In [None]:
os.environ['PINECONE_API_KEY'] = 'your_key_here'
os.environ['PINECONE_ENV'] = 'your_key_here'
os.environ['OPENAI_API_KEY']
# load_dotenv(dotenv_path='.env', verbose=True)

___
## (3) Processing of Transcripts

In [5]:
# View all transcribed files
transcripts = sorted([str(x) for x in Path(TRANSCRIPT_PATH).glob('*.jsonl')])
transcripts

["..\\transcripts\\A Third Path to Talent Development - Delta's Michelle McCrackin.jsonl",
 "..\\transcripts\\AI in Aerospace - Boeing's Helen Lee.jsonl",
 "..\\transcripts\\AI in Your Living Room - Peloton's Sanjay Nichani.jsonl",
 "..\\transcripts\\Big Data in Agriculture - Land O'Lakes' Teddy Bekele.jsonl",
 "..\\transcripts\\Choreographing Human-Machine Collaboration - Spotify's Sidney Madison Prescott.jsonl",
 "..\\transcripts\\Digital First, Physical Second - Wayfair's Fiona Tan.jsonl",
 "..\\transcripts\\Extreme Innovation with AI - Stanley Black and Decker's Mark Maybury.jsonl",
 "..\\transcripts\\From Data to Wisdom - Novo Nordisk's Tonia Sideri.jsonl",
 "..\\transcripts\\From Journalism to Jeans - Levi Strauss' Katia Walsh.jsonl",
 "..\\transcripts\\Helping Doctors Make Better Decisions with Data - UC Berkley's Ziad Obermeyer.jsonl",
 "..\\transcripts\\Imagining Furniture (and the Future) with AI - IKEA Retail's Barbara Martin Coppola.jsonl",
 "..\\transcripts\\Inventing the 

In [6]:
lines = []

# Combine all JSONL files together
for transcript in transcripts:
    with open(transcript, "r", encoding="utf-8") as fp:
        for line in fp:
            line = json.loads(line) # Convert string dictionary to dict
            lines.append(line)

In [7]:
print(len(lines))

7152


In [8]:
lines[6]

{'title': "A Third Path to Talent Development - Delta's Michelle McCrackin",
 'date': 'Mar-23',
 'url': 'https://open.spotify.com/episode/50oRprIC6z0wJkpfLFQHDi',
 'id': "A Third Path to Talent Development - Delta's Michelle McCrackin-t32.56",
 'text': "I'm also the AI and Business Strategy guest editor at MIT Sloan Management Review.",
 'start': 32.56,
 'end': 38.019999999999996}

In [11]:
# Check text in every segment
for chunk in lines[5:8]:
    print(chunk['text'])

I'm Sam Ransbotham, Professor of Analytics at Boston College.
I'm also the AI and Business Strategy guest editor at MIT Sloan Management Review.
And I'm Shervin Kottubande, senior partner with BCG and one of the leaders of our AI business.


___
## (4) Extend Segment Texts
- We do not want each segment to be only one phrase/sentence long
- To make the indexing more useful and logical, we combine the texts of multiple segments together

In [12]:
# Chunking and striding
new_segments = []

chunk_size = 6  # No. of segment texts to combine
chunk_overlap = 3  # No. of segment texts to overlap

for i in range(0, len(lines), chunk_overlap):
    i_end = min(len(lines)-1, i + chunk_size)
    if lines[i]['title'] != lines[i_end]['title']:
        # Skip if audio file names are same
        continue
    text_list = []
    for chunk in lines[i:i_end]:
        text_list.append(chunk['text'])
    text = ' '.join(text_list)
    new_segments.append({
        'start': lines[i]['start'],
        'end': lines[i_end]['end'],
        'title': lines[i]['title'],
        'text': text,
        'id': lines[i]['id'],
        'url': lines[i]['url'],
        'date': lines[i]['date']
    })

In [13]:
len(new_segments)

2342

In [14]:
new_segments[0]

{'start': 0.0,
 'end': 38.019999999999996,
 'title': "A Third Path to Talent Development - Delta's Michelle McCrackin",
 'text': "How can organizations take advantage of existing deep domain knowledge? Find out how one airline is upscaling its frontline workforce on today's episode. I'm Michelle McCracken from Delta Airlines and you're listening to Me, Myself and AI. Welcome to Me, Myself and AI, a podcast on artificial intelligence and business. Each episode we introduce you to someone innovating with AI. I'm Sam Ransbotham, Professor of Analytics at Boston College.",
 'id': "A Third Path to Talent Development - Delta's Michelle McCrackin-t0.0",
 'url': 'https://open.spotify.com/episode/50oRprIC6z0wJkpfLFQHDi',
 'date': 'Mar-23'}

___
## (5) Setup Vectorstore with Pinecone

In [None]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [None]:
# Initialize pinecone instance
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_ENV'])

index_name = "chat-podcast"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        1536, # Dimensions of OpenAI embeddings
        metric="cosine"
    )

index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
new_segments[0]

{'start': 0.0,
 'end': 38.019999999999996,
 'title': "A Third Path to Talent Development - Delta's Michelle McCrackin",
 'text': "How can organizations take advantage of existing deep domain knowledge? Find out how one airline is upscaling its frontline workforce on today's episode. I'm Michelle McCracken from Delta Airlines and you're listening to Me, Myself and AI. Welcome to Me, Myself and AI, a podcast on artificial intelligence and business. Each episode we introduce you to someone innovating with AI. I'm Sam Ransbotham, Professor of Analytics at Boston College.",
 'id': "A Third Path to Talent Development - Delta's Michelle McCrackin-t0.0",
 'url': 'https://open.spotify.com/episode/50oRprIC6z0wJkpfLFQHDi',
 'date': 'Mar-23'}

In [None]:
# Convert segments into three lists for vectorstore upsert
texts = [elem['text'] for elem in new_segments]
ids = [elem['id'] for elem in new_segments]
metadatas = [{
            "text": elem["text"],
            "start": elem["start"],
            "end": elem["end"],
            "url": elem["url"],
            "date": elem["date"],
            "title": elem["title"]
            } for elem in new_segments]

In [None]:
docsearch = Pinecone.from_texts(texts=texts, 
                                embedding=embeddings, 
                                metadatas=metadatas,
                                ids=ids,
                                index_name=index_name)

In [None]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2342}},
 'total_vector_count': 2342}

___
## (6) Check using Vector Similarity Search

In [None]:
query = "Which guest was invited to talk about the airline industry?"
docs = docsearch.similarity_search(query)

In [None]:
print(docs[0].page_content)

Shervin are excited to be talking today with Helen Li, Regional Director of Air Traffic Management and Airport Programs in China for the Boeing Company. Helen, thanks for taking the time to talk with us. Welcome. Thank you for having me. Let's get started. Helen, can you tell us about your current role at Boeing? I currently work at Boeing China in the Beijing office.


In [None]:
# References
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/pinecone.py