### IMPORTS

In [1]:
import os
import pytube
import speech_recognition as sr
import pandas as pd
import time

from pydub import AudioSegment
#Local version of the file containing the secret key
import gitignore as g

# Show up to 400 characters per cell so long transcription text is not cut off.
pd.options.display.max_colwidth = 400

In [2]:
from openai import OpenAI

# Build the shared OpenAI client used by the transcription and summary helpers.
# The key is read from the local `gitignore` module (imported as `g`) rather
# than being hard-coded in the notebook.
openai_api_key = g.OPENAI_API_KEY
client = OpenAI(api_key=openai_api_key)

# List all videos from a YouTube channel

In [3]:
from googleapiclient.discovery import build

def get_channel_videos(api_key, channel_id):
    """Return the watch URL of every video in a channel's uploads playlist.

    Args:
        api_key: Developer key passed to the YouTube Data API v3 client.
        channel_id: ID of the channel whose uploads are listed.

    Returns:
        list[str]: One ``https://www.youtube.com/watch?v=<video id>`` URL per
        playlist item, in the order the API returns them.

    Raises:
        ValueError: If the API returns no channel for ``channel_id``.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    # The uploads of a channel are exposed as a playlist; look up its ID first.
    channel_response = youtube.channels().list(
        part='contentDetails',
        id=channel_id
    ).execute()

    # An unknown channel yields an empty (or absent) 'items' list; fail with a
    # clear message instead of an opaque IndexError/KeyError.
    channels = channel_response.get('items') or []
    if not channels:
        raise ValueError(f"No YouTube channel found for channel_id={channel_id!r}")
    uploads_playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    next_page_token = None

    # Walk the playlist page by page until the API stops returning a token.
    while True:
        page = youtube.playlistItems().list(
            part='snippet',
            playlistId=uploads_playlist_id,
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        videos.extend(
            f"https://www.youtube.com/watch?v={item['snippet']['resourceId']['videoId']}"
            for item in page.get('items', [])
        )

        next_page_token = page.get('nextPageToken')
        if next_page_token is None:
            break

    return videos


In [4]:
# Collect the URL of every upload on the target channel; the API key comes
# from the local secrets module.
youtube_videos_urls = get_channel_videos(
    api_key=g.YOUTUBE_API_KEY,
    channel_id='UCbZdXox6mKHdcT2QdVT-goQ',
)

# Useful sound treatment functions

In [5]:
# Useful sound treatment functions


def download_youtube_video(video_url, output_path='output'):
    """Download the first audio-only stream of a YouTube video.

    Args:
        video_url: URL of the video to fetch.
        output_path: Directory the file is saved into; created if missing.

    Returns:
        The value returned by the stream's ``download`` call (the path of the
        downloaded file).

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    # exist_ok avoids the race of a separate exists-check followed by makedirs.
    os.makedirs(output_path, exist_ok=True)

    yt = pytube.YouTube(video_url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    # first() gives None when nothing matches the filter; report that clearly
    # instead of failing with an AttributeError on None.download.
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {video_url}")
    return audio_stream.download(output_path)

def convert_audio_to_wav(audio_file, output_format='wav'):
    """Re-encode an audio file next to the original, changing only its extension.

    Args:
        audio_file: Path of the audio file to load.
        output_format: Target format, also used as the new file extension.

    Returns:
        Path of the exported file (``audio_file`` with its extension replaced
        by ``output_format``).
    """
    segment = AudioSegment.from_file(audio_file)
    stem, _old_extension = os.path.splitext(audio_file)
    target_path = f"{stem}.{output_format}"
    segment.export(target_path, format=output_format)
    return target_path

def transcribe_audio_chunk(audio_chunk):
    """Transcribe one audio chunk with the ``whisper-1`` model.

    Args:
        audio_chunk: Object passed as the ``file`` argument of the
            transcription request (the caller passes a file opened in binary
            mode).

    Returns:
        The transcribed text, or ``"(RequestError: <details>)"`` when the
        OpenAI request fails, so one bad chunk does not abort the whole video.
    """
    # The previous handlers caught speech_recognition exceptions, which the
    # OpenAI client never raises. OpenAIError is the base class of the errors
    # raised by the openai package; it is imported locally because the
    # notebook only imports the OpenAI class at the top.
    from openai import OpenAIError

    try:
        response = client.audio.transcriptions.create(
            model="whisper-1", file=audio_chunk)
    except OpenAIError as e:
        return f"(RequestError: {e})"
    return response.text


def transcribe_audio(audio_file, video_url, chunk_length=30000):
 
    audio = AudioSegment.from_wav(audio_file)
    transcriptions = []

    for i in range(0, len(audio), chunk_length):
        start_time = i / 1000  # Convert to seconds
        audio_chunk = audio[i:i + chunk_length]
        audio_chunk_wav = f"temp_chunk_{i}.wav"
        audio_chunk.export(audio_chunk_wav, format="wav")
        with open(audio_chunk_wav, "rb") as audio_file:
            text = transcribe_audio_chunk( audio_file)
            transcriptions.append({'start_time': start_time, 'text': text, 'video_url': video_url})

        # Remove the temporary WAV file
        os.remove(audio_chunk_wav)
    
    return transcriptions


# Capture all videos and extract text:

In [6]:
# Report how many URLs the channel listing returned before the long run starts.
print(f"Videos to process: {len(youtube_videos_urls)}")

Videos to process: 447


In [8]:

def main(video_urls):
    """Download, transcribe and record every video in ``video_urls``.

    For each URL the audio is downloaded, converted to WAV and transcribed;
    the resulting rows are appended to ``transcriptions.csv`` in the current
    working directory (the header is written only when the file does not
    exist yet). A video that fails is reported and skipped so the remaining
    videos are still processed.

    Args:
        video_urls: Iterable of YouTube watch URLs.
    """
    main_csv_file = 'transcriptions.csv'

    for i, video_url in enumerate(video_urls, start=1):
        audio_file = None
        wav_file = None
        try:
            print(f"Processing video n° {i}: {video_url}")
            audio_file = download_youtube_video(video_url)
            wav_file = convert_audio_to_wav(audio_file)

            transcriptions = transcribe_audio(wav_file, video_url)

            # Append straight from pandas so the file is written with a single,
            # consistent encoding instead of being re-read with the locale
            # default and copied as text.
            df = pd.DataFrame(transcriptions)
            df.to_csv(
                main_csv_file,
                mode='a',
                index=False,
                header=not os.path.exists(main_csv_file),
            )
        except Exception as e:
            # `Exception` rather than a bare except: Ctrl-C / kernel interrupt
            # must still be able to stop a run over hundreds of videos.
            print('This video can not be captured')
            print(f"  reason: {type(e).__name__}: {e}")
        finally:
            # Remove the temporary audio files whether or not the video succeeded.
            for temp_file in (audio_file, wav_file):
                if temp_file and os.path.exists(temp_file):
                    os.remove(temp_file)

    print("Transcription completed and saved to individual and main CSV files.")

if __name__ == "__main__":
    # Transcribe every video URL collected from the channel listing.
    main(youtube_videos_urls)

Processing video n° 1: https://www.youtube.com/watch?v=4dAGl7RLWHc
Processing video n° 2: https://www.youtube.com/watch?v=W70rkratv0Q
Processing video n° 3: https://www.youtube.com/watch?v=9NUXbsiXo4U
Processing video n° 4: https://www.youtube.com/watch?v=MzTzZissnac
Processing video n° 5: https://www.youtube.com/watch?v=gi4m2MODsCA
Processing video n° 6: https://www.youtube.com/watch?v=CYYqjl9wRUM
Processing video n° 7: https://www.youtube.com/watch?v=AroRz9r9MLw
Processing video n° 8: https://www.youtube.com/watch?v=CMuSBWWp9Qc
Processing video n° 9: https://www.youtube.com/watch?v=zzKFJEPzZUM
Processing video n° 10: https://www.youtube.com/watch?v=YffxKI0Uwl0
Processing video n° 11: https://www.youtube.com/watch?v=w3HaMoO6j3g
Processing video n° 12: https://www.youtube.com/watch?v=qWpyKxVqNMk
Processing video n° 13: https://www.youtube.com/watch?v=bicaJPdpqIE
Processing video n° 14: https://www.youtube.com/watch?v=GorutlUSDFs
Processing video n° 15: https://www.youtube.com/watch?v=J

# Add a one-sentence summary of each text chunk with ChatGPT

In [7]:
def summarize_text(text, max_retries=None, retry_delay=10):
    """Ask ``gpt-4-turbo`` for a one-sentence summary of ``text``.

    Failed requests are retried after a pause.

    Args:
        text: Text inserted into the summarisation prompt.
        max_retries: Number of retries allowed after a failed attempt before
            the last error is re-raised. ``None`` (the default) retries
            indefinitely, which is the original behaviour.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The content of the first choice of the chat completion.

    Raises:
        Exception: The last request error, once ``max_retries`` is exhausted.
    """
    failures = 0
    while True:
        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[{'role':'user', 'content':f"sum up in one sentence the following text: {text}"}])
            return response.choices[0].message.content

        except Exception as e:
            # `Exception` rather than a bare except, so Ctrl-C / kernel
            # interrupt can break out of the retry loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Show the cause: a permanent error would otherwise loop silently.
            print(f"request failed ({type(e).__name__}: {e})")
            print(f"let's have a {retry_delay}sec. nap, shall we?")
            time.sleep(retry_delay)

In [8]:

def add_summaries_to_transcriptions(file_in, file_out):
    """Write a copy of a transcription CSV with an added ``summary`` column.

    Args:
        file_in: Path of the CSV to read; it must contain a ``text`` column.
        file_out: Path the CSV with the extra ``summary`` column is written to.
    """
    transcripts = pd.read_csv(file_in)

    # One summarize_text call per row, in row order.
    summaries = transcripts['text'].map(summarize_text)

    transcripts.assign(summary=summaries).to_csv(file_out, index=False)
    print(f"Summaries added and saved to {file_out}.")

# Summarise every transcribed chunk and write the result to a separate CSV.
add_summaries_to_transcriptions(
    file_in='transcriptions.csv',
    file_out='transcriptions_with_summary.csv',
)

Summaries added and saved to transcriptions_with_summary.csv.


### end of script