In [1]:
# Import modules
import base64                # Base64 Decoder
import os
import urllib
import urllib.request        # `import urllib` alone does not load the request submodule
from urllib.parse import unquote
# from urllib.parse import urlparse

import moviepy.editor as mp  # Import video converter
import requests
import whisper               # Import audio transcription AI

In [2]:
def create_onedrive_directdownload(onedrive_link):
    """Build the OneDrive API content URL for a shared link.

    The link is UTF-8 encoded, converted to URL-safe base64 ('-' and '_'
    instead of '+' and '/'), stripped of its trailing '=' padding and then
    embedded in the ``shares/u!<token>/root/content`` endpoint.

    Args:
        onedrive_link: The sharing link as a string.

    Returns:
        The ``https://api.onedrive.com/v1.0/shares/...`` URL as a string.
    """
    encoded_link = base64.urlsafe_b64encode(onedrive_link.encode('utf-8'))
    share_token = encoded_link.decode('utf-8').rstrip("=")
    return f"https://api.onedrive.com/v1.0/shares/u!{share_token}/root/content"

def get_link_file_size(direct_file_link):
    """Return the size of a remote file in mebibytes (MiB), or None if unknown.

    A HEAD request is sent to ``direct_file_link`` and the ``Content-Length``
    response header (a byte count) is divided by 1024**2.

    Args:
        direct_file_link: URL of the file to inspect.

    Returns:
        The size in MiB as a float, or None when the response status is not
        200 or the server sent no usable ``Content-Length`` header.

    Raises:
        urllib.error.URLError: If the request itself fails (this includes
            HTTPError for 4XX/5XX responses raised by ``urlopen``).
    """
    req = urllib.request.Request(direct_file_link, method='HEAD')
    # The context manager closes the connection even on an early return.
    with urllib.request.urlopen(req) as response:
        if response.status != 200:
            return None
        content_length = response.headers.get('Content-Length')

    if content_length is None:
        # Servers are not required to send Content-Length.
        return None
    try:
        size_in_bytes = float(content_length)
    except ValueError:
        return None
    return size_in_bytes / (1024 ** 2)  # Change from bytes to mebibytes

def download(url: str, dest_folder: str, onedrive=False):
    """Download ``url`` into ``dest_folder`` and return the local file path.

    Args:
        url: Direct link to the file.
        dest_folder: Folder to save into; created if it does not exist.
        onedrive: When truthy, a HEAD request is made first and the file name
            is read from the part of the ``Content-Disposition`` header that
            follows ``''`` (the ``filename*=charset''name`` form). Otherwise,
            or when that header is missing or has no such part, the name is
            the last path segment of ``url`` with spaces replaced by ``_``.

    Returns:
        The path the file was (or would have been) written to. On an HTTP
        error status a message is printed and the path is still returned,
        but no file is written.
    """
    os.makedirs(dest_folder, exist_ok=True)  # create folder if it does not exist

    filename = ""
    if onedrive:
        head_req = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(head_req) as head_resp:
            disposition = head_resp.headers.get('Content-Disposition') or ""
        if "''" in disposition:
            filename = unquote(disposition.split("''")[-1])
    if not filename:
        filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names

    filename = filename.split('?')[0]
    # The name comes from the remote side: keep only its final component so
    # it cannot point outside dest_folder (e.g. "../../x" or "C:\\x").
    filename = os.path.basename(filename.replace("\\", "/"))
    if not filename:
        filename = "download"
    file_path = os.path.join(dest_folder, filename)

    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True) as r:
        if r.ok:
            print("saving to", os.path.abspath(file_path))
            # No per-chunk flush/fsync: forcing a disk sync every 8 KiB makes
            # large downloads extremely slow; closing the file flushes it.
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:
                        f.write(chunk)
        else:  # HTTP status code 4XX/5XX
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))
    return file_path

In [3]:
onedrive = input("Is the file shared on OneDrive? y / n: ")

# Upper bound for OneDrive downloads, in the unit get_link_file_size()
# returns (byte count divided by 1024**2).
MAX_FILE_SIZE = 4269
# Extensions that are converted to audio before transcription.
VIDEO_EXTENSIONS = {"mp4", "mov", "flv", "mkv", "wmv", "webm", "avi"}

# Anything other than an explicit "n" / "no" / "false" counts as OneDrive.
# (`x != "n" or "no" or "false"` is always true because "no" is truthy.)
if onedrive.strip().lower() not in ("n", "no", "false"):
    shared_link = input("What's the shared OneDrive link to the file? ")
    work_file_link = create_onedrive_directdownload(shared_link)
    work_file_size = get_link_file_size(work_file_link)
    if work_file_size is None:
        raise RuntimeError("Could not determine the size of the shared file.")
    if not (work_file_size < MAX_FILE_SIZE):
        print("File size too big! Terminating operation...")
        # Stop the cell explicitly instead of failing on an undefined name.
        raise RuntimeError("File size too big! Terminating operation...")
    work_file = download(work_file_link, "save", True)
else:
    work_file = input("What's the name of the file to be transcribed? ")

# splitext copes with extensions of any length (".webm" is five characters,
# so slicing off the last four would leave a stray dot fragment).
file_root, file_ext = os.path.splitext(work_file)

if file_ext.lower().lstrip(".") in VIDEO_EXTENSIONS:
    # Python code to convert video to audio
    ## Create name for output audio file
    audio_file = f"{file_root}.mp3"

    ## Insert Local Video File Path
    clip = mp.VideoFileClip(work_file)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in {work_file}")
        # Insert Local Audio File Path
        clip.audio.write_audiofile(audio_file)
    finally:
        # Release the reader held by the clip.
        clip.close()
else:
    # Not a known video container: use the file as the audio input directly.
    audio_file = work_file

saving to /work/save/Feeling Like Youre Never Enough.mp4
MoviePy - Writing audio in save/Feeling Like Youre Never Enough.mp3
                                                                      MoviePy - Done.


In [4]:
# Transcribe audio file
model = whisper.load_model("base")
# Decode the audio once and pass the waveform on; giving transcribe() the
# path again would make it decode the same file a second time.
audio = whisper.load_audio(audio_file)
result = model.transcribe(audio)

2022-09-22 13:16:13.277820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 13:16:13.466526: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-22 13:16:13.470618: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-22 13:16:13.470633: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [None]:
def _vtt_timestamp(seconds):
    """Format a time offset in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Works in whole milliseconds so that minutes and seconds always stay in
    the 00-59 range, every field is zero-padded, and rounding can never
    produce a "60.000" seconds field.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


# Write transcript to plain text file
with open(f"{work_file}.txt", 'w', encoding='utf-8') as transcript:
    transcript.write(result["text"])

# Write vtt subtitle file
# Collect the cues in a list and join once instead of growing a string.
cues = ["WEBVTT\n\n"]

for segment in result["segments"]:
    cue_start = _vtt_timestamp(segment["start"])
    cue_end = _vtt_timestamp(segment["end"])
    segment_text = segment["text"].strip()
    cues.append(f"{cue_start} --> {cue_end}\n{segment_text}\n\n")

subtitle = "".join(cues)

# Written as UTF-8 explicitly rather than the platform default encoding.
with open(f"{work_file}.vtt", 'w', encoding='utf-8') as fsub:
    fsub.write(subtitle)


In [8]:
# Clean up video/audio files used in OneDrive
# Only files this notebook downloaded are removed. The previous test
# (`x != "n" or "no" or "false"`) was always true, so it also deleted files
# the user supplied locally.
if onedrive.strip().lower() not in ("n", "no", "false"):
    for wfile in [work_file, audio_file]:
        # isfile also guards the case where both names are the same file.
        if os.path.isfile(wfile):
            os.remove(wfile)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=847d9e76-bbe2-432e-a3ee-480739ce897b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>