-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
youtube.ts
126 lines (116 loc) · 3.67 KB
/
youtube.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import { TranscriptResponse, YoutubeTranscript } from "youtube-transcript";
import { Innertube } from "youtubei.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
/**
* Configuration options for the YoutubeLoader class. Includes properties
* such as the videoId, language, and addVideoInfo.
*/
interface YoutubeConfig {
/**
* ID of the video. It is passed to both the transcript fetch and the
* video-info lookup, and is recorded as the `source` of the metadata.
*/
videoId: string;
/**
* Forwarded as the `lang` option of the transcript fetch; stays undefined
* when omitted.
*/
language?: string;
/**
* When true, loading also fetches basic video info and copies it into the
* document metadata. YoutubeLoader treats an omitted value as false.
*/
addVideoInfo?: boolean;
}
/**
* Metadata of a YouTube video. Includes properties such as the source
* (videoId), description, title, view_count, author, and category.
* Only `source` is always set; the remaining fields are optional.
*/
interface VideoMetadata {
/** The videoId the loader was configured with. */
source: string;
/** Copied from the `short_description` of the fetched basic video info. */
description?: string;
/** Copied from the `title` of the fetched basic video info. */
title?: string;
/** Copied from the `view_count` of the fetched basic video info. */
view_count?: number;
/** Copied from the `author` of the fetched basic video info. */
author?: string;
/**
* NOTE(review): declared here, but the loader code in this file never
* assigns it — confirm whether it should be populated.
*/
category?: string;
}
/**
 * A document loader for loading data from YouTube videos. It uses the
 * youtube-transcript and youtubei.js libraries to fetch the transcript
 * and video metadata.
 *
 * The constructor takes a configuration object keyed by `videoId`; use
 * `YoutubeLoader.createFromUrl` when starting from a video URL.
 * @example
 * ```typescript
 * const loader = YoutubeLoader.createFromUrl(
 *   "https://youtu.be/bZQun8Y4L2A",
 *   { language: "en", addVideoInfo: true }
 * );
 * const docs = await loader.load();
 * ```
 */
export class YoutubeLoader extends BaseDocumentLoader {
  private readonly videoId: string;

  private readonly language?: string;

  private readonly addVideoInfo: boolean;

  /**
   * @param config The video to load plus optional transcript language and
   * a flag controlling whether video info is added to the metadata
   * (defaults to false).
   */
  constructor(config: YoutubeConfig) {
    super();
    this.videoId = config.videoId;
    this.language = config.language;
    this.addVideoInfo = config.addVideoInfo ?? false;
  }

  /**
   * Extracts the videoId from a YouTube video URL.
   *
   * Recognised URL shapes: `youtu.be/<id>`, `/v/<id>`, `/u/<c>/<id>`,
   * `/embed/<id>`, `/shorts/<id>`, `/live/<id>` and `watch?v=<id>`.
   * @param url The URL of the YouTube video.
   * @returns The videoId of the YouTube video.
   * @throws Error if no 11-character id can be extracted from the URL.
   */
  private static getVideoID(url: string): string {
    // The dot in "youtu.be" is escaped so it only matches a literal dot.
    const match = url.match(
      /.*(?:youtu\.be\/|v\/|u\/\w\/|embed\/|shorts\/|live\/|watch\?v=)([^#&?]*).*/
    );
    const candidate = match?.[1];
    // Only an 11-character capture is accepted as a video id.
    if (candidate !== undefined && candidate.length === 11) {
      return candidate;
    }
    throw new Error("Failed to get youtube video id from the url");
  }

  /**
   * Creates a new instance of the YoutubeLoader class from a YouTube video
   * URL.
   * @param url The URL of the YouTube video.
   * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.
   * @returns A new instance of the YoutubeLoader class.
   * @throws Error if the videoId cannot be extracted from the URL.
   */
  static createFromUrl(
    url: string,
    config?: Omit<YoutubeConfig, "videoId">
  ): YoutubeLoader {
    const videoId = YoutubeLoader.getVideoID(url);
    return new YoutubeLoader({ ...config, videoId });
  }

  /**
   * Loads the transcript and video metadata from the specified YouTube
   * video. It uses the youtube-transcript library to fetch the transcript
   * and the youtubei.js library to fetch the video metadata.
   * @returns An array holding a single Document whose pageContent is the
   * transcript segments joined by single spaces.
   * @throws Error if the transcript or the video info cannot be fetched;
   * the underlying failure's message is appended to the error message.
   */
  async load(): Promise<Document[]> {
    let transcript: TranscriptResponse[] | undefined;
    const metadata: VideoMetadata = {
      source: this.videoId,
    };
    try {
      transcript = await YoutubeTranscript.fetchTranscript(this.videoId, {
        lang: this.language,
      });
      if (transcript === undefined) {
        throw new Error("Transcription not found");
      }
      if (this.addVideoInfo) {
        const youtube = await Innertube.create();
        const info = (await youtube.getBasicInfo(this.videoId)).basic_info;
        metadata.description = info.short_description;
        metadata.title = info.title;
        metadata.view_count = info.view_count;
        metadata.author = info.author;
      }
    } catch (e: unknown) {
      // Narrow instead of asserting: a thrown non-Error value would otherwise
      // be reported as "undefined".
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to get YouTube video transcription: ${reason}`);
    }
    const document = new Document({
      pageContent: transcript.map((item) => item.text).join(" "),
      metadata,
    });
    return [document];
  }
}