-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
youtube.ts
132 lines (121 loc) · 3.87 KB
/
youtube.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import { TranscriptResponse, YoutubeTranscript } from "youtube-transcript";
import { Innertube } from "youtubei.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "../base.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Runs once when this module is loaded: hands the old entrypoint name and the
// new package name to logVersion020MigrationWarning (presumably so it can emit
// a migration notice — confirm in util/entrypoint_deprecation).
// The #__PURE__ annotation is the conventional bundler hint that the call may
// be dropped when unused; keep it directly in front of the call expression.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/web/youtube",
newPackageName: "@langchain/community",
});
/**
* Configuration options for the YoutubeLoader class. Includes properties
* such as the videoId, language, and addVideoInfo.
*/
interface YoutubeConfig {
/** Identifier of the video; passed to the transcript fetch and stored as the document's `source`. */
videoId: string;
/** Optional transcript language, forwarded as the `lang` option of the transcript fetch. */
language?: string;
/** When true, basic video info is also fetched and added to the metadata. Treated as false when omitted. */
addVideoInfo?: boolean;
}
/**
* Metadata of a YouTube video. Includes properties such as the source
* (videoId), description, title, view_count, author, and category.
* Only `source` is always present; the loader fills the other fields
* solely when video info is requested.
*/
interface VideoMetadata {
/** The videoId the transcript was loaded for. */
source: string;
/** Taken from the `short_description` of the fetched basic info. */
description?: string;
/** Taken from the `title` of the fetched basic info. */
title?: string;
/** Taken from the `view_count` of the fetched basic info. */
view_count?: number;
/** Taken from the `author` of the fetched basic info. */
author?: string;
/** NOTE(review): declared but never assigned by the loader code in this file. */
category?: string;
}
/**
 * A document loader for loading data from YouTube videos. It uses the
 * youtube-transcript and youtubei.js libraries to fetch the transcript
 * and video metadata.
 *
 * The constructor takes a config object keyed by `videoId`; use
 * `createFromUrl` when starting from a video URL.
 * @example
 * ```typescript
 * const loader = YoutubeLoader.createFromUrl(
 *   "https://youtu.be/bZQun8Y4L2A",
 *   { language: "en", addVideoInfo: true }
 * );
 * const docs = await loader.load();
 * ```
 */
export class YoutubeLoader extends BaseDocumentLoader {
  private readonly videoId: string;

  private readonly language?: string;

  private readonly addVideoInfo: boolean;

  /**
   * @param config The video to load plus optional transcript language and
   * whether to enrich the metadata with basic video info (default: false).
   */
  constructor(config: YoutubeConfig) {
    super();
    this.videoId = config.videoId;
    this.language = config.language;
    this.addVideoInfo = config.addVideoInfo ?? false;
  }

  /**
   * Extracts the videoId from a YouTube video URL.
   * @param url The URL of the YouTube video.
   * @returns The videoId of the YouTube video.
   * @throws Error if no id is found in the URL or the candidate id is not
   * exactly 11 characters long.
   */
  private static getVideoID(url: string): string {
    // The dot in "youtu.be" is escaped so it only matches a literal dot.
    const match = url.match(
      /.*(?:youtu\.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
    );
    const candidate = match?.[1];
    if (candidate !== undefined && candidate.length === 11) {
      return candidate;
    }
    throw new Error("Failed to get youtube video id from the url");
  }

  /**
   * Creates a new instance of the YoutubeLoader class from a YouTube video
   * URL.
   * @param url The URL of the YouTube video.
   * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.
   * @returns A new instance of the YoutubeLoader class.
   * @throws Error if a videoId cannot be extracted from the URL.
   */
  static createFromUrl(
    url: string,
    config?: Omit<YoutubeConfig, "videoId">
  ): YoutubeLoader {
    const videoId = YoutubeLoader.getVideoID(url);
    return new YoutubeLoader({ ...config, videoId });
  }

  /**
   * Loads the transcript and video metadata from the specified YouTube
   * video. It uses the youtube-transcript library to fetch the transcript
   * and the youtubei.js library to fetch the video metadata.
   * @returns An array holding a single Document whose page content is the
   * transcript segments joined with spaces.
   * @throws Error prefixed with "Failed to get YouTube video transcription"
   * when the transcript or the optional video info cannot be fetched.
   */
  async load(): Promise<Document[]> {
    let transcript: TranscriptResponse[] | undefined;
    const metadata: VideoMetadata = {
      source: this.videoId,
    };
    try {
      transcript = await YoutubeTranscript.fetchTranscript(this.videoId, {
        lang: this.language,
      });
      if (transcript === undefined) {
        throw new Error("Transcription not found");
      }
      if (this.addVideoInfo) {
        const youtube = await Innertube.create();
        const info = (await youtube.getBasicInfo(this.videoId)).basic_info;
        metadata.description = info.short_description;
        metadata.title = info.title;
        metadata.view_count = info.view_count;
        metadata.author = info.author;
      }
    } catch (e: unknown) {
      // Narrow instead of asserting: a thrown non-Error value would otherwise
      // surface as "undefined" in the message.
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Failed to get YouTube video transcription: ${reason}`);
    }
    const document = new Document({
      pageContent: transcript.map((item) => item.text).join(" "),
      metadata,
    });
    return [document];
  }
}