# Whisper+Nemo+ChatGPT实现音视频内容总结

[![Open in Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/code/cybozucyao/sample-kintone-narrator-cn/notebook)  

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kintone-samples/SAMPLE-kintone-narrator-cn/blob/main/whisper-nemo-chatgpt.ipynb)

## 安装依赖

In [1]:
# Downloader used to fetch the source audio/video.
!pip install yt_dlp
# Demucs: imported below as demucs.separate to isolate the vocal stem.
!pip install git+https://github.com/facebookresearch/demucs#egg=demucs
# OpenAI client and tokenizer used by the summarization step.
!pip install openai tiktoken
!pip3 install torch torchvision torchaudio
# WhisperX (alignment) and faster-whisper (transcription).
!pip install git+https://github.com/m-bain/whisperx.git
!pip install faster_whisper
# System packages; ffmpeg is invoked below to resample the audio.
!apt install -y ffmpeg sox libsndfile1
!pip install --upgrade hydra-core llvmlite omegaconf --ignore-installed
# NeMo is installed from the main branch; the diarization step imports it.
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@main
!pip install --upgrade Cython jiwer braceexpand webdataset librosa sentencepiece
!pip install --upgrade youtokentome pyannote-audio transformers pandas inflect editdistance
!pip install -U pytorch-lightning

Collecting yt_dlp
  Downloading yt_dlp-2023.7.6-py2.py3-none-any.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mutagen (from yt_dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycryptodomex (from yt_dlp)
  Downloading pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
Collecting brotli (from yt_dlp)
  Downloading Brotli-1.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInst

## 下载视频，你也可以选择上传文件

In [2]:
import yt_dlp

# URL of the video whose audio track will be transcribed and summarized.
yt_url = 'https://www.youtube.com/watch?v=tgdJkAx3fJM'

# Prefer an audio-only stream, name the file after the video id, and let the
# FFmpegExtractAudio post-processor convert the result to WAV.
ydl_opts = {
    'format': 'm4a/bestaudio/best',
    'outtmpl': '%(id)s.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # A single call downloads the media AND returns its metadata. The previous
    # download() + extract_info(download=False) pair resolved the same URL
    # twice, doubling the network round trips.
    video_info = ydl.extract_info(yt_url, download=True)

# Kept so that code referring to `error_code` keeps working.
# NOTE(review): with these options a failed download is expected to raise
# instead of returning a non-zero code - confirm against the yt-dlp docs.
error_code = 0

# The post-processor replaces the original extension with ".wav".
file_name = f"{video_info['id']}.wav"

[youtube] Extracting URL: https://www.youtube.com/watch?v=tgdJkAx3fJM
[youtube] tgdJkAx3fJM: Downloading webpage
[youtube] tgdJkAx3fJM: Downloading ios player API JSON
[youtube] tgdJkAx3fJM: Downloading android player API JSON
[youtube] tgdJkAx3fJM: Downloading m3u8 information
[info] tgdJkAx3fJM: Downloading 1 format(s): 140
[download] Destination: tgdJkAx3fJM.m4a
[download] 100% of    7.41MiB in 00:00:00 at 21.17MiB/s  
[FixupM4a] Correcting container of "tgdJkAx3fJM.m4a"
[ExtractAudio] Destination: tgdJkAx3fJM.wav
Deleting original file tgdJkAx3fJM.m4a (pass -k to keep)
[youtube] Extracting URL: https://www.youtube.com/watch?v=tgdJkAx3fJM
[youtube] tgdJkAx3fJM: Downloading webpage
[youtube] tgdJkAx3fJM: Downloading ios player API JSON
[youtube] tgdJkAx3fJM: Downloading android player API JSON
[youtube] tgdJkAx3fJM: Downloading m3u8 information


## 对工作目录下的最新音/视频文件进行预处理

In [3]:
import os
import demucs.separate
import shlex

file_extension = ['.mp4', '.wav']
allowed_files = [file for file in os.listdir() if any(file.lower().endswith(ext) for ext in file_extension)]
input_file = max(allowed_files, key=lambda file: os.path.getctime(file))

demucs.separate.main(shlex.split(f'-n htdemucs --two-stems=vocals "{input_file}" -o "temp_outputs"'))
input_file = os.path.join(
        "temp_outputs", "htdemucs", os.path.basename(input_file[:-4]), "vocals.wav")

audio_file = "audio_16k.wav"
!rm -rf {audio_file}
!ffmpeg -i {input_file} -ac 1 -ar 16000 {audio_file}

Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th
100%|██████████| 80.2M/80.2M [00:00<00:00, 146MB/s] 


Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /kaggle/working/temp_outputs/htdemucs
Separating track tgdJkAx3fJM.wav


100%|██████████████████████████████████████████████| 485.54999999999995/485.54999999999995 [00:24<00:00, 19.52seconds/s]


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

## 语音转录&对齐


In [4]:
# Transcribe `audio_file` with faster-whisper, then refine the segment timings
# with a WhisperX alignment model. The result (`word_ts`) is consumed by the
# merge step, which reads the 'start', 'end' and 'text' keys of each entry.
from faster_whisper import WhisperModel
import whisperx
import torch

# Runtime settings for the Whisper model.
# NOTE(review): "cuda"/"float16" presumably require a GPU runtime - confirm.
device = "cuda"
compute_type = "float16"
whisper_model = "medium"

model = WhisperModel(whisper_model, device=device, compute_type=compute_type)
# `info.language` (the detected language) is used below to choose the
# alignment model.
segments, info = model.transcribe(audio_file, beam_size=1, word_timestamps=False,vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))

whisper_results = []

# Print every segment and keep it as a plain dict for whisperx.align().
for segment in segments:
  print("[%.2fs -> %.2fs] 「%s」\n" % (segment.start, segment.end, segment.text))
  whisper_results.append(segment._asdict())


# Drop the Whisper model and empty the CUDA cache before the alignment model
# is loaded.
del model
torch.cuda.empty_cache()

alignment_model, metadata = whisperx.load_align_model(language_code=info.language, device=device)
result_aligned = whisperx.align(whisper_results, alignment_model, metadata, audio_file, device)
# Despite the name, these are the aligned *segments* of the result.
word_ts = result_aligned["segments"]

# Same clean-up for the alignment model before the next step runs.
del alignment_model
torch.cuda.empty_cache()

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
[NeMo W 2023-09-05 05:17:58 transformer_bpe_models:59] Could not import NeMo NLP collection which is required for speech translation model.


Downloading (…)90c7f7fb/config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

Downloading (…)7f7fb/vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

Downloading (…)7f7fb/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

[0.94s -> 3.60s] 「計算式プラグインはやっぱり便利ですか?」

[3.60s -> 8.42s] 「便利ですね。やっぱり元々のKintonの計算フィールドって結局、」

[8.42s -> 12.52s] 「文字列1行と計算フィールドにしか入らないんですけど。」

[12.52s -> 16.04s] 「はい、もうKintonさんある。」

[16.04s -> 19.16s] 「今日はですね、後ろまたなべちゃんを呼んで、」

[19.16s -> 23.10s] 「備品管理のKinton活用事例っていうのをご紹介していきたいなと思います。」

[23.10s -> 29.68s] 「備品管理だけじゃなくて、僕らが最近教えている計算式プラグインを使った事例となりますので、」

[29.68s -> 33.78s] 「こんなこともできるんだよっていうのを参考になればいいかなと思っております。」

[33.78s -> 38.48s] 「うちのなべちゃんはですね、クルーデータとか、計算式プラグインをひたすら使いまくってますから、」

[38.48s -> 42.16s] 「それ実際にお客さんの導入事例としてあるということだったので、」

[42.16s -> 44.82s] 「持ち込み企画として動画に撮っております。」

[44.82s -> 49.46s] 「ということで、じゃあなべちゃん、まずは実際に概要の部分だけ共有してもらった上で、」

[49.46s -> 52.40s] 「画面共有してデモを見せる順番でお願いします。」

[52.40s -> 53.94s] 「アプリの概要ですね。」

[53.94s -> 58.18s] 「わかりました。アプリ的には、社内の備品管理っていうところを目指しています。」

[58.18s -> 62.48s] 「外国人向けの日本語教育されている会社さんの事例で作ったやつで、」

[62.48s -> 67.18s] 「その日本語のテティスト周り、同じテティストも10冊ぐらい持っていたとして、」

[67.18s -> 70.12s] 「その10冊それぞれに番号を付けて管理しています。」

[70.12s -> 74.66s] 「今誰が借りてるっていうのがわかるようにしてほしいっていう話があったので、」

[74

Downloading (…)rocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/29.3k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

## NeMo说话人分离（Speaker Diarization）

In [5]:
import os
import json
import wget
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import ClusteringDiarizer
import torch

# Base name of the audio file without extension; for "audio_16k.wav" this
# yields the same "audio_16k.rttm" path that used to be hard-coded.
audio_stem = os.path.splitext(os.path.basename(audio_file))[0]

# Single-entry manifest describing the file to diarize.
diarize_manifest = {
    'audio_filepath': f'./{audio_file}',
    'offset': 0,
    'duration': None,
    'label': "infer",
    'text': "-",
    'num_speakers': None,
    'rttm_filepath': f'./diarized/pred_rttms/{audio_stem}.rttm',
    'uniq_id': ""
}

# Always rewrite the manifest: reusing a file left over from an earlier run
# would silently keep pointing at whatever audio file was configured then.
with open('./manifest.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(diarize_manifest) + '\n')

# Download the reference inference config once and reuse it afterwards.
MODEL_CONFIG = os.path.join('./', 'diar_infer_meeting.yaml')
if not os.path.exists(MODEL_CONFIG):
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml"
    MODEL_CONFIG = wget.download(config_url, './')

config = OmegaConf.load(MODEL_CONFIG)
config.num_workers = 4
config.batch_size = 32

# Input manifest and output directory; the merge step reads the predicted
# RTTM from ./diarized/pred_rttms/.
config.diarizer.manifest_filepath = './manifest.json'
config.diarizer.out_dir = os.path.join('./', 'diarized')

# Speaker-embedding model and its three analysis scales.
config.diarizer.speaker_embeddings.model_path = 'titanet_large'
config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5, 1.0, 0.5]
config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75, 0.5, 0.25]
config.diarizer.speaker_embeddings.parameters.multiscale_weights = [0.33, 0.33, 0.33]
config.diarizer.speaker_embeddings.parameters.save_embeddings = False

config.diarizer.ignore_overlap = False
config.diarizer.collar = 0.25

# Use the pretrained VAD model rather than oracle (ground-truth) VAD.
# (This flag used to be assigned twice with the same value.)
config.diarizer.vad.model_path = 'vad_multilingual_marblenet'
config.diarizer.oracle_vad = False

model = ClusteringDiarizer(cfg=config)
model.diarize()

# Drop the diarizer and empty the CUDA cache once the RTTM has been written.
del model
torch.cuda.empty_cache()

[NeMo I 2023-09-05 05:19:29 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2023-09-05 05:19:29 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2023-09-05 05:19:30 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-09-05 05:19:30 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-09-05 05:19:30 features:289] PADDING: 16
[NeMo I 2023-09-05 05:19:30 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-05 05:19:30 clustering_diarizer:157] Loading pretrained titanet_large model from NGC
[NeMo I 2023-09-05 05:19:30 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /root/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2023-09-05 05:19:34 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-09-05 05:19:34 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2023-09-05 05:19:34 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2023-09-05 05:19:34 features:289] PADDING: 16
[NeMo I 2023-09-05 05:19:35 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2023-09-05 05:19:35 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-05 05:19:35 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 1/1 [00:10<00:00, 10.41s/it]


[NeMo I 2023-09-05 05:19:46 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2023-09-05 05:19:46 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-05 05:19:46 collections:302] Dataset loaded with 10 items, total duration of  0.13 hours.
[NeMo I 2023-09-05 05:19:46 collections:304] # 10 files loaded accounting to # 1 labels


vad: 100%|██████████| 10/10 [00:06<00:00,  1.43it/s]

[NeMo I 2023-09-05 05:19:53 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.22it/s]

[NeMo I 2023-09-05 05:19:53 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, ./diarized/speaker_outputs/subsegments_scale0.json
[NeMo I 2023-09-05 05:19:53 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-05 05:19:53 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-05 05:19:53 collections:302] Dataset loaded with 616 items, total duration of  0.26 hours.
[NeMo I 2023-09-05 05:19:53 collections:304] # 616 files loaded accounting to # 1 labels



[1/3] extract embeddings: 100%|██████████| 20/20 [00:01<00:00, 10.93it/s]

[NeMo I 2023-09-05 05:19:55 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, ./diarized/speaker_outputs/subsegments_scale1.json
[NeMo I 2023-09-05 05:19:55 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-05 05:19:55 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-05 05:19:55 collections:302] Dataset loaded with 925 items, total duration of  0.26 hours.
[NeMo I 2023-09-05 05:19:55 collections:304] # 925 files loaded accounting to # 1 labels



[2/3] extract embeddings: 100%|██████████| 29/29 [00:02<00:00, 13.93it/s]


[NeMo I 2023-09-05 05:19:57 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, ./diarized/speaker_outputs/subsegments_scale2.json
[NeMo I 2023-09-05 05:19:57 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-05 05:19:57 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-05 05:19:57 collections:302] Dataset loaded with 1850 items, total duration of  0.26 hours.
[NeMo I 2023-09-05 05:19:57 collections:304] # 1850 files loaded accounting to # 1 labels


[3/3] extract embeddings: 100%|██████████| 58/58 [00:02<00:00, 19.72it/s]
clustering: 100%|██████████| 1/1 [00:03<00:00,  3.08s/it]

[NeMo I 2023-09-05 05:20:04 clustering_diarizer:464] Outputs are saved in /kaggle/working/diarized directory
[NeMo I 2023-09-05 05:20:04 der:176] Cumulative Results for collar 0.25 sec and ignore_overlap False: 
     FA: 0.0000	 MISS 0.0000	                 Diarization ER: 0.0000	, Confusion ER:0.0000





## 合并

设定说话人的名字，按照说话的顺序，写入speakers

In [6]:
# Display names for the diarized speakers, indexed by speaker number
# (speaker_0 -> speakers[0], speaker_1 -> speakers[1], ...).
speakers = ["ハルクさん","鍋島さん"]

# Speaker turns as [start_ms, end_ms, speaker_index], read from the RTTM file
# written by the diarizer.
speaker_ts = []
with open('./diarized/pred_rttms/audio_16k.rttm', 'r') as f:
    for line in f:
        # Split on any run of whitespace and use the standard RTTM columns
        # (3 = onset, 4 = duration, 7 = speaker name). Splitting on single
        # spaces only worked for one exact padding of the columns.
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines
        s = int(float(fields[3]) * 1000)
        e = s + int(float(fields[4]) * 1000)
        # Speaker names look like "speaker_0"; keep the numeric suffix.
        speaker_ts.append([s, e, int(fields[7].split('_')[-1])])

def calculate_overlap_percentage(a_start, a_end, b_start, b_end):
    """Return how much of the shorter interval is covered by the overlap.

    Args:
        a_start, a_end: Bounds of the first interval.
        b_start, b_end: Bounds of the second interval (same unit).

    Returns:
        The overlap length divided by the length of the shorter interval, as
        a float (1.0 when the shorter interval lies fully inside the other).
        Returns 0.0 when the intervals do not overlap, merely touch, or when
        either interval has zero length.
    """
    overlap_duration = min(a_end, b_end) - max(a_start, b_start)
    shorter_duration = min(a_end - a_start, b_end - b_start)

    # A zero-length interval used to cause a ZeroDivisionError here; treat it
    # as "no overlap" so callers can fall back to their nearest-match logic.
    if overlap_duration <= 0 or shorter_duration <= 0:
        return 0.0

    # Dividing by the shorter duration always gives the larger of the two
    # possible ratios, which is what the original max(...) expression chose.
    return overlap_duration / shorter_duration

def find_max_overlap_or_closest(start, end, array, getter=lambda item: (item[0], item[1])):
    """Pick the entry of `array` that best matches the interval [start, end].

    Args:
        start, end: Bounds of the interval to match.
        array: Candidate items.
        getter: Maps an item to its (start, end) pair; by default the first
            two elements of the item are used.

    Returns:
        A tuple (best_ratio, item). `item` is the candidate with the highest
        overlap ratio; when no candidate overlaps at all (ratio 0.0) it is
        the candidate whose midpoint is nearest to the interval's midpoint.
        `item` is None when `array` is empty.
    """
    best_ratio = 0.0
    best_item = None
    nearest_item = None
    nearest_gap = float('inf')

    for candidate in array:
        cand_start, cand_end = getter(candidate)

        # Track the candidate with the strictly largest overlap ratio.
        ratio = calculate_overlap_percentage(cand_start, cand_end, start, end)
        if ratio > best_ratio:
            best_ratio, best_item = ratio, candidate

        # Independently track the candidate with the nearest midpoint.
        gap = abs((start + end) / 2 - (cand_start + cand_end) / 2)
        if gap < nearest_gap:
            nearest_gap, nearest_item = gap, candidate

    if best_ratio == 0.0:
        return best_ratio, nearest_item
    return best_ratio, best_item

# Build the speaker-attributed transcript: one "<speaker>: 「<text>」" line per
# aligned segment, attributed to the best-matching diarization turn.
if not speaker_ts:
    raise ValueError("No speaker turns were loaded from the RTTM file; cannot attribute the transcript.")

draft_lines = []
for wrd_dict in word_ts:
    ws, we, wrd = int(wrd_dict['start'] * 1000), int(wrd_dict['end'] * 1000), wrd_dict['text']
    max_overlap_percentage, element = find_max_overlap_or_closest(ws, we, speaker_ts)
    if max_overlap_percentage < 0.5:
        print("Warning: max_overlap_percentage:%.2fs, [%.2fs -> %.2fs]「%s」" % (max_overlap_percentage, ws/1000, we/1000, wrd))

    # The diarizer may detect more speakers than names were provided; fall
    # back to a generic label instead of failing with an IndexError.
    speaker_index = element[2]
    if 0 <= speaker_index < len(speakers):
        speaker_name = speakers[speaker_index]
    else:
        speaker_name = "speaker_%d" % speaker_index
    draft_lines.append("%s: 「%s」\n" % (speaker_name, wrd))

# Join once instead of growing the string inside the loop.
draft = ''.join(draft_lines)
print(draft)

ハルクさん: 「計算式プラグインはやっぱり便利ですか?」
鍋島さん: 「便利ですね。やっぱり元々のKintonの計算フィールドって結局、」
鍋島さん: 「文字列1行と計算フィールドにしか入らないんですけど。」
ハルクさん: 「はい、もうKintonさんある。」
ハルクさん: 「今日はですね、後ろまたなべちゃんを呼んで、」
ハルクさん: 「備品管理のKinton活用事例っていうのをご紹介していきたいなと思います。」
ハルクさん: 「備品管理だけじゃなくて、僕らが最近教えている計算式プラグインを使った事例となりますので、」
ハルクさん: 「こんなこともできるんだよっていうのを参考になればいいかなと思っております。」
ハルクさん: 「うちのなべちゃんはですね、クルーデータとか、計算式プラグインをひたすら使いまくってますから、」
ハルクさん: 「それ実際にお客さんの導入事例としてあるということだったので、」
ハルクさん: 「持ち込み企画として動画に撮っております。」
ハルクさん: 「ということで、じゃあなべちゃん、まずは実際に概要の部分だけ共有してもらった上で、」
ハルクさん: 「画面共有してデモを見せる順番でお願いします。」
鍋島さん: 「アプリの概要ですね。」
鍋島さん: 「わかりました。アプリ的には、社内の備品管理っていうところを目指しています。」
鍋島さん: 「外国人向けの日本語教育されている会社さんの事例で作ったやつで、」
鍋島さん: 「その日本語のテティスト周り、同じテティストも10冊ぐらい持っていたとして、」
鍋島さん: 「その10冊それぞれに番号を付けて管理しています。」
鍋島さん: 「今誰が借りてるっていうのがわかるようにしてほしいっていう話があったので、」
鍋島さん: 「それで作ったアプリっていうのを今回、事例として中身は少し入れ替えたり、」
鍋島さん: 「中のレコードとかは僕らがオリジナルで作った状態にはなるんですけど、」
鍋島さん: 「それをお見せできればと思います。」
ハルクさん: 「いわゆるそのボールペンとか鉛筆みたいな消耗品の管理じゃなくて、」
ハルクさん: 「消耗しないものでね、本とかマイメーター、その教科書みたいな感じで、」
ハルクさん: 「一つ一つ固有で管理をしていて、例えば教科書のAは誰が入れてるっていうのを」
ハルクさん: 「しかした

## Azure OpenAI ChatGPT 总结

设定你的Azure OpenAI信息  
参考：  
https://github.com/openai/openai-python#microsoft-azure-endpoints

In [7]:
# Azure OpenAI settings. The empty strings are placeholders to fill in before
# running the summarization cell.
import openai

openai.api_type = "azure"
# NOTE(review): presumably the Azure resource endpoint URL - see the README
# linked above.
openai.api_base = ""
openai.api_version = "2023-03-15-preview"
openai.api_key = ""
# Passed as `engine=` to openai.ChatCompletion.create() below.
# NOTE(review): presumably the Azure deployment name - confirm.
engine = ""
# Used below to select the tiktoken encoding for token counting.
model = "gpt-3.5-turbo-16k"
# Total token budget; the prompt's token count is subtracted from it below to
# obtain the room left for the response.
max_tokens = 16 * 1024

In [14]:
import tiktoken

# System prompt: asks for a JSON object with 'title' and 'summary' fields,
# which the upload step parses with json.loads().
prompt = "I want you to act as a conference summarization assistant and a multilingual translator. I will provide you with transcribed texts from the conference in different languages. You need to correct homophones and condense the content into summaries, emphasizing the key points of each speaker. I request that you provide the output in JSON format, including the following fields: The 'title' field will store the summarized title, which should not exceed 50 words. The 'summary' field will contain the summary you generate. Here is the conference content:"

# Estimate the prompt size: message contents plus a fixed per-message and
# per-reply overhead.
encoding = tiktoken.encoding_for_model(model)
system_tokens = len(encoding.encode(prompt))
tokens_every_message = 3
tokens_res_assistant = 3
pre_tokens = len(encoding.encode("system")) + len(encoding.encode("user")) + tokens_res_assistant + 2 * tokens_every_message
total_tokens = len(encoding.encode(draft)) + pre_tokens + system_tokens
print(f"This request consumed {total_tokens} tokens.")

# Whatever is left of the budget can be spent on the response.
max_res_tokens = max_tokens - total_tokens - 1

if max_res_tokens <= 0:
    # Raise instead of calling exit(): an exception always aborts the cell,
    # so the request below is never sent with an invalid token budget.
    # NOTE(review): in a notebook kernel exit() asks the kernel to shut down
    # and may not stop the running cell immediately - confirm.
    raise RuntimeError("This request has exceeded the maximum processing limit of GPT. Please change the model.")
if max_res_tokens < 400:
    print("Warning: The response for this request is fewer than 400 tokens. Please take note.")

completion = openai.ChatCompletion.create(
    engine=engine,
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": draft},
    ],
    temperature=0.7,
    max_tokens=max_res_tokens,
    top_p=0.95,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None)
result = completion.choices[0].message.content
print(result)

This request consumed 5238 tokens.
{
  "title": "備品管理のKinton活用事例",
  "summary": "ハルクさんは備品管理のKinton活用事例を紹介しました。備品管理アプリは消耗品ではなく、固有の備品の管理を目的としており、借りた人の情報や貸出可能な備品のリストが表示されます。さらに、計算式プラグインを使って自動化された機能も紹介されました。鍋島さんはアプリの概要や計算式プラグインの使い方を説明しました。計算式プラグインはテーブルの最後の行の情報を他のフィールドに自動的に反映させることができます。この事例は計算式プラグインの便利さを示し、導入率の高さも紹介されました。"
}


## 将总结内容上传

设定kintone app信息  
kintone_domain 设定为你的kintone域名，如 xxxx.cybozu.cn 或者 xxxx.cybozu.com  
app_id 为应用的数字ID  
api_token 为该应用中具有记录添加权限的API令牌

In [15]:
# kintone connection settings; the empty strings are placeholders to fill in.
# Host name only: the upload cell builds "https://{kintone_domain}/k/v1/record.json".
kintone_domain = ""
# Sent as the "app" field of the record-creation request.
app_id = 1
# Sent in the "X-Cybozu-API-Token" request header.
api_token = ""

In [16]:
import json
import requests

# Parse the model's JSON answer; the prompt requests the fields 'title' and
# 'summary'.
info = json.loads(result)

url = f"https://{kintone_domain}/k/v1/record.json"
headers = {
    "Content-Type": "application/json",
    "X-Cybozu-API-Token": api_token,
}
# One new record whose 'title' and 'summary' fields carry the summary.
data = {
    "app": app_id,
    "record": {
        'title': {
            'value': info['title']
        },
        'summary': {
            'value': info['summary']
        }
    }
}

# A timeout keeps the cell from hanging forever on an unreachable host.
resp = requests.post(url, json=data, headers=headers, timeout=30)
print(resp.text)
# Print the body first (it carries the server's answer), then fail loudly on
# an HTTP error instead of silently ignoring it.
resp.raise_for_status()

{"id":"2","revision":"1"}
