In [None]:
# `s3` here is a module exposing `extract` (it appears to be project-local), not a
# boto3 client. A later cell rebinds the name `s3` to boto3.client('s3'), so this
# import must be re-run before `s3.extract` can be used again.
import s3
# NOTE(review): presumably fetches the 'SON' lecture texts stored under
# 'dev/Top_Lecture/'; the meaning of 'ALL' is not visible here — confirm against s3.extract.
lecture_bottom = s3.extract('dev/Top_Lecture/', 'SON', 'ALL')

In [None]:
# Quick sanity check of DataProcessor.text_preprocess on a short Korean sentence.
# The import lives in this cell because the only other import of DataProcessor
# is in a later cell, so on a fresh kernel this cell would raise NameError.
from data import DataProcessor

tt = '의,가,이,은,들,는 안녕하세요 김민수입니다.'
dp = DataProcessor()
# Last expression of the cell: the pre-processed tokens joined with single spaces.
' '.join(dp.text_preprocess(tt))

### Input Video -> S3 -> Video -> Audio -> Text -> Preprocess

In [1]:
import base64
import io
import os
import tempfile
from datetime import datetime

import boto3
import dash
import speech_recognition as sr
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from moviepy.editor import VideoFileClip

import settings
from data import DataProcessor

# Text pre-processing helper; the upload callback calls its text_preprocess().
data_processor = DataProcessor()

# AWS clients. The S3 section of the settings is read once and the same key pair
# is handed to both clients.
# NOTE: binding the name `s3` to a boto3 client hides the `s3` module that other
# cells import under the same name.
_s3_settings = settings.DB_SETTINGS['_s3']
_aws_credentials = {
    'aws_access_key_id': _s3_settings['ACCESS_KEY_ID'],
    'aws_secret_access_key': _s3_settings['ACCESS_SECRET_KEY'],
}
s3 = boto3.client('s3', **_aws_credentials)
transcribe = boto3.client('transcribe', **_aws_credentials)
bucket_name = _s3_settings['BUCKET_NAME']

# Dash application that hosts the upload page.
app = dash.Dash(__name__)

# Define layout: a single drag-and-drop upload box followed by a status area
# that the upload callback fills in.
_upload_box_style = {
    'width': '100%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px'
}
_upload_prompt = html.Div(['Drag and Drop or ', html.A('Select a Video')])

app.layout = html.Div(
    children=[
        dcc.Upload(
            id='upload-video',
            children=_upload_prompt,
            style=_upload_box_style,
            multiple=False,  # one video per upload
        ),
        html.Div(id='output-upload'),
    ]
)

# Define callback
@app.callback(Output('output-upload', 'children'),
              Input('upload-video', 'contents'))
def upload_video(contents):
    """Store an uploaded video in S3, transcribe its audio and upload the transcript.

    Pipeline: decode the upload -> put the video in S3 -> download it again ->
    extract the audio track -> Google speech recognition (ko-KR) ->
    ``data_processor.text_preprocess`` -> put the transcript in S3.

    Args:
        contents: Value of the ``contents`` property of the ``upload-video``
            component: a ``"<header>,<base64 payload>"`` string, or ``None``
            when nothing has been uploaded.

    Returns:
        ``None`` when ``contents`` is ``None``; otherwise an ``html.Div`` with
        either the success message or a description of why no transcript
        could be produced.
    """
    if contents is None:
        return None

    # Only the payload after the first comma is needed; the header is discarded.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)

    # One timestamp names both objects: <YYYYMMDD>_<HHMMSS>_...
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    video_key = f'user/video/{timestamp}_user_video.mp4'
    transcript_key = f'user/transcript/{timestamp}_transcript.txt'
    s3.upload_fileobj(io.BytesIO(decoded), bucket_name, video_key)

    # Work in a private scratch directory so concurrent uploads cannot overwrite
    # each other's files and nothing is left behind in the working directory.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, 'extracted_video.mp4')
        audio_path = os.path.join(workdir, 'extracted_audio.wav')

        # Download the video from S3
        s3.download_file(bucket_name, video_key, video_path)

        # Extract audio from video; always close the clip so its reader is released.
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                return html.Div([
                    'Video uploaded to S3, but it has no audio track to transcribe'
                ])
            video.audio.write_audiofile(audio_path)
        finally:
            video.close()

        # Transcribe audio file into text
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio, language="ko-KR")
        except sr.UnknownValueError:
            return html.Div([
                'Video uploaded to S3, but no speech could be recognized in it'
            ])
        except sr.RequestError as exc:
            return html.Div([
                f'Video uploaded to S3, but the speech recognition request failed: {exc}'
            ])

    # preprocess text; the result is joined with single spaces below
    tokens = data_processor.text_preprocess(text)
    transcript_text = ' '.join(tokens)
    print(transcript_text)  # show the transcript that is about to be stored

    # Upload the transcript to S3 as a UTF-8 .txt object
    s3.upload_fileobj(io.BytesIO(transcript_text.encode()), bucket_name, transcript_key)

    return html.Div([
        'Video and transcript successfully uploaded to S3'
    ])

# Run app
# Start the Dash server with debug=True, but only when this code runs as the
# main module (not when it is imported).
# NOTE(review): newer Dash releases expose `app.run(...)` as the preferred entry
# point — confirm against the installed Dash version.
if __name__ == '__main__':
    app.run_server(debug=True)

### S3 -> Dash

In [1]:
from data import DataProcessor
from figure import FigureGenerator


# Initialize DataProcessor and FigureGenerator
data_processor = DataProcessor()
# The arguments form three (S3 key prefix, name filter) pairs; the recorded output
# of this cell lists one matching .txt key per pair.
# NOTE(review): presumably the pairs are the top lecture, the lecture it is compared
# against, and the user's transcript — confirm against FigureGenerator's signature.
# NOTE(review): the last filter is a timestamp with no underscore
# ('20231219173149'); check that it matches the naming of the transcript to load.
figure_generator = FigureGenerator('dev/Top_Lecture/', 'SON', 
                                   'dev/Other_Lecture/', 'BYUN',
                                   'user/transcript/', '20231219173149')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hslio\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<<<<< Txt List >>>>>
['dev/Top_Lecture/중학뉴런수학2학년(상)_SON_GPT_41672.txt']
<<<<< Txt List >>>>>
['dev/Other_Lecture/중학뉴런수학2학년(상)_BYUN_GPT_62031.txt']
<<<<< Txt List >>>>>
['user/transcript/20231219173149_transcript.txt']


In [3]:
import s3

# FIXME(review): the right-hand side of this assignment was lost (`ss = ` alone
# is a SyntaxError). The key below is pinned from this cell's recorded output so
# the cell runs again; replace it with the original lookup once it is recovered.
ss = 'user/transcript/20231219_173850_transcript.txt'
print(ss)

user/transcript/20231219_173850_transcript.txt
