In [1]:
import os
import numpy as np
import pandas as pd
# read,concatenate, split audio
import pydub
from pydub import AudioSegment
from scipy.io import wavfile
# shuffle data in dataframe
from sklearn.utils import shuffle
# transform audio to feature vector
import librosa
# report result
from sklearn.metrics import classification_report,confusion_matrix
# svm
from sklearn import svm
# read, save model
import joblib
# convert video to audio
import moviepy.editor as mp
import subprocess

# load model

In [2]:
class praise_detect():
    '''Wrapper around a serialized classifier loaded with joblib.

    Attributes set by the constructor:
        model_file -- the path that was passed in
        model      -- the object returned by ``joblib.load(model_file)``

    NOTE(review): joblib files are pickle-based; only load files you trust.
    '''

    def __init__(self, model_file):
        # remember where the estimator came from, then deserialize it
        self.model_file = model_file
        self.model = joblib.load(model_file)

    def predict_praise(self, x_feature):
        '''Run ``self.model.predict`` on ``x_feature`` (a 2d-array).

        The prediction is also cached on ``self.result`` before being returned.
        '''
        predictions = self.model.predict(x_feature)
        self.result = predictions
        return predictions

In [3]:
# Load the trained classifier once; `model` is the global that predict_praise() calls.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
model = praise_detect('/home/anhnvh/Desktop/Kidtopi/emotion_classification/model/praise_svm_ver2.pkl')

# load and preprocess video

In [4]:
# Input / working directories. The trailing '/' matters: file names are appended
# to these with plain string concatenation by the functions in this notebook.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
video_path = '/home/anhnvh/Desktop/Kidtopi/emotion_classification/input/video/'
audio_path = '/home/anhnvh/Desktop/Kidtopi/emotion_classification/input/audio/'
audio_part_path = '/home/anhnvh/Desktop/Kidtopi/emotion_classification/input/audio_part/'
# Keep only '.mp4' entries: the audio-name derivation used later searches for
# '.mp4' and would silently chop the last character of any other file name.
# Sort the names because os.listdir() returns them in arbitrary order and the
# window ids are assigned by position, so the order must be reproducible.
video_files = sorted(name for name in os.listdir(video_path) if name.endswith('.mp4'))

In [5]:
# extract the audio track of a video into a .wav file
def extract_audio_from_video(video_name):
    '''Extract the full audio track of one video to a .wav file using ffmpeg.

    Reads ``video_path + video_name`` and writes ``audio_path + <stem> + '.wav'``,
    where ``<stem>`` is everything before the first '.mp4' in ``video_name``.

    Args:
        video_name: file name (not a full path) of a video inside ``video_path``.

    Returns:
        None. ffmpeg's exit status is not inspected, matching the previous
        best-effort behaviour.
    '''
    # find the index of '.mp4' in the video name
    # (part_time_video() derives the name the same way and must find this file)
    idx_extension = video_name.find('.mp4')
    # audio name, it's saved in the audio folder
    audio_name = video_name[:idx_extension] + '.wav'
    # Alternative kept for reference (unused): moviepy's VideoFileClip(...).audio.write_audiofile(...)
    # https://zulko.github.io/moviepy/ref/VideoClip/VideoClip.html

    # Pass an argument list and no shell, so that file names containing spaces
    # or shell metacharacters reach ffmpeg unchanged instead of being re-parsed.
    command = ['ffmpeg', '-i', video_path + video_name, '-async', '1', audio_path + audio_name]

    subprocess.call(command)
    
# helpers converting between milliseconds and hours / minutes / seconds
def convertMillis_to_Time(millis):
    '''Format a millisecond count as 'H:M:S' (no zero padding).

    Hours wrap at 24, minutes and seconds at 60; fractions are truncated.
    '''
    # (milliseconds per unit, wrap-around) for hours, minutes, seconds
    unit_table = ((1000*60*60, 24), (1000*60, 60), (1000, 60))
    hh, mm, ss = (int((millis/per_unit) % wrap) for per_unit, wrap in unit_table)
    return '{0}:{1}:{2}'.format(hh, mm, ss)
def convertTime_to_Millis(hours, minutes, seconds):
    '''Return the total number of milliseconds in hours + minutes + seconds.'''
    hours_ms = hours*60*60*1000
    minutes_ms = minutes*60*1000
    seconds_ms = seconds*1000
    return hours_ms + minutes_ms + seconds_ms

# detect the 2-second time parts in a video
def part_time_video(video_name):
    '''Extract a video's audio and list its 2-second windows.

    The audio is first written to ``audio_path`` by extract_audio_from_video().
    Windows start at every whole second and end two seconds later, so
    consecutive windows overlap by one second; only the whole-second part of
    the audio duration is used.

    Returns a DataFrame with the columns
    video_name, audio_name, startTime, endTime (both in ms), start_H_M_S, end_H_M_S.
    '''
    # write <stem>.wav next to the other extracted audio files
    extract_audio_from_video(video_name)
    # same naming rule as extract_audio_from_video(): text before the first '.mp4'
    wav_name = video_name[:video_name.find('.mp4')] + '.wav'
    # duration in whole seconds = number of samples / sample rate, truncated
    rate, samples = wavfile.read(audio_path + wav_name)
    whole_seconds = int(len(samples)/ rate)
    # window boundaries in milliseconds
    starts_ms = [sec*1000 for sec in np.arange(0, whole_seconds - 1, 1)]
    ends_ms = [sec*1000 for sec in np.arange(2, whole_seconds + 1, 1)]

    # assemble the table column by column, then fix the column order
    windows = pd.DataFrame().assign(
        startTime=starts_ms,
        endTime=ends_ms,
        start_H_M_S=[convertMillis_to_Time(ms) for ms in starts_ms],
        end_H_M_S=[convertMillis_to_Time(ms) for ms in ends_ms],
        video_name=video_name,
        audio_name=wav_name,
    )
    ordered_columns = ['video_name', 'audio_name', 'startTime', 'endTime', 'start_H_M_S', 'end_H_M_S']
    return windows[ordered_columns]

In [6]:
# Build one table holding the 2-second windows of every video.
# The per-video frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on each call.
video_parts = []
for video in video_files:
    video_parts.append(part_time_video(video))

window_columns = ['video_name', 'audio_name', 'startTime', 'endTime', 'start_H_M_S', 'end_H_M_S']
if video_parts:
    # ignore_index gives a fresh 0..n-1 index across all videos
    df_video = pd.concat(video_parts, ignore_index=True)
else:
    # no input videos: keep an empty table with the expected columns
    df_video = pd.DataFrame(columns=window_columns)

# id_x is the global row position with an 'x' suffix ('0x', '1x', ...); it is
# the first column and is used to name the temporary audio-part files.
df_video.insert(0, 'id_x', ['{0}x'.format(position) for position in range(len(df_video))])

In [7]:
# predict praise of audio parts in audio
def predict_praise(id_x):
    '''Classify the audio window identified by ``id_x``.

    The window's audio file and start/end times (ms) are looked up in the global
    ``df_video``. The slice is written to ``audio_part_path + id_x + '.wav'``,
    loaded with librosa, turned into a flattened MFCC vector and passed to the
    global ``model``. The temporary file is always removed afterwards.

    Args:
        id_x: string id of the window, as stored in ``df_video.id_x``.

    Returns:
        The last element of ``model.predict_praise(...)`` for this window.

    Raises:
        IndexError: if no row of ``df_video`` has this ``id_x``.
    '''
    print(id_x)
    # every window is meant to be 2 seconds long
    target_ms = 2000
    # find index of id_x (last matching row)
    idx = df_video[df_video.id_x == id_x].index[-1]
    # take audio name, startTime, endTime
    audio_name = df_video.loc[idx, 'audio_name']
    startTime = int(df_video.loc[idx, 'startTime'])
    endTime = int(df_video.loc[idx, 'endTime'])
    part_file = audio_part_path + id_x + '.wav'

    # split the audio part out of the full audio (pydub slices in milliseconds)
    song = pydub.AudioSegment.from_wav(audio_path + audio_name)
    extract = song[startTime:endTime]
    # A non-empty part shorter than 2 s is padded with trailing silence so the
    # MFCC matrix keeps the same number of frames (presumably the model expects
    # a fixed-length vector -- TODO confirm the padding used at training time).
    # The previous padding helper could not run: it was called with arguments
    # it did not accept and used an undefined variable.
    if 0 < len(extract) < target_ms:
        padding = pydub.AudioSegment.silent(duration=target_ms - len(extract),
                                            frame_rate=extract.frame_rate)
        extract = extract + padding

    try:
        # export() returns the open output file; close it so the data is
        # flushed before librosa reads the file back
        extract.export(part_file, format='wav').close()
        y1, sr1 = librosa.load(part_file)
        # result is 2d-array; keyword arguments are required by recent librosa
        b = librosa.feature.mfcc(y=y1, sr=sr1)
        # transform 2d-array (m x n) to 2d-array (1 x (mn))
        x_feature = b.reshape(1, -1)
        target = model.predict_praise(x_feature)[-1]
    finally:
        # remove audio part, even when loading or prediction failed
        if os.path.exists(part_file):
            os.remove(part_file)
    return target

In [8]:
import time

In [9]:
# Smoke test: time one prediction.
# NOTE(review): the id '995x' is hard-coded and must exist in df_video.
t1 = time.time()
predict_praise('995x')
t2 = time.time()
# elapsed wall-clock seconds; last expression, so the notebook displays it
t2 - t1

995x


0.7246670722961426

In [10]:
# Classify every window and store the result in a new 'target' column
# (predict_praise prints each id, which produces the long output below).
t1 = time.time()
df_video['target'] = df_video.id_x.apply(predict_praise)
t2 = time.time()
# elapsed wall-clock seconds for the full run
t2 - t1

0x
1x
2x
3x
4x
5x
6x
7x
8x
9x
10x
11x
12x
13x
14x
15x
16x
17x
18x
19x
20x
21x
22x
23x
24x
25x
26x
27x
28x
29x
30x
31x
32x
33x
34x
35x
36x
37x
38x
39x
40x
41x
42x
43x
44x
45x
46x
47x
48x
49x
50x
51x
52x
53x
54x
55x
56x
57x
58x
59x
60x
61x
62x
63x
64x
65x
66x
67x
68x
69x
70x
71x
72x
73x
74x
75x
76x
77x
78x
79x
80x
81x
82x
83x
84x
85x
86x
87x
88x
89x
90x
91x
92x
93x
94x
95x
96x
97x
98x
99x
100x
101x
102x
103x
104x
105x
106x
107x
108x
109x
110x
111x
112x
113x
114x
115x
116x
117x
118x
119x
120x
121x
122x
123x
124x
125x
126x
127x
128x
129x
130x
131x
132x
133x
134x
135x
136x
137x
138x
139x
140x
141x
142x
143x
144x
145x
146x
147x
148x
149x
150x
151x
152x
153x
154x
155x
156x
157x
158x
159x
160x
161x
162x
163x
164x
165x
166x
167x
168x
169x
170x
171x
172x
173x
174x
175x
176x
177x
178x
179x
180x
181x
182x
183x
184x
185x
186x
187x
188x
189x
190x
191x
192x
193x
194x
195x
196x
197x
198x
199x
200x
201x
202x
203x
204x
205x
206x
207x
208x
209x
210x
211x
212x
213x
214x
215x
216x
217x
218x
219x
220x
221x


1551x
1552x
1553x
1554x
1555x
1556x
1557x
1558x
1559x
1560x
1561x
1562x
1563x
1564x
1565x
1566x
1567x
1568x
1569x
1570x
1571x
1572x
1573x
1574x
1575x
1576x
1577x
1578x
1579x
1580x
1581x
1582x
1583x
1584x
1585x
1586x
1587x
1588x
1589x
1590x
1591x
1592x
1593x
1594x
1595x
1596x
1597x
1598x
1599x
1600x
1601x
1602x
1603x
1604x
1605x
1606x
1607x
1608x
1609x
1610x
1611x
1612x
1613x
1614x
1615x
1616x
1617x
1618x
1619x
1620x
1621x
1622x
1623x
1624x
1625x
1626x
1627x
1628x


494.29007172584534

In [14]:
def take_text(target):
    '''Map a predicted class to its label text.

    0 -> '0', 1 -> 'very good', 2 -> 'good'; any other value gives None.
    '''
    label_by_class = ((0, '0'), (1, 'very good'), (2, 'good'))
    for class_id, label in label_by_class:
        if target == class_id:
            return label
    return None

In [16]:
# Add a 'text' column holding the label returned by take_text() for each 'target'.
df_video['text'] = df_video.target.apply(take_text)

In [13]:
# Shape of the subset of rows whose predicted class is not 0.
# NOTE(review): the execution counts suggest this cell ran before the 'text'
# column was added; a top-to-bottom rerun would presumably show one more column.
df_video[df_video.target != 0].shape

(68, 8)

In [18]:
# Write the annotated windows to an Excel workbook.
# The context manager saves and closes the file on exit; ExcelWriter.save()
# was removed in pandas 2.0. sheet_name is passed by keyword because the
# positional form is deprecated.
# NOTE(review): absolute, machine-specific output path.
with pd.ExcelWriter('/home/anhnvh/Desktop/Kidtopi/emotion_classification/output/result_ver2.xlsx', engine = 'xlsxwriter') as writer:
    df_video.to_excel(writer, sheet_name='Sheet1', index=False)