-
Notifications
You must be signed in to change notification settings - Fork 0
/
smi2srt.py
260 lines (206 loc) · 9.63 KB
/
smi2srt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env python3
__author__ = 'Myeongseon Choi key262yek@gmail.com'
__date__ = "2017/12/11"
__version__ = '1.1.0'
__version_info__ = (1,1,0) #Provide removing files including. previous error.
'''================================= Import Module ==============================='''
import os, logging,platform
import stat #Change permission.
import win32security #unresolved , you can get it from pypiwin32
import ntsecuritycon as con #unresolved
import re #Regex
import chardet #Unresolved import
'''================================ Parameter declaration ========================='''
default_language = 'ko-KR'
'''================================= Class ==============================='''
class smiItem() :
    """One SAMI (.smi) subtitle file and the srt subtitles converted from it.

    Typical use is: construct, remove_meaningless_tags(), convert_subtitles(),
    then write_srt().
    """

    def __init__(self,path,head,body) :
        """Store the pieces of the smi file and prepare one subtitle list per declared CC class.

        path : path of the smi file. It is needed to build the srt file names.
        head : text found between <HEAD> and </HEAD>.
        body : text found between <BODY> and </BODY>.
        """
        self.path = path
        self.head = head
        self.body = body
        language_info = self.find_language()
        # structure : CC name : [srt_subtitles, language name]
        # ex) 'ENCC' : [[...], 'en-US']
        self.converted = {name : [[],lang] for name,lang in language_info}
        logging.info('Declare smi files is done. Directory : {}'.format(self.path))

    def find_language(self) :
        """Return a list of (CC class name, language) pairs declared in the head.

        ex) .ENCC {Name: 'English Captions'; lang: en-US; SAMIType : CC;}
            gives ('ENCC', 'en-US').
        A class without a usable 'lang' entry gets default_language and a warning is logged.
        """
        #The class name may not contain whitespace or braces, so one match can never
        #run across two neighbouring style rules.
        language_setting = re.findall(r'\.([^\s{}]+)\s*\{([^}]*)\}',self.head)
        language_info = []
        for name,setting in language_setting :
            #Remove space.
            nospace_setting = re.sub(r'\s+','',setting)
            #Parse 'key:value' entries separated by ';'. Entries without ':' (such as the
            #empty one after a trailing ';') are ignored, so the last ';' is optional.
            dict_setting = {}
            for entry in nospace_setting.split(';') :
                category,separator,value = entry.partition(':')
                if separator :
                    dict_setting[category.lower()] = value
            lang = dict_setting.get('lang')
            if not lang :
                lang = default_language #In my case, 'ko-KR' is default.
                logging.warning('the language entity is empty in the head of the file {}'.format(self.path))
            language_info.append((name,lang))
        logging.debug('find_language finished')
        return language_info

    def remove_meaningless_tags(self) :
        """Strip from self.body the tags which are not supported by the srt subtitle format."""
        meaningless = ['b','i','u','basefont','bdo','big','blockquote','caption','center','col','colgroup','dd','dl','dt','div','h1','h2','h3','h4','h5','h6','hr','img','li','ol','p','pre','q','s','small','span','strike','sub','sup','table','tbody','td','th','tr','tt','tfoot','thead','ul']
        #Only bare tags such as <b> or </b> are removed here. Tags carrying attributes
        #(for example <P Class=KRCC>) are kept because convert_subtitles needs them.
        for key in meaningless :
            key_hat = r'</?'+re.escape(key)+r'>'
            self.body = re.sub(key_hat,'',self.body,flags=re.IGNORECASE)
        #Remove <P Class = ..CC ID = ... > tag together with the text that follows it.
        self.body = re.sub(r'<[ ]*[Pp] [Cc]lass[ =]*[A-Za-z]*[Cc]{2}[ ]*[IDid]{2}[^>]*>[^<]*','',self.body)
        #Remove <font color .. > and </font> tags
        self.body = re.sub(r'<[ /]*font[^>]*>','',self.body,flags=re.IGNORECASE)
        #Remove <! ... --> tag. those are comment format of sami.
        self.body = re.sub(r'<![-]{,2}(.*?)[-]{1,2}>','',self.body)
        logging.debug('remove_meaningless_tags finished')

    def convert_subtitles(self) :
        """Fill self.converted with srt subtitles built from the <sync> tags of self.body.

        Matching algorithm.
            <sync start = (start time)>(end_point)
            ...
            (start_point)<sync start = (end time)>
        The text between end_point of the previous tag and start_point of the current
        tag is shown from the previous start time until the current start time.
        The last subtitle, which is not followed by another <sync> tag, is ignored.
        """
        #Several P Class tags may follow a single sync tag. (((?!<[ ]*P).)*) stops the
        #message at the next one.
        langtag_pattern = re.compile(r'<[ ]*[Pp][ ]*[Cc]lass[ =]*([A-Za-z]*CC)[ ]*>(((?!<[ ]*P).)*)')
        end_point = 0
        #None means that no opening sync tag has been seen yet. A number cannot be used
        #as the marker because 0 is a valid start time.
        smi_starttime = None
        #\d+ : a sync tag without any digit carries no time and is not treated as a tag.
        for synctag in re.finditer(r'<sync\s*start[ =]*(\d+)>',self.body,re.IGNORECASE) :
            start_point = synctag.start()
            smitime = int(synctag.group(1))
            #The first tag, or a tag whose time goes backwards (start of another group
            #of subtitles), only opens a subtitle. Move to the next tag.
            if smi_starttime is None or smitime < smi_starttime :
                end_point = synctag.end()
                smi_starttime = smitime
                continue
            srt_starttime = srt_time(smi_starttime)
            srt_endtime = srt_time(smitime)
            for langtag in langtag_pattern.finditer(self.body[end_point:start_point]) :
                name = langtag.group(1)
                msg = langtag.group(2)
                #smi files clear the screen with a message made only of '&nbsp;'.
                #Such blank messages are not subtitles.
                if not msg.replace('&nbsp;','').replace('\u00a0','').strip() :
                    continue
                if name not in self.converted :
                    #Sometimes, there are some tricky files which do not declare CC names in the head.
                    #It is more probable that such subtitles are in the default language.
                    self.converted[name] = [[],default_language]
                    logging.warning('the unknowncc appear in the file {}'.format(self.path))
                subtitles,lang = self.converted[name]
                subtitles.append(srt_format(srt_starttime,srt_endtime,msg))
            #The current tag opens the next subtitle.
            smi_starttime = smitime
            end_point = synctag.end()
        logging.debug('convert_subtitles is finished')

    def write_srt(self) :
        """Write one utf-8 srt file per language which has at least one subtitle.

        The file name is the smi path without 'smi', followed by the first two
        characters of the language name and '.srt'. ex) movie.smi -> movie.ko.srt
        """
        filename = self.path[:-3] #remove 'smi' from the original smi file.
        for subtitles,lang in self.converted.values() :
            if not subtitles : #There exist the smi files which do not include subtitles of language declared in the head.
                continue
            #'with' closes the file even when a write fails.
            with open(filename+lang[:2]+'.srt','wb') as srt_output :
                for idx,subtitle in enumerate(subtitles) :
                    subtitle_with_number = '{}\r\n'.format(idx+1) + subtitle
                    srt_output.write(subtitle_with_number.encode('utf-8'))
        logging.info('Converted finished. file directory : {}'.format(self.path))
''' ====================================== Functions ============================================'''
#Read contents from smi files.
def read_smi(smi_path) :
    """Read the smi file at smi_path and return it as a smiItem.

    Returns None, after logging a warning, when the encoding cannot be detected,
    when the contents cannot be decoded with the detected encoding, or when the
    <HEAD> or <BODY> section is missing.
    """
    #Read smi file. 'with' closes the file even when reading fails.
    with open(smi_path,'rb') as smi :
        contents = smi.read()
    #Check the encoding of contents
    encoding = chardet.detect(contents)['encoding']
    if encoding is None :
        logging.warning('Cannot find a proper encoding of file {}'.format(smi_path))
        return None
    #Re-encode contents into unicode. The detection is only a guess, so a wrong or
    #unknown encoding skips this file instead of stopping the whole conversion.
    try :
        contents = contents.decode(encoding)
    except (UnicodeDecodeError,LookupError) :
        logging.warning('Cannot decode the file {} with the detected encoding {}'.format(smi_path,encoding))
        return None
    #Remove newline character. It is need for regex multiline search
    contents = re.sub(r'[\n\r\t]+','',contents)
    #cut head and body by <HEAD> and <BODY> tags.
    head_contents = re.search('<HEAD>(.*)</HEAD>',contents,re.IGNORECASE)
    body_contents = re.search('<BODY>(.*)</BODY>',contents,re.IGNORECASE)
    if not (head_contents and body_contents):
        logging.warning('Head or Body is missing in the file {}'.format(smi_path))
        return None
    #Declare and return smi class
    logging.debug('read_smi finished for the file : {}'.format(smi_path))
    return smiItem(smi_path,head_contents.group(1),body_contents.group(1))
def srt_time(smitime) :
    """Convert a smi time (integer, milliseconds only) to the srt time format.

    smi time = 0000000 (only ms)
    srt time = hours:minutes:seconds,ms
    ex) 3661005 -> '01:01:01,005'
    """
    time,ms = divmod(smitime,1000)
    time,sec = divmod(time,60)
    hour,minute = divmod(time,60)
    logging.debug('srt_time finished')
    #srt needs fixed width fields. Without zero padding 5 ms would be written as ',5'.
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(hour,minute,sec,ms)
def srt_format(srt_starttime,srt_endtime,msg) :
    """Make one srt subtitle from the given times and msg.

    The subtitle number is not included. It is added while writing.
    """
    #smi files write the line break as <br>, <BR> or <br/>. All of them become a newline.
    msg = re.sub(r'<br\s*/?>','\r\n',msg,flags=re.IGNORECASE)
    logging.debug('srt_format finished')
    return '{} --> {}\r\n{}\r\n\r\n'.format(srt_starttime,srt_endtime,msg)
def remove_empty_srt(srt_path) :
    """Delete the srt file at srt_path when it is empty. Non-empty files are left alone.

    Error correcting. Because of a failure of a previous version of the algorithm,
    a lot of empty srt files were made in various directories.
    """
    #One byte is enough to know whether the file is empty. The file is closed before
    #removing it, because an open file cannot be removed on Windows.
    with open(srt_path,'rb') as srt_file :
        is_empty = not srt_file.read(1)
    if not is_empty :
        return
    #Strings are compared with ==. 'is' tests object identity and is not reliable here.
    if platform.system() == 'Windows' :
        #Give full access to 'Everyone' so that the file can be removed.
        userx, domain, account_type = win32security.LookupAccountName("","Everyone")
        sd = win32security.GetFileSecurity(srt_path,win32security.DACL_SECURITY_INFORMATION)
        dacl = sd.GetSecurityDescriptorDacl()
        dacl.AddAccessAllowedAce(win32security.ACL_REVISION , con.FILE_ALL_ACCESS, userx)
        sd.SetSecurityDescriptorDacl(1,dacl,0)
        win32security.SetFileSecurity(srt_path,win32security.DACL_SECURITY_INFORMATION,sd)
    else :
        os.chmod(srt_path, stat.S_IWOTH)
    os.remove(srt_path)
    logging.info('The empty srt file is found in directory {}'.format(srt_path))
    return
''' ================================= Main ==================================='''
#Walk through the current working directory and its subdirectories. Empty srt
#files are removed on the way and every smi file is collected for conversion.
cwd = os.getcwd()
logging.basicConfig(filename = 'smi2srt.log',level = logging.WARNING)
logging.debug('program start')
smifiles = []
for current_directory,subdirectories,files in os.walk(cwd) :
    for name in files :
        if name.endswith('.srt') :
            #An srt file left empty by a previous error is removed.
            remove_empty_srt(os.path.join(current_directory,name))
            continue
        if name.endswith(".smi") :
            #Every smi file is converted, even when an srt file already exists,
            #because the existing file could come from a previous error.
            smifiles.append(os.path.join(current_directory,name))
logging.debug('file directory reading is end')
#Convert the collected files one by one. Files which cannot be read are skipped.
for smi_path in smifiles :
    smi_contents = read_smi(smi_path)
    if smi_contents :
        smi_contents.remove_meaningless_tags()
        smi_contents.convert_subtitles()
        smi_contents.write_srt()
logging.debug('whole program finished well.')