### Unpack all zip, rar and 7z files

In [None]:
import os 
import re
from os import path
import patoolib               #for unpacking
from pyunpack import Archive  # for unpacking
from shutil import copyfile
import pysrt
from mafan import simplify
from guess_language import guess_language
from itertools import compress

##### go here to install guess_language : https://bitbucket.org/spirit/guess_language 

##### instructions for apt-get packages not available 
##### http://askubuntu.com/questions/283020/ubuntu-12-04-package-issues
##### follow https://github.com/ponty/pyunpack to install pyunpack

In [None]:
def check(f):
    """Tell whether *f* names a supported archive.

    Only the final extension of *f* is examined, and the comparison is
    case-sensitive: ``.zip``, ``.rar`` and ``.7z`` are accepted.
    """
    extension = os.path.splitext(f)[1]
    return extension in ('.zip', '.rar', '.7z')
    
def check_dir(folder):
    """Make sure *folder* exists, creating it (and missing parents) if needed.

    Returns True when this call created the directory and False when
    something already existed at that path.
    """
    try:
        # Attempt the creation directly instead of testing for existence
        # first, so there is no window between the check and the creation.
        os.makedirs(folder)
    except FileExistsError:
        return False
    return True
    
def check_srt(f):
    """Tell whether the final extension of *f* is exactly ``.srt``."""
    extension = os.path.splitext(f)[1]
    return extension == '.srt'
def check_ass(f):
    """Tell whether the final extension of *f* is exactly ``.ass``."""
    extension = os.path.splitext(f)[1]
    return extension == '.ass'
    
## use either gbk or big5 or utf-8
def open_srt(f):
    """Open subtitle file *f* with pysrt, trying several encodings in turn.

    The attempts are made in this order: ``encoding='gbk'``,
    ``encoding='big5'``, then a call without an explicit encoding (pysrt's
    own default handling). The first attempt that does not raise wins.

    Returns whatever ``pysrt.open`` produced, or None when every attempt
    failed.
    """
    attempts = ({'encoding': 'gbk'}, {'encoding': 'big5'}, {})
    for open_kwargs in attempts:
        try:
            return pysrt.open(f, **open_kwargs)
        except Exception:
            # Best-effort: any failure just moves on to the next candidate.
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # still propagate, so a long batch run can be interrupted.
            continue
    return None

def validate(srt):
    """Run ``process_language`` over every entry of the subtitle file *srt*.

    Returns a list with one result per entry (each item is whatever
    ``process_language`` returned for it), or None when ``open_srt`` could
    not open the file.
    """
    entries = open_srt(srt)
    if entries is None:
        return None
    return [process_language(entry) for entry in entries]
    
def replace_special(line_str):
    """Strip decorative symbols and inline markup from *line_str*.

    Three substitutions are applied in order: the listed bracket/music/
    punctuation symbols are dropped, then every ``{...}`` span, then every
    ``<...>`` span (both matched non-greedily).
    """
    patterns = (
        '[『』【】「」♫．#♪]',
        r'\{.*?\}',
        r'\<.*?\>',
    )
    cleaned = line_str
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

def check_chinese(text):
    """Return every run of characters in the range U+4E00-U+9FFF found in *text*.

    The result is a list of strings; it is empty when *text* contains no
    such character.
    """
    han_runs = re.compile(r'[\u4e00-\u9fff]+')
    return han_runs.findall(text)
def check_english(text):
    """Return every run of ASCII letters (A-Z, a-z) found in *text*.

    The result is a list of strings; it is empty when *text* contains no
    ASCII letter.
    """
    latin_runs = re.compile(r"[A-Za-z]+")
    return latin_runs.findall(text)

def process_language(ele):
    """Turn one bilingual subtitle entry into an ``english | chinese`` line.

    ``ele.text`` is split on newlines, each line is cleaned with
    ``replace_special`` and labelled with ``guess_language``. Lines labelled
    ``'en'`` and ``'zh'`` are collected separately; lines with any other
    label are ignored.

    Returns the string ``"<english> | <chinese>"``, or None when:
      * the entry lacks an English line or lacks a Chinese line,
      * the joined English text contains characters matched by
        ``check_chinese``,
      * the Chinese text (after ``simplify``) contains letters matched by
        ``check_english``,
      * any step raises an exception.
    """
    try:
        text_list = [replace_special(x) for x in ele.text.split('\n')]
        languages = [guess_language(line) for line in text_list]
        labelled = list(zip(text_list, languages))
        en_lines = [line for line, lang in labelled if lang == 'en']
        zh_lines = [line for line, lang in labelled if lang == 'zh']
        if not en_lines or not zh_lines:
            return None
        en_text = ' '.join(en_lines)
        if check_chinese(en_text):
            return None
        # presumably converts traditional characters to simplified ones
        # (mafan.simplify) -- confirm against the library docs
        zh_text = simplify(' '.join(zh_lines))
        if check_english(zh_text):
            return None
        return en_text + ' | ' + zh_text
    except Exception:
        # Best-effort: a malformed entry yields None instead of aborting the
        # whole file. KeyboardInterrupt/SystemExit are no longer swallowed.
        return None


In [None]:
# Show the current working directory; the folder paths set up in this file
# are relative, so they are resolved against it.
os.getcwd()

In [None]:
# Root folder of this processing batch; every stage folder lives beneath it.
# The trailing slashes matter: file names are appended to these strings by
# plain concatenation elsewhere in this file.
process_folder = 'data/process4/'

data_raw = process_folder + 'data_download/'         # archives to unpack
data_unpack = process_folder + 'data_unpack/'        # extraction target
data_srt = process_folder + 'data_srt/'              # copies of all .srt files
data_ass = process_folder + 'data_ass/'              # copies of all .ass files
results_raw = process_folder + 'data_results_srt/'   # generated .txt files

folders = [process_folder, data_raw, data_unpack, data_srt, data_ass, results_raw]
# Create whichever folders are missing (plain loop: this is done only for
# its side effect, the return values are not needed).
for folder in folders:
    check_dir(folder)

In [None]:
# List the download folder and keep only the entries that `check` accepts
# as archives; the last line shows how many there are.
links = os.listdir(data_raw)
files = list(filter(check, links))
len(files)

In [None]:
### if there are too many files, process only a slice of them first
#files = files[40000:60000]


In [None]:
## unpack all the data
# Best-effort extraction: an archive that cannot be unpacked is recorded and
# skipped so the rest of the batch still runs. Only Exception is caught, so
# the loop can still be interrupted from the keyboard.
failed_archives = []
for f in files:
    try:
        Archive(data_raw + f).extractall(data_unpack)
    except Exception as err:
        failed_archives.append((f, repr(err)))
if failed_archives:
    print('{} of {} archives could not be unpacked'.format(
        len(failed_archives), len(files)))

In [None]:
## get all srt files, including files in subfolders
## get all ass files, including files in subfolders
unpack_links = []   # full path of every extracted file
unpack_names = []   # matching base name of every extracted file
# The loop variables are deliberately not called `path` / `files`, so they do
# not clobber the `path` import from os or the archive list named `files`.
for root, _subdirs, walk_files in os.walk(data_unpack):
    for name in walk_files:
        unpack_links.append(os.path.join(root, name))
        unpack_names.append(name)
#unpack_links = os.listdir(data_unpack)
# Materialise the pairs once so both filters below can iterate over them.
pair = list(zip(unpack_links, unpack_names))
asss = [f for f in pair if check_ass(f[0])]
srts = [f for f in pair if check_srt(f[0])]
# NOTE: files are copied under their base name only, so two files with the
# same name coming from different sub-folders overwrite each other.
for src, name in srts:
    copyfile(src, data_srt + name)
for src, name in asss:
    copyfile(src, data_ass + name)

In [None]:
## search for srts with both chinese and english in it
## and transform them into txt, separated by |, dump to results raw
data_srts = [data_srt + f for f in os.listdir(data_srt)]
counter = 0   # number of output files written so far; also prefixes their names
for srt in data_srts:
    res = validate(srt)
    # Skip files that could not be opened (None) or that have no entries.
    if not res:
        continue
    ## we are only going to use files with both english and chinese in it:
    ## drop the file when more than half of its entries gave no bilingual line
    total = len(res)
    none_values = sum(x is None for x in res)
    ratio = float(none_values) / float(total)
    if ratio > 0.5:
        continue
    # Discard the first and last five entries, then the entries without a
    # bilingual line.
    res = [x for x in res[5:-5] if x is not None]
    if not res:
        # Nothing left after trimming; do not write an empty output file.
        continue
    counter += 1
    filename, file_extension = os.path.splitext(srt)
    fname = os.path.basename(filename)
    fname = results_raw + str(counter) + '_' + fname + '.txt'
    with open(fname, mode='wt', encoding='utf-8') as myfile:
        myfile.write('\n'.join(res))
    #print(res)
    # Progress indicator: report every 50th file written.
    if counter % 50 == 0:
        print(counter)