In [1]:
import requests
import re
import time
import random
import os
import datetime
import xmlrpc.client

### Update

In [19]:
# store dmhy.org history in the local cache, one file per listing page
def update_dmhy_cache(start_page, end_page, timeout=30, max_retries=None):
    """Fetch listing pages start_page..end_page (inclusive) into the local cache.

    Each page is downloaded from share.dmhy.org and handed to ``store_cache``,
    which writes it to ``dmhy/cache/<page>.txt``. A page that fails (network
    error, HTTP error status, or a parsing error raised by ``store_cache``) is
    retried after a pause.

    Parameters:
        start_page: first page number to fetch.
        end_page: last page number to fetch (inclusive).
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.
        max_retries: maximum number of consecutive retries for one page.
            ``None`` (the default) retries without limit; once the limit is
            exceeded the last exception is re-raised.
    """
    n = start_page
    failures = 0  # consecutive failures for the current page
    while n <= end_page:
        print("getting page {} ...".format(n))
        try:
            response = requests.get(
                "https://share.dmhy.org/topics/list/page/{}".format(n),
                timeout=timeout,
            )
            # Surface HTTP errors directly instead of parsing an error page.
            response.raise_for_status()
            store_cache(response.text, 'dmhy/cache/{}.txt'.format(n))  # path: dmhy/cache/n.txt
            n += 1
            failures = 0
        except Exception as e:
            print(datetime.datetime.today(), e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(8)  # back off before retrying the same page
        time.sleep(2+random.random())   # interval between requests
        
def store_cache(content, filename):
    """Extract the release rows of one listing page and write them to `filename`.

    Every ``<tr>`` inside the ``<tbody>`` of `content` becomes one line of the
    form ``time,type,title,magnet,size`` followed by an empty line. Commas in
    the title are replaced by dots so they cannot clash with the separator.
    Rows are written in reverse page order. Raises IndexError when the page
    has no ``<tbody>`` or a row lacks an expected cell.
    """
    tbody = re.findall(r'<tbody>[\s\S]*</tbody>', content)[0]
    lines = []
    for row_html in re.findall(r'<tr[\s\S]*?</tr>', tbody):
        # Flatten the row before splitting it into its <td> cells.
        flat_row = re.sub(r'[\n\t]', '', row_html)
        cells = re.findall(r'<td[\s\S]*?</td>', flat_row)
        fields = [
            re.findall(r'<span.*?>(.*?)</span>', cells[0])[0],             # release time
            re.sub(r'<.*?>', '', cells[1]),                                # release type
            re.findall(r'<a.*?>(.*?)</a>', cells[2])[-1].replace(',', '.'),  # title (last link)
            re.findall(r'href="([^"]*)"', cells[3])[0],                    # first href: magnet
            re.sub(r'<.*?>', '', cells[4]),                                # size
        ]
        lines.append(','.join(fields) + '\n\n')
    with open(filename, 'w', encoding='utf8') as out:
        out.writelines(reversed(lines))

In [20]:
"""
Update the anime records from the local cache, grouped by month.
Records in each monthly file are ordered from oldest to newest.
"""

def update_dmhy(start_page, end_page):
    """Merge cached pages start_page..end_page (inclusive) into the monthly files.

    Pages are processed from the highest page number down to the lowest.
    """
    for page in range(end_page, start_page - 1, -1):
        print("updating page {} ...".format(page))
        update_one_page(page)
        
def update_one_page(n):
    """Append the records of cached page `n` to their monthly files.

    Each record line starts with ``YYYY/MM``; it is appended to
    ``dmhy/YYYY-MM.txt`` (followed by an empty line) unless an identical line
    is already stored for that month.
    """
    with open('dmhy/cache/{}.txt'.format(n), 'r', encoding='utf8') as cache_file:
        # Short lines are the empty separators between records.
        lines = [line for line in cache_file if len(line) > 10]

    seen_by_month = {}
    for line in lines:
        year, month = line[0:4], line[5:7]
        key = '{}/{}'.format(year, month)
        if key not in seen_by_month:
            # First record of this month: load what is already on disk.
            prev_records(year, month, seen_by_month)
        known = seen_by_month[key]
        if line in known:
            continue  # duplicate record, already stored
        known.add(line)
        with open('dmhy/{}-{}.txt'.format(year, month), 'a+', encoding='utf8') as out:
            out.write(line + '\n')
            
def prev_records(year, month, dict_records):
    """Load the lines already stored for one month into `dict_records`.

    ``dict_records['YYYY/MM']`` is set to the set of lines longer than 10
    characters found in ``dmhy/YYYY-MM.txt``, or to an empty set when that
    file does not exist.
    """
    key = '{}/{}'.format(year, month)
    path = 'dmhy/{}-{}.txt'.format(year, month)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf8') as handle:
            dict_records[key] = {line for line in handle if len(line) > 10}
    else:
        dict_records[key] = set()

In [22]:
# When updating in batches (pages 1-100, 101-200, 201-300, ...), run the batches in reverse
# order, i.e. from the oldest pages to the newest; otherwise the order of the records will be
# incorrect. If that happens, delete the file and try again.
# `start` and `end` are the first and last listing page (both inclusive).
start = 1
end = 20
update_dmhy_cache(start,end)

getting page 1 ...
magnet:?xt=urn:btih:OULFEFZT4NFM2CQ2DENB22Q7A2JEPHP4&dn=&tr=http%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=udp%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=http%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=http%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftracker.publicbt.com%3A80%2Fannounce&tr=http%3A%2F%2Ftracker.prq.to%2Fannounce&tr=http%3A%2F%2Fopen.acgtracker.com%3A1096%2Fannounce&tr=https%3A%2F%2Ft-115.rhcloud.com%2Fonly_for_ylbud&tr=http%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=http%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=udp%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftr.bangumi.moe%3A6969%2Fannounce&tr=http%3A%2F%2Ft.nyaatracker.com%2Fannounce&tr=http%3A%2F%2Fopen.nyaatorrents.info%3A6544%2Fanno

KeyboardInterrupt: 

In [15]:
# Merge the cached pages start..end into the monthly record files, highest page number first.
update_dmhy(start,end)

updating page 20 ...
updating page 19 ...
updating page 18 ...
updating page 17 ...
updating page 16 ...
updating page 15 ...
updating page 14 ...
updating page 13 ...
updating page 12 ...
updating page 11 ...
updating page 10 ...
updating page 9 ...
updating page 8 ...
updating page 7 ...
updating page 6 ...
updating page 5 ...
updating page 4 ...
updating page 3 ...
updating page 2 ...
updating page 1 ...


### Search

In [6]:
# read records of the latest n months
def read_history(months):
    """Read the records stored in the newest `months` monthly files under ``dmhy/``.

    Only files named ``YYYY-MM.txt`` are considered; they are visited from the
    newest month to the oldest, and the lines of each file are read from last
    to first. A line is kept when it splits on ``,`` into more than three
    fields and its fourth field (the magnet link) is not empty.

    Returns a list of field lists ``[time, type, title, magnet, size]``.
    """
    month_files = sorted(
        (name for name in os.listdir('dmhy') if re.fullmatch(r'\d{4}-\d{2}\.txt', name)),
        reverse=True,
    )[0:months]
    items = []
    for name in month_files:
        with open('dmhy/'+name, 'r', encoding='utf8') as f:
            records = f.readlines()
        for record in reversed(records):
            # Strip only the line terminator: a final line without a trailing
            # newline must not lose its last character.
            fields = record.rstrip('\n').split(',')
            if len(fields) > 3 and fields[3] != '':
                items.append(fields)
    return items

# search records by rules
def search_history(rules, months):
    """Match every ``{...}`` rule in `rules` against the latest `months` of records.

    Returns a dict keyed by ``'<first title_or pattern> <episode>'`` whose
    values are lists of ``[score, type, title, magnet, size]``, sorted by
    score from highest to lowest.
    """
    history = read_history(months)
    matches_by_key = {}
    for rule_text in re.findall(r'{[\s\S]*?}', rules):
        rule = Rule(rule_text)
        for record in history:
            # A rule is matched against "type title" of the record.
            episode, score = rule.match(record[1] + ' ' + record[2])
            if episode == -1:
                continue
            key = rule.title_or[0] + ' ' + rule.tostr(episode)
            matches_by_key.setdefault(key, []).append([score] + record[1:])
    for matches in matches_by_key.values():
        matches.sort(key=lambda entry: entry[0], reverse=True)
    return matches_by_key

# download search results, return a list of title + magnet links
# idx: for multiple matches
def download(results, idx=0):
    """Pick the records to download from the output of ``search_history``.

    Parameters:
        results: dict mapping a key to a list of
            ``[score, type, title, magnet, size]`` entries, best score first.
        idx: which entry to take for keys that end with an episode number
            (0 = best match); used to choose an alternative release.

    For keys ending in ``*`` every entry is selected. For all other keys the
    entry at position `idx` is selected, provided the best entry has a
    positive score. Keys with fewer entries than `idx` requires are skipped
    with a notice instead of raising IndexError, so one short list does not
    abort the whole batch.

    Returns a list of ``[title, type, release title, magnet, size]``; each
    selected entry is also printed.
    """
    ret = []
    for key, matches in results.items():
        if key[-1] == '*':
            title = key[:-2]  # drop the trailing " *"
            for j in matches:
                print(key, j[0], j[1], j[2], j[-1], sep=',')
                ret.append([title] + j[1:])
            continue
        title = key[:-3]  # drop the trailing " NN" episode suffix
        if not matches or matches[0][0] <= 0:
            continue
        if not -len(matches) <= idx < len(matches):
            print('{}: only {} match(es), idx={} skipped'.format(key, len(matches), idx))
            continue
        j = matches[idx]
        print(key, j[0], j[1], j[2], j[-1], sep=',')
        ret.append([title] + j[1:])
    return ret

class Rule():
    """One search rule parsed from the text of a ``{ ... }`` rule block.

    Attributes:
        title_or: regexes of which at least one must match a record.
        title_and: regexes that each add one point to the score of a match.
        epsode_re: regex whose matches yield the episode number of a record.
        epsodes: accepted episode numbers as ints, or ``['*']`` for any.
    """

    def __init__(self, s):
        """Parse the rule text `s`; every value is written in single quotes.

        The ``title_or`` line is required (IndexError when missing); the
        other lines fall back to built-in defaults.
        """
        quoted = re.compile(r"'(.*?)'")

        self.title_or = quoted.findall(re.findall(r'title_or.*',s)[0])

        and_lines = re.findall(r'title_and.*',s)
        and_line = and_lines[0] if and_lines else r"""'動畫','简|CHS|GB','1080'"""
        self.title_and = quoted.findall(and_line)

        re_lines = re.findall(r'epsode_re.*',s)
        re_line = re_lines[0] if re_lines else r"""'[^a-zA-Z0-9](\d\d)[^a-zA-Z0-9]'"""
        re_values = quoted.findall(re_line)
        self.epsode_re = re_values[0] if re_values else ''

        # The last "epsodes" line wins when several are present.
        ep_lines = re.findall(r'epsodes.*',s)
        ep_line = ep_lines[-1] if ep_lines else r"""'*'"""
        self.epsodes = []
        for token in quoted.findall(ep_line):
            if token == '*':
                # Wildcard overrides everything collected so far.
                self.epsodes = ['*']
                break
            bounds = token.split('-')
            if len(bounds) == 1:
                self.epsodes.append(int(token))
            elif len(bounds) == 2:
                # 'a-b' expands to every number from a to b inclusive.
                self.epsodes.extend(range(int(bounds[0]), int(bounds[1]) + 1))

    def show(self):
        """Return the four rule values as quoted, comma-separated strings."""
        def quote(text):
            return "'" + text + "'"

        return [
            ','.join(quote(pattern) for pattern in self.title_or),
            ','.join(quote(pattern) for pattern in self.title_and),
            quote(self.epsode_re),
            ','.join(quote(self.tostr(number)) for number in self.epsodes),
        ]

    def tostr(self,n):
        """Format an episode number with at least two digits; '*' is returned as is."""
        if n == '*':
            return n
        return ('0' if n < 10 else '') + '{}'.format(n)

    def store(self):
        """Serialize the rule back into the ``{ ... }`` text form."""
        title_or, title_and, epsode_re, epsodes = self.show()
        return ''.join([
            '{\n\ttitle_or = ', title_or,
            '\n\ttitle_and = ', title_and,
            '\n\tepsode_re = ', epsode_re,
            '\n\tepsodes = ', epsodes,
            '\n}',
        ])

    def match(self,s):
        """Match the text `s` against this rule.

        Returns ``(episode, score)`` where score is the number of matching
        ``title_and`` regexes and episode is ``'*'`` for wildcard rules, or
        ``(-1, 0)`` when no ``title_or`` regex matches or the episode found
        in `s` is not one of the accepted numbers.
        """
        or_hits = sum(1 for pattern in self.title_or if re.search(pattern, s))
        and_hits = sum(1 for pattern in self.title_and if re.search(pattern, s))
        if or_hits == 0:
            return -1, 0
        if '*' in self.epsodes:
            return '*', and_hits
        found = re.findall(self.epsode_re, s)
        # Only the last candidate in the text is considered.
        if found and re.match(r'\d\d', found[-1]):
            number = int(found[-1])
            if number in self.epsodes:
                return number, and_hits
        return -1, 0

    def delete(self, epsode):
        """Remove every occurrence of `epsode` from the accepted episodes."""
        for _ in range(self.epsodes.count(epsode)):
            self.epsodes.remove(epsode)

In [20]:
# title_or: a record is accepted if at least one of these regexes matches. Must not be empty.
# title_and: the score starts at 0 and every matching regex adds 1 point. Only items with a positive score are downloaded.
# epsode_re: regex that extracts the episode number from the title. Only two-digit numbers are supported.
# epsodes: list of episode numbers to download. '01-03' means '01','02','03'. '*' means the episode number is ignored.

# Raw string: the regex below contains backslashes (\d) that must reach the rule parser unchanged.
rules = r"""
{
	title_or = '美少年','Bishounen'
	title_and = '動畫','简|CHS|GB','简|CHS|GB','1080'
	epsode_re = '[^a-zA-Z0-9](\d\d)[^a-zA-Z0-9]'
	epsodes = '05'
}
"""

# Search the 10 newest monthly files with the rules above, then select what to download:
# idx=0 takes the best-scored match for each episode key.
results = search_history(rules,10) 
download_list = download(results,idx=0)

美少年 05,1,動畫,[NC-Raws] 美少年侦探团 / Bishounen Tanteidan - 05 [B-Global][WEB-DL][720p][AVC AAC][ENG_TH_SRT][MKV],297.5MB


In [16]:
# Show the raw search results: {key: [[score, type, title, magnet, size], ...]}
results

{'美少年 05': [[1,
   '動畫',
   '[NC-Raws] 美少年侦探团 / Bishounen Tanteidan - 05 [B-Global][WEB-DL][720p][AVC AAC][ENG_TH_SRT][MKV]',
   'magnet:?xt=urn:btih:D2QFMJAZV4TFSYAL5L2GW57H4VIADVWD&dn=&tr=http%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=udp%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=http%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=http%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftracker.publicbt.com%3A80%2Fannounce&tr=http%3A%2F%2Ftracker.prq.to%2Fannounce&tr=http%3A%2F%2Fopen.acgtracker.com%3A1096%2Fannounce&tr=https%3A%2F%2Ft-115.rhcloud.com%2Fonly_for_ylbud&tr=http%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=http%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=udp%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftr.bangumi.moe%3A69

In [21]:
# Queue every selected release in aria2 through its XML-RPC interface.
# https://aria2.github.io/manual/en/html/aria2c.html#rpc-interface

# NOTE(review): the RPC endpoint and the download root below are hard-coded for one machine.
s = xmlrpc.client.ServerProxy('http://192.168.3.10:6800/rpc')
for i in download_list:
    # i = [title, type, release title, magnet, size]: i[3] is the magnet link and
    # i[0] names the per-title target directory.
    a = s.aria2.addUri([i[3]],{'dir': '/mnt/anime/{}'.format(i[0])})
    print(a)  # value returned by addUri for the queued download

02007f1cd5e7772f


In [23]:
# Query the status of one queued download.
# NOTE(review): the GID is hard-coded; it is the value printed by the addUri cell in an
# earlier run and has to be replaced with the GID of the current session.
print(s.aria2.tellStatus('02007f1cd5e7772f'))

{'bittorrent': {'announceList': [['http://104.238.198.186:8000/announce'], ['udp://104.238.198.186:8000/announce'], ['http://tracker.openbittorrent.com:80/announce'], ['udp://tracker3.itzmx.com:6961/announce'], ['http://tracker4.itzmx.com:2710/announce'], ['http://tracker.publicbt.com:80/announce'], ['http://tracker.prq.to/announce'], ['http://open.acgtracker.com:1096/announce'], ['https://t-115.rhcloud.com/only_for_ylbud'], ['http://tracker1.itzmx.com:8080/announce'], ['http://tracker2.itzmx.com:6961/announce'], ['udp://tracker1.itzmx.com:8080/announce'], ['udp://tracker2.itzmx.com:6961/announce'], ['udp://tracker3.itzmx.com:6961/announce'], ['udp://tracker4.itzmx.com:2710/announce'], ['http://tr.bangumi.moe:6969/announce'], ['http://t.nyaatracker.com/announce'], ['http://open.nyaatorrents.info:6544/announce'], ['http://t2.popgo.org:7456/annonce'], ['http://share.camoe.cn:8080/announce'], ['http://opentracker.acgnx.se/announce'], ['http://tracker.acgnx.se/announce'], ['http://nyaa.tra