In [1]:
import requests
import re
import time
import random
import os
import datetime

In [2]:
def update_dmhy_cache(start_page, end_page, timeout=30, max_retries=None):
    """Download listing pages and cache the parsed releases of each one.

    Pages ``start_page`` .. ``end_page`` (inclusive) are fetched in ascending
    order; every page is parsed and written by ``store_cache`` to
    ``dmhy/cache/<page>.txt``. A page that fails (network error, HTTP error
    status, or a parsing error raised by ``store_cache``) is retried after a
    pause.

    Args:
        start_page: first page number to fetch.
        end_page: last page number to fetch (inclusive).
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.
        max_retries: maximum number of consecutive failures tolerated for a
            single page before the last exception is re-raised. ``None``
            (the default) retries without limit.

    Raises:
        Exception: the last error of a page, only when ``max_retries`` is set
            and has been exceeded.
    """
    n = start_page
    failures = 0  # consecutive failures for the page currently being fetched
    while n <= end_page:
        print("getting page {} ...".format(n))
        try:
            response = requests.get(
                "https://share.dmhy.org/topics/list/page/{}".format(n),
                timeout=timeout,
            )
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
            store_cache(response.text, 'dmhy/cache/{}.txt'.format(n))
            n += 1
            failures = 0
        except Exception as e:
            print(datetime.datetime.today(), e)
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off a little longer before retrying the same page.
            time.sleep(8)
        # Randomised pause between requests.
        time.sleep(2+random.random())
        
def store_cache(content, filename):
    """Parse one listing page and write its releases to ``filename``.

    The first ``<tbody>...</tbody>`` section of ``content`` is split into
    ``<tr>`` rows and each row into ``<td>`` cells. From the cells the
    following fields are taken:

    * cell 0: text of the first ``<span>``   -> release time
    * cell 1: cell text with tags stripped   -> release type
    * cell 2: text of the last ``<a>``       -> release title
    * cell 3: last ``href`` value            -> magnet link
    * cell 4: cell text with tags stripped   -> size

    Each release becomes one comma-joined line followed by a blank line, and
    the lines are written in reverse page order. The parent directory of
    ``filename`` is created when it does not exist.

    NOTE(review): fields are joined with ',' without escaping, so a title
    containing a comma yields more than five columns in the cache line.

    Args:
        content: HTML text of a listing page.
        filename: path of the cache file to (over)write.

    Raises:
        IndexError: if ``content`` has no ``<tbody>`` section, or a row has
            fewer cells / tags than expected.
    """
    tables = re.findall(r'<tbody>[\s\S]*</tbody>', content)
    if not tables:
        # Same exception type a bare [0] lookup raises, with a usable message.
        raise IndexError("no <tbody>...</tbody> section found in page content")
    rows = re.findall(r'<tr[\s\S]*?</tr>', tables[0])
    new_items = []
    for row in rows:
        detail = re.findall(r'<td[\s\S]*?</td>', re.sub(r'[\n\t]', '', row))  # cols in a row
        release_time = re.findall(r'<span.*?>(.*?)</span>', detail[0])[0]
        release_type = re.sub(r'<.*?>', '', detail[1])
        release_title = re.findall(r'<a.*?>(.*?)</a>', detail[2])[-1]
        release_magnet = re.findall(r'href="([^"]*)"', detail[3])[-1]
        release_size = re.sub(r'<.*?>', '', detail[4])
        new_items.append(','.join([release_time, release_type, release_title, release_magnet, release_size])+'\n\n')
    # Make sure the cache directory exists before the first write.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, 'w', encoding='utf8') as f:
        f.writelines(reversed(new_items))

In [3]:
def update_dmhy(start_page, end_page):
    """Merge cached pages into the monthly files, highest page number first.

    Pages ``end_page`` down to ``start_page`` (both inclusive) are handed to
    ``update_one_page`` one after another.
    """
    for page in range(end_page, start_page - 1, -1):
        print("updating page {} ...".format(page))
        update_one_page(page)
        
def update_one_page(n):
    """Append the not-yet-stored records of cached page ``n`` to the monthly files.

    Lines of ``dmhy/cache/<n>.txt`` longer than 10 characters are treated as
    records. Characters 0-3 and 5-6 of a record give the year and month that
    select the target file ``dmhy/<year>-<month>.txt``. The lines already in
    that file are loaded once per month through ``prev_records``; a record is
    appended (followed by an extra newline) only when it is not among them.
    """
    cache_path = 'dmhy/cache/{}.txt'.format(n)
    with open(cache_path, 'r', encoding='utf8') as cache_file:
        lines = [line for line in cache_file.readlines() if len(line) > 10]

    seen_by_month = {}
    for line in lines:
        year, month = line[0:4], line[5:7]
        month_key = '{}/{}'.format(year, month)
        if month_key not in seen_by_month:
            # First record of this month on the page: load what is stored already.
            prev_records(year, month, seen_by_month)
        known = seen_by_month[month_key]
        if line in known:
            continue
        known.add(line)
        with open('dmhy/{}-{}.txt'.format(year, month), 'a+', encoding='utf8') as out:
            out.write(line + '\n')
            
def prev_records(year, month, dict_records):
    """Load the lines already stored for one month into ``dict_records``.

    ``dict_records['<year>/<month>']`` is set to the set of lines longer than
    10 characters read from ``dmhy/<year>-<month>.txt``, or to an empty set
    when that file does not exist. Nothing is returned.
    """
    key = '{}/{}'.format(year, month)
    path = 'dmhy/{}-{}.txt'.format(year, month)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf8') as month_file:
            dict_records[key] = {line for line in month_file if len(line) > 10}
    else:
        dict_records[key] = set()

In [4]:
# Page range (inclusive) of the listing to download into the page cache.
start, end = 1, 9
update_dmhy_cache(start, end)

getting page 1 ...
getting page 2 ...
getting page 3 ...
getting page 4 ...
getting page 5 ...
getting page 6 ...
getting page 7 ...
getting page 8 ...
getting page 9 ...


In [5]:
# Merge the cached pages into the per-month files; update_dmhy walks the pages from `end` down to `start`.
update_dmhy(start,end)

updating page 9 ...
updating page 8 ...
updating page 7 ...
updating page 6 ...
updating page 5 ...
updating page 4 ...
updating page 3 ...
updating page 2 ...
updating page 1 ...


In [139]:
def read_history(months):
    """Read the records of the most recent monthly files.

    Files in ``dmhy/`` named ``YYYY-MM.txt`` are sorted in descending name
    order and the first ``months`` of them are read. Within each file the
    lines are visited last-to-first.

    A record line is split on ','. When a line has more than five columns the
    surplus is assumed to come from commas inside the title (column 2), so the
    first two and the last two columns are kept and everything in between is
    re-joined into the title. Records with fewer than four columns or an empty
    column 3 (magnet link) are skipped.

    Args:
        months: number of monthly files to read.

    Returns:
        list of lists of str: ``[time, type, title, magnet, size]`` per record.
    """
    month_files = sorted(
        (name for name in os.listdir('dmhy') if re.fullmatch(r'\d{4}-\d{2}\.txt', name)),
        reverse=True,
    )[0:months]
    items = []
    for name in month_files:
        with open('dmhy/' + name, 'r', encoding='utf8') as f:
            records = f.readlines()
        for record in reversed(records):
            # Strip only the newline, so a final line without one keeps its last character.
            fields = record.rstrip('\n').split(',')
            if len(fields) > 5:
                fields = fields[:2] + [','.join(fields[2:-2])] + fields[-2:]
            if len(fields) > 3 and fields[3] != '':
                items.append(fields)
    return items

class Rule():
    """One search rule parsed from a ``{ ... }`` text block.

    The block is expected to hold four ``key = 'value','value',...`` lines.
    Attributes set by the constructor:

    * ``title_or``  - list of regex patterns; at least one must be found.
    * ``title_and`` - list of regex patterns; every one must be found.
    * ``epsode_re`` - regex used to pull the episode number out of a title.
    * ``epsodes``   - list of wanted episode numbers (ints), or ``['*']``
      meaning any episode.
    """

    def __init__(self, s):
        """Parse the rule text ``s``; a missing key raises IndexError."""
        quoted = r"'(.*?)'"
        self.title_or = re.findall(quoted, re.findall(r'title_or.*', s)[0])
        self.title_and = re.findall(quoted, re.findall(r'title_and.*', s)[0])
        self.epsode_re = re.findall(quoted, re.findall(r'epsode_re.*', s)[0])[0]
        wanted = []
        for spec in re.findall(quoted, re.findall(r'epsodes.*', s)[-1]):
            if spec == '*':
                # A wildcard overrides whatever was listed before it.
                wanted = ['*']
                break
            bounds = spec.split('-')
            if len(bounds) == 1:
                wanted.append(int(spec))
            elif len(bounds) == 2:
                # 'a-b' is an inclusive range.
                wanted.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        self.epsodes = wanted

    def show(self):
        """Return the four rule values as quoted, comma-joined strings."""
        def quote(text):
            return "'" + text + "'"

        or_part = ','.join([quote(pattern) for pattern in self.title_or])
        and_part = ','.join([quote(pattern) for pattern in self.title_and])
        episode_part = ','.join([quote(self.tostr(number)) for number in self.epsodes])
        return [or_part, and_part, quote(self.epsode_re), episode_part]

    def tostr(self,n):
        """Format an episode value: '*' is returned as is, numbers below 10 get a leading '0'."""
        if n == '*':
            return n
        prefix = '0' if n < 10 else ''
        return prefix + '{}'.format(n)

    def store(self):
        """Serialise the rule back into its ``{ ... }`` text form."""
        title_or, title_and, epsode_re, epsodes = self.show()
        return ''.join([
            '{\n\ttitle_or = ', title_or,
            '\n\ttitle_and = ', title_and,
            '\n\tepsode_re = ', epsode_re,
            '\n\tepsodes = ', epsodes,
            '\n}',
        ])

    def match(self,s):
        """Test the text ``s`` against the rule.

        Returns ``'*'`` when the titles match and the rule accepts any
        episode, the episode number (int, taken from the last ``epsode_re``
        match) when it is one of the wanted episodes, and ``-1`` otherwise.
        """
        # Every pattern is evaluated (no short-circuit), exactly one search per pattern.
        or_hits = [len(re.findall(pattern, s)) > 0 for pattern in self.title_or]
        and_hits = [len(re.findall(pattern, s)) > 0 for pattern in self.title_and]
        if not any(or_hits) or not all(and_hits):
            return -1
        if '*' in self.epsodes:
            return '*'
        found = re.findall(self.epsode_re, s)
        if found:
            number = int(found[-1])
            if number in self.epsodes:
                return number
        return -1

def search_history(rules, months):
    """Apply every rule in ``rules`` to the recent history and group the hits.

    Args:
        rules: text containing one or more ``{ ... }`` rule blocks, each
            parsed by ``Rule``.
        months: number of monthly files to search, passed to ``read_history``.

    Returns:
        dict mapping ``"<first title_or entry> <episode>"`` to the list of
        matching records, where ``<episode>`` is the value returned by
        ``Rule.match`` formatted with ``Rule.tostr``. A record is tested
        against its type and title columns joined by a space.
    """
    items = read_history(months)
    results = {}
    # A rule block ends at the first '}' that is outside a single-quoted value,
    # so a quantifier such as \d{2} inside epsode_re does not cut the block short.
    for rule_text in re.findall(r"\{(?:'[^']*'|[^'}])*\}", rules):
        rule = Rule(rule_text)
        for item in items:
            idx = rule.match(item[1] + ' ' + item[2])
            if idx != -1:
                key = rule.title_or[0] + ' ' + rule.tostr(idx)
                results.setdefault(key, []).append(item)
    return results

In [146]:
# One {...} block per show:
#   title_or  - regex patterns, at least one must be found in "<type> <title>"
#   title_and - regex patterns, all of them must be found
#   epsode_re - regex whose last match is converted to the episode number
#   epsodes   - wanted episodes: single numbers, inclusive 'a-b' ranges, or '*' for any
# Raw string: the patterns contain backslashes (\d) that must reach `re` unchanged.
rules = r"""
{
    title_or = '进击的巨人','Shingeki no Kyojin'
    title_and = '動畫','简|CHS|GB','1080'
    epsode_re = '[^a-zA-Z0-9](\d\d)[^a-zA-Z0-9]'
    epsodes = '65-67','68'
}
{
    title_or = '怪物事变','Kemono Jihen'
    title_and = '動畫','简|CHS|GB','1080'
    epsode_re = '[^a-zA-Z0-9](\d\d)[^a-zA-Z0-9]'
    epsodes = '*'
}
"""

# Search the 4 most recent monthly files.
rr = search_history(rules,4)

In [147]:
# Show the releases collected under one result key; keys have the form "<first title_or entry> <episode>".
rr['怪物事变 *']

[['2021/02/08 14:15',
  '動畫',
  '【喵萌奶茶屋】★01月新番★[怪物事变/Kemono Jihen][05][1080p][简体][招募翻译校对]',
  'magnet:?xt=urn:btih:5EXB76EWKUSFAWHXCAWDGWX7LFEQH7LI&dn=&tr=http%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=udp%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=http%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=http%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftracker.publicbt.com%3A80%2Fannounce&tr=http%3A%2F%2Ftracker.prq.to%2Fannounce&tr=http%3A%2F%2Fopen.acgtracker.com%3A1096%2Fannounce&tr=https%3A%2F%2Ft-115.rhcloud.com%2Fonly_for_ylbud&tr=http%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=http%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=udp%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftr.bangumi.moe%3A6969%2Fannounce&tr=http%3A%2F%2Ft.nya