In [1]:
import requests
from bs4 import BeautifulSoup
from tld import get_tld
import re
from tqdm import tqdm
import pandas as pd
import pinyin
from hanziconvert import HanziConv
from numpy import np
import chinese


def findWords(list_of_sentences,list_of_keywords):

    regex = re.compile('.*(' + '|'.join(list_of_keywords) + ').*',re.IGNORECASE)
    return list(filter(regex.match,list_of_sentences))

def findWordsStrict(list_of_sentences,list_of_keywords):

    regex = re.compile(r'.*\W(' + '|'.join(list_of_keywords) + r')\W.*',re.IGNORECASE)
    return list(filter(regex.match,list_of_sentences))

class website():
    
    def __init__(self,url):
        
        self.url = url
        self.domain = get_tld(url,as_object = True).fld
        self.site = BeautifulSoup(requests.get(url).text,'html.parser')
        self.href = [a['href'] for a in self.site(href=True)]
        
    def cleanHref(self,href):
        
        if href[0] == '#': return re.split('#[^#]*$',self.url)[0] + href
        elif href[0] == '/': return 'http://' + self.domain + href
        else: return href
        
    def getHref(self,hrefs = ''):
        
        if hrefs == '': hrefs = self.href
            
        self.selfHref = []
        self.intHref = []
        self.extHref = []

        for href in hrefs:

            try:
                if href[0] == '#': self.selfHref.append(re.split('#[^#]*$',self.url)[0]+href)
                elif href[0] == '/': self.intHref.append('http://'+self.domain+href)
                else: self.extHref.append(href)
            except: pass

        self.selfHref = list(set(self.selfHref))
        self.intHref = list(set(self.intHref))
        self.extHref = list(set(self.extHref))

    def getTables(self,header=True):
        
        tables = self.site('table')
        self.dfs = []
        
        for table in tables:
            
            try:
                
                headers = []
                for cell in table('tr')[0](['td','th']):
                    try: headers.extend([cell.getText().replace(r'\n',' ').strip()] * int(cell['colspan']))
                    except: headers.append(cell.getText().replace(r'\n',' ').strip())

                contents = []
                for row in table('tr')[1:]:
                    content = []
                    for cell in row(['td','th']):
                        try: toAdd = cell.getText() + '\n\n' + self.cleanHref(cell.find(href=True)['href'])
                        except: toAdd = cell.getText()
                        try: content.extend([toAdd] * int(cell['colspan']))
                        except: content.append(toAdd)
                    contents.append(content)

                if header == True:
                    self.dfs.append(pd.DataFrame(contents,columns=headers).applymap(lambda x: x.replace(r'\n',' ').strip()))
                    
                else:
                    data = [headers]
                    data.extend(contents)
                    self.dfs.append(pd.DataFrame(data))
                
            except: pass
    
    def combineTables(self):
        
        self.combinedTables = []
        self.combinedTablesByIndex = {}
        indices = {}
        
        for df in self.dfs:
            if ', '.join(df.columns) in indices: indices[', '.join(df.columns)].append(df)
            else: indices[', '.join(df.columns)] = [df]
                
        for index in indices:
            self.combinedTablesByIndex[index] = pd.concat(indices[index],ignore_index=True)
            self.combinedTables.append(pd.concat(indices[index],ignore_index=True))
            
            
    def getLists(self):
        
        lists = self.site(['ol','ul'])
        self.lists = []
        
        for lst in lists:
            content = []
            for li in lst('li'):
                try: content.append(li.getText() + '\n\n' + self.cleanHref(li.find(href=True)['href']))
                except: content.append(li.getText())
            self.lists.append(content)
            
    def getRelevant(self,keywords=[''],keepNewline = False,keepDuplicates = False):
        
        try:
            regex = re.compile('(' + '|'.join(keywords) + ')',re.IGNORECASE)

            hrefs = []
            self.relevantText = []
            for relevantText in self.site(string = regex):
                try:
                    hrefs.extend([href['href'] for href in relevantText.parent.parent(href=True)])
                    if keepNewline: self.relevantText.append(relevantText.parent.getText())
                    else: self.relevantText.append(re.sub('\n+',';',relevantText.parent.getText().strip('\n')))
                except: pass

            self.getHref(hrefs)

            if keepDuplicates == False:
                self.relevantText = list(set(self.relevantText))
        
        except SyntaxError: print("'keywords' must be a list.")
            
class spiderman(website):
    
    def __init__(self):
        
        self.urls = []
        self.keywords = []
        self.searchTypes = []
    
    def addUrl(self,url):
        self.urls.append(url)
        
    def addKeyword(self,keyword):
        self.keywords.append(keyword)
        
    def delUrl(self,url):
        try: self.urls.remove(url)
        except: print("Can't for some reason")
        
    def delKeyword(self,keyword):
        try: self.keywords.remove(keyword)
        except: print("Can't for some reason")
        
    def spiderman(self,keywords=[''],combine=True,limit=20):
        
        self.dfs = []
        self.lists = []
        self.texts = {}
        counter = len(self.urls)
        
        for url in tqdm(self.urls):
            
            try:
                site = website(url)
                site.getTables()
                site.getLists()
                site.getRelevant(keywords)

                if counter < limit:

                    self.urls.extend(site.intHref)
                    self.urls.extend(site.extHref)
                    self.urls = list(set(self.urls))

                self.dfs.extend(site.dfs)
                self.lists.extend(site.lists)
                
                if url not in self.texts:
                    self.texts[url] = site.relevantText
                else:
                    self.texts[url].extend(site.relevantText)

                counter += 1

            except: print(url + ' cannot be accessed.')
        
        if combine:
            self.combineTables()



ModuleNotFoundError: No module named 'hanziconvert'

In [2]:
spider = spiderman()
spider.addUrl('https://www.bbc.com/zhongwen/simp')

spider.spiderman(keywords=[r'[\u4e00-\u9fff]+'],limit=200)

98it [03:06,  1.90s/it]                      


In [3]:
texts = []
for subline in [*spider.texts.values()]:
    texts.extend(subline)
texts = list(set(texts))

In [4]:
analyzer = chinese.ChineseAnalyzer()
tokens = {}
for i in tqdm(spider.texts):
    for sentence in spider.texts[i]:
        results = analyzer.parse(re.sub('[^\u4e00-\u9fff]+','',sentence))
        for token in results.tokens():
            if token in tokens: tokens[token] += 1
            else: tokens[token] = 1

100%|██████████| 97/97 [00:01<00:00, 49.32it/s]


In [25]:
df = pd.read_csv('Documents/Data/Vocab.csv')
chinese_words = list(set(df[df['Language']=='中文']['Word']))

In [26]:

def tryAdd(x,y):
    if pd.notnull(x):
        return x+y
    else: return y
for token in tqdm(tokens):
    chinese_words = list(set(df[df['Language']=='中文']['Word']))
    if token not in chinese_words:
        try: usages = analyzer.parse(token)[token][0].definitions
        except: usages = np.nan
        toAdd = pd.DataFrame([['中文',HanziConv.toSimplified(token),np.nan,np.nan,np.nan,usages]],columns=['Language','Word','Chinese Translit','Transliteration','Purposes','Usages'])
        df = df.append(toAdd)
        
    df.loc[df['Word'] == HanziConv.toSimplified(token),'Importance'] = df.loc[df['Word'] == HanziConv.toSimplified(token),'Importance'].apply(lambda x: tryAdd(x,tokens[token]))
    

100%|██████████| 12029/12029 [06:35<00:00, 30.41it/s]


In [28]:
import pinyin
def tryPinyin(x):
    try: return pinyin.get(x)
    except: return x

def tryTraditional(x):
    try: return HanziConv.toTraditional(x)
    except: return x
    
def tryJoin(x):
    try: 
        if type(x)==list: return ', '.join(x)
        else:
            found = re.findall("'([^']*)'",x)
            if len(found) == 0:
                return x
            else: return ', '.join(found)
    except: return x
    
def tryStrip(x):
    try: return x.strip()
    except: return x

df.loc[df['Language']=='中文','Transliteration'] = df.loc[df['Language']=='中文','Word'].apply(tryPinyin)
df.loc[df['Language']=='中文','Chinese Translit'] = df.loc[df['Language']=='中文','Word'].apply(tryTraditional)
df['Usages'] = df['Usages'].apply(tryJoin)
df = df.applymap(tryStrip)
df = df[~((df['Language']=='中文') & (pd.isnull(df['Usages'])) & (pd.isnull(df['Purposes'])))]


In [3]:
from hanziconvert import HanziConv

ModuleNotFoundError: No module named 'hanziconvert'

In [2]:
import chinese
analyzer = chinese.ChineseAnalyzer()

In [5]:
analyzer.parse('こんなにも数多くの皆様が支えてく').tokens()

['こ', 'ん', 'な', 'に', 'も', '数多', 'く', 'の', '皆', '様', 'が', '支', 'え', 'て', 'く']

In [9]:
site = spiderman.website('https://www.larousse.fr/dictionnaires/francais/pis/61142?q=pis#60742')

In [7]:
from wilcoxon import *

In [10]:
site.site


<!DOCTYPE html>

<html itemscope="" itemtype="http://schema.org/WebPage" lang="fr">
<!-- Larousse.fr, HLVELPDW1, 27/08/2020 10:43:56 -->
<script>
    dataLayer = [{'lang':"fr", 'page_type':"editorial", 'page_category':"langue francaise", 'page_subcategory':"dictionnaire de francais"}];         // Problème compatiblitiy [<head></head>] & [Modify by C#]
</script>
<head id="Head1" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#"><meta charset="utf-8"/><meta content="Éditions Larousse" name="author"/><meta content="width=device-width, initial-scale=1" http-equiv="Content-Type" name="viewport"/><title>
	Définitions : pis - Dictionnaire de français Larousse
</title>
<!-- intégration[v.PROD]: régisPub + CNIL(v.OneTrust) -->
<!-- ################################################ -->
<!-- Démarrage de notification de consentement aux cookies de OneTrust -->
<!-- ----------------------------------------------------------------- -->
<!-- call OneTrust [v.