# <font color='purple'>Web Scraping</font>

### Imports

In [3]:
import pickle
import random
import warnings
import wikipedia
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup
import urllib

from nltk import sent_tokenize, tokenize
from nltk.corpus import wordnet, gutenberg

tqdm.pandas()
warnings.filterwarnings("ignore")

# nltk.download('wordnet')
# nltk.download('gutenberg')

In [4]:
NO_OF_SENTENCES = 400

ANOLOGY_DATASET = './../data/Analogy_dataset.txt'
VALIDATION_DATASET = './../data/Validation.txt'
OUTPUT_DIR = './../data/pickle/'

### Functions

In [5]:
def scrape_data_from_tables(url, col_size=3, concatenate_string='', name=None):
    sentences = []
    html = urllib.request.urlopen(url).read()
    table = BeautifulSoup(html, "html.parser").find('table')

    for row in tqdm(table.find_all('tr')):
        cols = row.find_all('td')
        if len(cols) == col_size:
            if(name=='tenses'):
                sentences.append('Present: ' + cols[0].text.strip() + ' Past: ' +  cols[1].text.strip() + ' Past participle: ' +  cols[2].text.strip())
            else:
                sentences.append(cols[1].text.strip() + concatenate_string + cols[0].text.strip())    
    
    return sentences
            

def get_top_n_sentences(word1, word2, tokens, length_threshold=200):
    sentences = []
    count = 0
    count2 = 0
    for sentence in tokens:
        is_word1 = (word1 in sentence)
        is_word2 = (word2 in sentence)
        if(len(sentence)<length_threshold):
            if(is_word1 and is_word2):
                sentences.insert(0,sentence)
                count +=1
            elif(is_word1):
                sentences.insert(count, sentence)
                count2 += 1
            elif(is_word2):
                sentences.insert(count+count2,sentence)
            else:
                sentences.append(sentence)
    if(len(sentences)>NO_OF_SENTENCES):
        return sentences[:NO_OF_SENTENCES]
    return sentences


def fetch_states_information(url):
    sentences = []
    html = urllib.request.urlopen(url).read()
    tables = BeautifulSoup(html, "html.parser").find_all('table',{'class':'wikitable sortable plainrowheaders'})

    sentences = []
    for table in tqdm(tables):
        for row in table.find_all('tr'):
            value = row.find_all('th')
            if(len(value)==1):
                name = value[0].text.strip()
            cols = row.find_all('td')
            length = len(cols)
            if(length>7):
                sentences.append(name + ' capital is ' + cols[3].text.strip())
                sentences.append(name + ' zone is ' + cols[2].text.strip())
                sentences.append(name + ' abbreviation is ' + cols[1].text.strip())
                sentences.append(name + ' langauge is ' + cols[length-2].text.strip())
    return sentences



def get_sentences(word1, word2):
    try:
        summary = wikipedia.summary(word1)
        sentences = tokenize.sent_tokenize(summary.lower())
    except wikipedia.DisambiguationError as e:
        word = random.choice(e.options)
        try:
            summary = wikipedia.summary(word)
            sentences = tokenize.sent_tokenize(summary.lower())
        except:
            sentences = []
    except:
        sentences = []

    return get_top_n_sentences(word1, word2, sentences)


def get_wiki_sentences(word1, word2):
    sentences = []
    url = 'https://en.wikipedia.org/wiki/' + word1
    html = urllib.request.urlopen(url).read()
    table = BeautifulSoup(html, "html.parser")

    raw = BeautifulSoup.get_text(table)
    raw = raw.replace("\n", "")
    raw = raw.lower()

    sentences = sent_tokenize(raw)
    return get_top_n_sentences(word1, word2, sentences)


def get_wordnet_sentences(word):
    sentences = []
    syns = wordnet.synsets(word)
    for val in syns:
        sentences += val.examples()
        
    ## If no examples found, get synonyms and antonyms from wordnet
    if(len(sentences)<3):
        result = ''
        for val in syns:
            for l in val.lemmas():
                result += l.name() + ' '
                if l.antonyms():
                    result += l.antonyms()[0].name() + ' '
        sentences.append(result)
        
    return sentences

## <font color='dark-orange'>Web Scraping from URLs

In [6]:
nation_capitals_url = 'https://geographyfieldwork.com/WorldCapitalCities.htm'
currencies_url      = 'https://www.jagranjosh.com/general-knowledge/list-of-countries-and-currencies-of-the-world-1662462803-1'
nation_states_url   = 'https://en.wikipedia.org/wiki/States_and_union_territories_of_India'
tenses_url          = 'https://kpu.pressbooks.pub/effectiveenglish/chapter/__unknown__/'
single_plural_url   = 'https://www.englishbix.com/singular-and-plural-verbs-examples/'

In [7]:
sentences = []
sentences += scrape_data_from_tables(nation_capitals_url, col_size=2, concatenate_string=' capital is ')
sentences += scrape_data_from_tables(currencies_url, col_size=2, concatenate_string=' currency is ')
sentences += scrape_data_from_tables(tenses_url, col_size=3, concatenate_string='', name='tenses')
sentences += scrape_data_from_tables(tenses_url, col_size=3, concatenate_string='', name='plural')
sentences += fetch_states_information(nation_states_url)
print("Scraped sentences :" + str(len(sentences)))

### Saving Results
with open(OUTPUT_DIR + 'url_sentences_v0.pickle', 'wb') as file:
    pickle.dump(sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 35658.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 196/196 [00:00<00:00, 19645.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [00:00<00:00, 26317.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [00:00<00:00, 49167.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Scraped sentences :716





## <font color='dark-orange'>Wikipedia Scraping

In [8]:
### Get Information from wikipedia library
wiki_sentences = []

df = pd.read_csv(ANOLOGY_DATASET, sep=' ', names=['w1','w2','w3','w4'])
df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
df['w2'] = df['w2'].apply(lambda x : x.lower())

df['sentences1'] = df[['w1','w2']].progress_apply(lambda x : get_sentences(x['w1'],x['w2']), axis=1)
df['sentences2'] = df[['w1','w2']].progress_apply(lambda x : get_sentences(x['w2'],x['w1']), axis=1)

wiki_sentences += df.sentences1.sum()
wiki_sentences += df.sentences2.sum()

print("Scraped sentences :" + str(len(wiki_sentences)))


### Saving the Results
with open(OUTPUT_DIR + 'wiki_sentences.pickle', 'wb') as file:
    pickle.dump(wiki_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [03:32<00:00,  2.31s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [03:50<00:00,  2.51s/it]

Scraped sentences :2039





## <font color='dark-orange'>Wordnet Scraping

In [9]:
df = pd.read_csv(ANOLOGY_DATASET, sep=' ', names=['w1','w2','w3','w4'])

df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
df['w2'] = df['w2'].apply(lambda x : x.lower())

df['sentences1'] = df['w1'].progress_apply(lambda x : get_wordnet_sentences(x))
df['sentences2'] = df['w2'].progress_apply(lambda x : get_wordnet_sentences(x))

wordnet_sentences= []
wordnet_sentences += df.sentences1.sum()
wordnet_sentences += df.sentences2.sum()
print("Scraped sentences :" + str(len(wordnet_sentences)))

### Saving the Results
with open(OUTPUT_DIR + 'wordnet_sentences.pickle', 'wb') as file:
    pickle.dump(wordnet_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [00:01<00:00, 54.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [00:00<00:00, 9764.32it/s]

Scraped sentences :283





In [10]:
df = pd.read_csv(VALIDATION_DATASET, sep=' ', names=['w1','w2','w3','w4'])

df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
df['w2'] = df['w2'].apply(lambda x : x.lower())

df['sentences1'] = df['w1'].progress_apply(lambda x : get_wordnet_sentences(x))
df['sentences2'] = df['w2'].progress_apply(lambda x : get_wordnet_sentences(x))

wordnet_sentences= []
wordnet_sentences += df.sentences1.sum()
wordnet_sentences += df.sentences2.sum()
print("Scraped sentences :" + str(len(wordnet_sentences)))

### Saving the Results
with open(OUTPUT_DIR + 'wordnet_sentences2.pickle', 'wb') as file:
    pickle.dump(wordnet_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 326/326 [00:00<00:00, 2331.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 326/326 [00:00<00:00, 20595.00it/s]

Scraped sentences :9930





## <font color='dark-orange'>Large scale sentence fetch from Wikipedia

In [11]:
wiki_sentences = []

df = pd.read_csv(ANOLOGY_DATASET, sep=' ', names=['w1','w2','w3','w4'])
df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
df['w2'] = df['w2'].apply(lambda x : x.lower())

df['sentences1'] = df[['w1','w2']].progress_apply(lambda x : get_wiki_sentences(x['w1'],x['w2']), axis=1)
df['sentences2'] = df[['w1','w2']].progress_apply(lambda x : get_wiki_sentences(x['w2'],x['w1']), axis=1)

wiki_sentences += df.sentences1.sum()
wiki_sentences += df.sentences2.sum()
print("Scraped sentences :" + str(len(wiki_sentences)))

### Saving Results
with open(OUTPUT_DIR + 'wiki_sentences_large.pickle', 'wb') as file:
    pickle.dump(wiki_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [02:47<00:00,  1.82s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [01:51<00:00,  1.21s/it]

Scraped sentences :57822





## <font color='dark-orange'>Gutenberg Scraping

In [13]:
gutenberg_sentences = []

for fileid in gutenberg.fileids():   
    print(fileid)
    for sent_words in gutenberg.sents(fileid):
        sentence = ' '.join(sent_words)
        gutenberg_sentences.append(sentence)

print("Scraped sentences :" + str(len(gutenberg_sentences)))        

### Saving Results
with open(OUTPUT_DIR + 'gutenberg_sentences_large.pickle', 'wb') as file:
    pickle.dump(gutenberg_sentences, file, protocol=pickle.HIGHEST_PROTOCOL)

austen-emma.txt
austen-persuasion.txt
austen-sense.txt
bible-kjv.txt
blake-poems.txt
bryant-stories.txt
burgess-busterbrown.txt
carroll-alice.txt
chesterton-ball.txt
chesterton-brown.txt
chesterton-thursday.txt
edgeworth-parents.txt
melville-moby_dick.txt
milton-paradise.txt
shakespeare-caesar.txt
shakespeare-hamlet.txt
shakespeare-macbeth.txt
whitman-leaves.txt
Scraped sentences :98552
