# Quotes

In [1]:
from bs4 import BeautifulSoup
import concurrent.futures
import os
import pandas
import requests
from tqdm.notebook import tqdm_notebook

## Constants

In [2]:
TIMEOUT = 30
LANGUAGES_URL = "https://1000mostcommonwords.com/languages/"

## Get Languages

In [3]:
languages_response = requests.get(LANGUAGES_URL, timeout=TIMEOUT)
languages_html = languages_response.text
languages_soup = BeautifulSoup(languages_html, "html.parser")
list_items = languages_soup.find_all("li")

## Get 1000 Most Common Words

In [4]:
def chunks(list, n):
    # looping till length l
    for i in range(0, len(list), n):
        yield list[i:i + n]

In [5]:
def get_words(link : str, column : str):
    try:
        response = requests.get(link, timeout=TIMEOUT)
        html = response.text
        beautiful_soup = BeautifulSoup(html, "html.parser")
        tables = beautiful_soup.find_all("table")
        words = []
        # Process Tablee
        for index, table in enumerate(tables):
            rows = table.find("tbody").find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                cells_list = list(cells)
                number_text = cells_list[0].getText().strip().lower()
                language_text = cells_list[1].getText().strip().lower()
                english_text = cells_list[2].getText().strip().lower()
                if "number" != number_text:
                    word = {}
                    word["english"] = english_text
                    word[column] = language_text
                    words.append(word)
        sorted_words = sorted(words, key=lambda h: h['english'])
        return sorted_words
    except:
        return []

In [6]:
languages = []

chunks_list_items = chunks(list_items, 10)

for chunk_list_items in chunks_list_items:
    for list_item in tqdm_notebook(chunk_list_items):
        anchor = list_item.find("a", href=True)
        language = anchor.text.lower()
        file_name = "-".join(language.split(" "))
        column = "_".join(language.split(" "))
        link = anchor.get("href", "")
        if "1000-most-common" in link:
            words = get_words(link, column)
            languages.append(language)
            words_data_frame = pandas.DataFrame(words)
            words_data_frame = words_data_frame.drop_duplicates()
            words_data_frame = words_data_frame.sort_values(by = ["english"])
            words_data_frame.to_csv(f"./languages/{language}.csv", index = False, header = True)

languages.sort()

with open('./languages.txt', 'w') as file_open:
    file_open.write('\n'.join(languages))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

KeyError: 'english'

## Merge Languages

In [None]:
french_data_frame = pandas.read_csv("./languages/french.csv")
korean_data_frame = pandas.read_csv("./languages/korean.csv")
spanish_data_frame = pandas.read_csv("./languages/spanish.csv")
vietnamese_data_frame = pandas.read_csv("./languages/vietnamese.csv")

merged_data_frame = pandas.merge(french_data_frame, korean_data_frame, on="english", how="inner")
merged_data_frame = pandas.merge(merged_data_frame, spanish_data_frame, on="english", how="inner")
merged_data_frame = pandas.merge(merged_data_frame, vietnamese_data_frame, on="english", how="inner")

merged_data_frame.to_csv("./words.csv", index=False)

In [None]:
english_series = merged_data_frame["english"].value_counts()
english_series

english
a         1
region    1
rain      1
raise     1
ran       1
         ..
glad      1
glass     1
go        1
gold      1
your      1
Name: count, Length: 999, dtype: int64