In [13]:
# Main import block
# Standard library
import asyncio
import datetime
import re
import signal
import time
import warnings
from contextlib import contextmanager
# NOTE: this rebinds the name `datetime` to the class and must stay after `import datetime`
from datetime import date, datetime
from io import StringIO

# Third-party
import cfscrape 
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm

# Upload to google sheets
import gspread
#import df2gspread as d2g
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

# Upload to yandex
import yadisk

warnings.filterwarnings('ignore')


In [14]:
##### Helper functions for auxiliary tasks (timeouts, date conversion, uploads, proxy sessions) #####


# Timeout class for reattempting connection
class TimeoutException(Exception):
    """Signals that an operation did not finish within its allotted time."""


@contextmanager
def time_limit(seconds):
    """Context manager that aborts the wrapped block after ``seconds`` seconds.

    A SIGALRM handler raising ``TimeoutException("Timed out!")`` is installed
    and an alarm is armed for ``seconds``. On exit (normal or exceptional) the
    alarm is cancelled and the previously installed SIGALRM handler is put
    back, so the timeout handler does not leak into unrelated code.

    Relies on ``signal.SIGALRM`` / ``signal.alarm``, which are only available
    on Unix and can only be configured from the main thread.

    Args:
        seconds: Whole number of seconds passed to ``signal.alarm``.

    Raises:
        TimeoutException: Inside the wrapped block when the alarm fires.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal returns the handler that was active before this call.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        # Cancel the pending alarm first so it cannot fire while restoring.
        signal.alarm(0)
        # None means the old handler was not installed from Python and
        # cannot be re-installed through signal.signal.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


# Function to convert date from json
def date_format(date_raw):
    """Turn a millisecond epoch timestamp into a ``YYYY-MM-DD HH:MM:SS`` string.

    The conversion goes through ``datetime.fromtimestamp``, so the result is
    expressed in the machine's local time zone.
    """
    moment = datetime.fromtimestamp(date_raw / 1000)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


def date_format_reverse():
    """Return today's local midnight as an integer millisecond epoch timestamp."""
    # date.today() carries no time part, so the moment is 00:00:00 local time.
    midnight = datetime.combine(date.today(), datetime.min.time())
    return int(midnight.timestamp() * 1000)


# Function for uploading dataframes into the google docs
def google_upload(df, sheet_name):
    """Upload ``df`` to the worksheet ``sheet_name`` of the project's Google spreadsheet.

    Authenticates with the service-account key stored under ``./misc`` and
    pushes the frame through ``df2gspread``. The function then sleeps for
    three seconds before returning (presumably to stay under API rate
    limits - TODO confirm).

    Args:
        df: DataFrame to upload.
        sheet_name: Name of the target worksheet.
    """
    # Id of the target spreadsheet document
    spreadsheet_key = "1WhLiXRcdlkG7NCvHac9unC8ROt4lcbY7GxrOEdezZ9s"

    # Scopes requested for the service account
    api_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/drive",
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "./misc/macro-parser-lme-c2f2972b48fc.json", api_scopes
    )
    gspread.authorize(credentials)

    # Rebuild the frame from its raw values so it gets a fresh default index
    plain_frame = pd.DataFrame(df.to_numpy(), columns=df.columns)
    d2g.upload(plain_frame, spreadsheet_key, sheet_name, credentials=credentials)
    time.sleep(3)


# yandex upload
def yandex_upload():
    """Upload ``./data/LME_db.xlsx`` to ``/macro_db/LME_db.xlsx`` on Yandex Disk.

    The OAuth token is read from the first cell of ``./misc/token.csv``.
    If the remote file already exists it is removed and the upload is
    repeated. Any other failure (network, authorization, interruption)
    propagates to the caller instead of being mistaken for "file exists".
    """
    local_path = "./data/LME_db.xlsx"
    remote_path = "/macro_db/LME_db.xlsx"

    token = pd.read_csv("./misc/token.csv", header=None).iloc[0, 0]
    y = yadisk.YaDisk(token=token)

    try:
        y.upload(local_path, remote_path)
    except yadisk.exceptions.PathExistsError:
        # Destination already exists: replace it with the fresh copy.
        y.remove(remote_path)
        y.upload(local_path, remote_path)


# Session creation via proxy
def get_session(url):
    """Fetch ``url`` through a randomly chosen free HTTPS proxy.

    A proxy list is scraped from free-proxy-list.net. Random proxies are then
    tried, each attempt limited to 7 seconds, until one request succeeds.
    The number of attempts is ``len(proxies) + 1``.

    Args:
        url: Address to request.

    Returns:
        The ``requests`` response of the first successful attempt.

    Raises:
        ConnectionError: When no HTTPS proxy could be scraped or every
            attempt failed.
    """

    def get_free_proxies():
        """Scrape free-proxy-list.net and return HTTPS proxies as a Series of 'ip:port'."""
        listing_url = "https://free-proxy-list.net/"
        soup = BeautifulSoup(requests.get(listing_url).content, "html.parser")

        proxies = {}
        # The first row of the main table is the header
        for row in soup.find("table").find_all("tr")[1:]:
            cells = row.find_all("td")
            # Rows without the full set of columns are skipped
            if len(cells) > 6 and cells[6].text == "yes":  # taking only https
                proxies[cells[3].text] = f"{cells[0].text}:{cells[1].text}"

        return pd.Series(proxies, dtype=object)

    session = requests.Session()
    proxy = get_free_proxies()

    if proxy.empty:
        # Series.sample() on an empty Series would fail with an unrelated error
        raise ConnectionError("No HTTPS proxies available from free-proxy-list.net")

    max_attempts = len(proxy) + 1

    for attempt in range(1, max_attempts + 1):
        try:
            with time_limit(7):
                random_proxy = proxy.sample().values[0]
                session.proxies = {"https": random_proxy}
                return session.get(url)

        except OSError:
            # requests' network errors derive from OSError: try another proxy
            continue

        except TimeoutException:
            print(f"Attempt {attempt + 1}")

    raise ConnectionError(
        f"Could not fetch {url} through any proxy after {max_attempts} attempts"
    )

### This is the main function block ###

In [15]:
################################################################
##########################   LME ###############################
################################################################
# Asynchronous function
async def lme_selenium_async():
    """Scrape LME non-ferrous metal prices with Selenium and append them to the Excel DB.

    Opens the LME non-ferrous page in Chrome, parses the first
    ``metal-block-row__blocks`` element into one row of prices, stamps it with
    the page's "refreshed on" date minus one day, appends the row to
    ``./data/LME_db_new.xlsx`` and rewrites that file without duplicates.

    Any exception raised while scraping or saving is printed, not raised.
    The browser is always shut down.

    NOTE: the coroutine contains no ``await``; all waits are blocking
    ``time.sleep`` calls.

    Returns:
        The updated DataFrame that was written to disk, or ``None`` when a
        step failed.
    """
    url = "https://www.lme.com/Metals/Non-ferrous#tabIndex=1"

    service = Service()
    driver = webdriver.Chrome(service=service)

    # Stays None when scraping fails, so the final return never hits an unbound name
    lme_db = None

    try:
        driver.maximize_window()
        driver.get(url=url)

        time.sleep(2)

        # Scroll down two screens (presumably to trigger lazy loading - TODO confirm)
        for _ in range(2):
            driver.execute_script(
                "window.scrollTo(0, window.scrollY + window.innerHeight);"
            )

        time.sleep(2)

        html_code = driver.page_source

        soup = BeautifulSoup(html_code, "html.parser")
        # Pull the data out of the table block
        data_raw = soup.find_all("div", class_="metal-block-row__blocks")

        # Convert to text
        metalls_raw = data_raw[0].text

        # Strip everything unnecessary
        metalls_raw = metalls_raw.replace(" ", "").replace("LME", "LME_").split("\n")
        metalls_raw = " ".join(metalls_raw).strip().split(" ")

        # Regroup the flat token list into chunks of four, one chunk per metal
        metalls_raw = [metalls_raw[i : i + 4] for i in range(0, len(metalls_raw), 4)]

        # Keep only the name and the price
        metalls_raw = [i[:2] for i in metalls_raw]

        # Convert prices to numbers
        for metall in metalls_raw:
            metall[1] = pd.to_numeric(metall[1])

        # Build a single-row dataframe with one column per metal
        metall_df = pd.DataFrame([{item[0]: item[1] for item in metalls_raw}])

        # Drop the columns that are not needed
        metall_df.drop(columns=["LME_AluminiumAlloy", "LME_NASAAC"], inplace=True)
        metall_df = metall_df.rename(
            columns={
                "LME_Aluminium": "aluminium",
                "LME_Copper": "copper",
                "LME_Lead": "lead",
                "LME_Nickel": "nickel",
                "LME_Zinc": "zink",
                "LME_Tin": "tin",
            }
        )

        # Extract the date and insert it
        date_raw = soup.find_all("span", class_="metal-block-container__refreshed-on")
        date_raw = date_raw[0]
        date_raw = str(date_raw).split(">")[1]
        date_raw = date_raw.replace("\xa0", "").replace("</span", "")
        # One day is subtracted from the "refreshed on" date
        # (presumably the prices refer to the previous day - TODO confirm)
        date_raw = pd.to_datetime(date_raw) - pd.Timedelta(days=1)

        metall_df["date"] = date_raw

        # Reorder the columns
        new_column_order = [
            "date",
            "aluminium",
            "copper",
            "lead",
            "nickel",
            "zink",
            "tin",
        ]
        metall_df = metall_df[new_column_order]

        # Append the new row to the stored results
        stored_db = pd.read_excel("./data/LME_db_new.xlsx", index_col=0)
        updated_db = pd.concat([stored_db, metall_df]).reset_index(drop=True)

        # Drop duplicates, comparing every column except the first one (date)
        updated_db = updated_db.drop_duplicates(subset=updated_db.columns.to_list()[1:])

        # Save
        with pd.ExcelWriter(
            "./data/LME_db_new.xlsx",
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD",
        ) as writer:
            updated_db.to_excel(writer, sheet_name="LME_main")
        
        print("LME_main is done!!!")

        # Published only after every step above succeeded
        lme_db = updated_db

    except Exception as error:
        print(error)

    finally:
        # quit() closes every window and ends the driver session
        driver.quit()

    return lme_db



In [16]:
################################################################
#########################   KITCO (main) ########################
################################################################

async def kitko_parser_async():
    """Scrape Kitco's London Fix page and merge the prices into ``./data/kitko_db.xlsx``.

    The page is rendered in Chrome via Selenium. The second ``div.border``
    element supplies the current day's row and the third one the historical
    rows. Both are parsed into (Date, Gold, Silver, Platinum, Palladium)
    records, concatenated with the stored workbook, de-duplicated by ``Date``
    and written back.

    NOTE: the coroutine contains no ``await``; all waits are blocking
    ``time.sleep`` calls.

    Returns:
        The updated DataFrame, or ``None`` when the page could not be loaded
        (the error is printed).
    """
    columns = ["Date", "Gold", "Silver", "Platinum", "Palladium"]

    def kitco_raw():
        """Return the rendered HTML of the London Fix page, or None on failure."""
        url = "https://www.kitco.com/price/fixes/london-fix"
        # Created inside the try block, so cleanup must cope with a failed start
        driver = None

        try:
            driver = webdriver.Chrome(service=Service())
            driver.get(url=url)

            time.sleep(2)

            # Scroll down two screens (presumably to trigger lazy loading - TODO confirm)
            for _ in range(2):
                driver.execute_script(
                    "window.scrollTo(0, window.scrollY + window.innerHeight);"
                )

            time.sleep(2)

            return driver.page_source

        except Exception as error:
            print(error)
            return None

        finally:
            if driver is not None:
                # quit() closes every window and ends the driver session
                driver.quit()

    def cell_texts(block, skip):
        """Return the inner text of the divs in the block's last child, minus the first ``skip``."""
        cells = block
        # Every child is scanned; only the divs of the last one are kept
        for child in block:
            cells = list(child.find_all("div"))
        return [
            str(cell).replace("<div>", "").replace("</div>", "")
            for cell in cells[skip:]
        ]

    def slash_value(cell):
        """Return, as a number, the value after the '/' in an 'x / y' cell."""
        return pd.to_numeric(cell.split("/")[1].strip().replace(",", ""))

    def parse_row(cells):
        """Convert one [date, gold, silver, platinum, palladium] row in place."""
        cells[0] = pd.to_datetime(cells[0])
        cells[1] = slash_value(cells[1])
        cells[2] = pd.to_numeric(cells[2])
        cells[3] = slash_value(cells[3])
        cells[4] = slash_value(cells[4])
        return cells

    # Work on the fetched data
    html_code = kitco_raw()
    if html_code is None:
        # Loading failed and the error has already been printed
        return None

    # NOTE(review): no parser is named here, so bs4 picks the best installed one
    soup = BeautifulSoup(html_code)
    blocks = soup.find_all("div", class_="border")

    # Extract and preprocess the current day's data
    day = parse_row(cell_texts(blocks[1], skip=4))

    df_row = (
        pd.DataFrame([day], columns=columns)
        .sort_values("Date")
        .reset_index(drop=True)
    )

    # Extract and preprocess the historical data
    historical = [
        item for item in cell_texts(blocks[2], skip=5) if "<div" not in item
    ]

    # Split into days (five cells per day)
    historical = [historical[i : i + 5] for i in range(0, len(historical), 5)]

    # Convert the values inside every day
    for row in historical:
        parse_row(row)

    df_historical = (
        pd.DataFrame(historical, columns=columns)
        .sort_values("Date")
        .reset_index(drop=True)
    )

    df_historical = pd.concat([df_historical, df_row]).reset_index(drop=True)
    kitko_db = pd.read_excel("./data/kitko_db.xlsx", index_col=0)

    kitko_db = (
        pd.concat([kitko_db, df_historical])
        .reset_index(drop=True)
        .drop_duplicates(subset=["Date"])
    )

    with pd.ExcelWriter(
        "./data/kitko_db.xlsx",
        date_format="YYYY-MM-DD",
        datetime_format="YYYY-MM-DD",
    ) as writer:
        kitko_db.to_excel(writer, sheet_name="kitco_metall")
    
    print("KITKO_main is done!!!") 
    
    return kitko_db


In [17]:
################################################################
#########################   LBMA (KITCO subs) ##################
################################################################

async def lbma_prescious_async():
    """Download recent LBMA precious-metal prices and merge them into the local workbook.

    For gold, silver, platinum and palladium the JSON feed is fetched with
    ``cfscrape`` and the last four records are kept. The four series are
    outer-merged on ``Date`` (gaps filled with 0), appended to
    ``./data/lbme_kitco_subs.xlsx``, de-duplicated, sorted by date and
    written back.

    NOTE: the coroutine contains no ``await``; the downloads are blocking.

    Returns:
        The updated DataFrame that was written to disk.
    """
    # Column name -> JSON feed; the order defines the merge and column order
    sources = {
        "Gold": "https://prices.lbma.org.uk/json/gold_pm.json",
        "Silver": "https://prices.lbma.org.uk/json/silver.json",
        "Platinum": "https://prices.lbma.org.uk/json/platinum_pm.json",
        "Palladium": "https://prices.lbma.org.uk/json/palladium_pm.json",
    }

    def get_raw_data(url, metall="Default"):
        """Return the last four (Date, <metall>) records of one JSON feed."""
        scraper = cfscrape.create_scraper()
        scraped_data = scraper.get(url)

        # Wrapped in StringIO because passing literal JSON text to read_json is deprecated
        raw_data = pd.read_json(StringIO(scraped_data.text))

        # Explicit copy so the column assignments below never target a slice
        data = raw_data[["d", "v"]].copy()
        # "v" holds a list per record; only its first element is kept
        data["v"] = data["v"].apply(lambda x: x[0])
        data["d"] = pd.to_datetime(data["d"])

        data = data.rename(columns={"d": "Date", "v": metall})
        return data.tail(4)

    result_df = None
    for metall, url in sources.items():
        frame = get_raw_data(url, metall=metall)
        if result_df is None:
            result_df = frame
        else:
            result_df = result_df.merge(frame, on="Date", how="outer")

    result_df = (
        result_df.fillna(value=0)
        .sort_values(by="Date")
        .reset_index(drop=True)
    )

    historical = pd.read_excel("./data/lbme_kitco_subs.xlsx", index_col=0)

    result = (
        pd.concat([historical, result_df], axis=0)
        .reset_index(drop=True)
        .drop_duplicates()
        .sort_values(by="Date")
    )

    result.to_excel("./data/lbme_kitco_subs.xlsx", sheet_name="lbme_metall")
    
    print("LBMA is done!!!") 
    
    return result


In [18]:
################################################################
####################   CB_currencies ###########################
################################################################

def cb_currencies():
    """Refresh the Bank of Russia exchange-rate workbooks and their Google sheets.

    For every currency in the table below, the rate history from 01.01.2022
    to the end of the current year is downloaded from cbr.ru. It is merged
    with ``./data/centrobank/<name>.xlsx``, de-duplicated, uploaded to the
    Google sheet of the same name and written back to the workbook.

    A currency whose download keeps failing after 20 attempts is reported and
    skipped; the remaining currencies are still processed.
    """
    current_year = datetime.now().year
    max_tries = 20

    # Workbook / sheet name -> cbr.ru currency code
    dict_of_currencies = {
        "USD": "R01235",
        "EUR": "R01239",
        "Australian_Dollar": "R01010",
        "China_Yuan": "R01375",
        "British_Pound": "R01035",
        "Kazakhstan_Tenge": "R01335",
        "Japanese_Yen": "R01820",
        "Swiss_Franc": "R01775",
    }

    def get_data(currency):
        """Return the (Date, Nominal, Value) history for one currency code, or None on failure."""
        url = (
            "https://www.cbr.ru/currency_base/dynamics/?UniDbQuery."
            "Posted=True&UniDbQuery.so=1&UniDbQuery.mode=1&UniDbQuery.date_req1=&UniDbQuery"
            f".date_req2=&UniDbQuery.VAL_NM_RQ={currency}&UniDbQuery.From=01.01.2022&UniDbQuery"
            f".To=31.12.{current_year}"
        )

        last_error = None

        for _ in range(max_tries):
            try:
                scraper = cfscrape.create_scraper()
                scraped_data = scraper.get(url)

                # Decimal commas become dots so the numbers parse as floats
                preprocesed_data = scraped_data.text.replace(",", ".")

                # Wrapped in StringIO because passing literal HTML to read_html is deprecated
                df = pd.read_html(StringIO(preprocesed_data), header=1)[0]

                rename_list = ["Date", "Nominal", "Value"]
                df = df.rename(columns=dict(zip(df.columns.to_list(), rename_list)))
                # Rates are quoted per "Nominal" units; normalize to one unit
                df["Value"] = df["Value"] / df["Nominal"]
                df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)

                return df.sort_values(by="Date").reset_index(drop=True)

            except Exception as error:
                last_error = error
                time.sleep(1)

        print(f"Failed to load {currency} after {max_tries} attempts: {last_error}")
        return None

    for key, value in dict_of_currencies.items():
        table = get_data(value)

        if table is None:
            # Nothing new was downloaded: leave the stored data untouched
            print(f"{key}: skipped, no fresh data")
            continue

        db_frame = pd.read_excel(f"./data/centrobank/{key}.xlsx", index_col=0)

        new_frame = pd.concat([table, db_frame]).drop_duplicates()

        print(key, new_frame.tail())

        google_upload(new_frame, f"{key}")

        with pd.ExcelWriter(
            f"./data/centrobank/{key}.xlsx",
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD",
        ) as writer:
            new_frame.to_excel(writer, sheet_name=f"{key}")

In [19]:
################################################################
####################   CB_metalls ##############################
################################################################


def cb_metalls():
    """Refresh the Bank of Russia precious-metal price workbook and its Google sheet.

    The price table for gold, silver, platinum and palladium from 01.01.2022
    to 30.12 of the current year is downloaded from cbr.ru (up to 20
    attempts). It is merged with ``./data/centrobank/metalls.xlsx``,
    de-duplicated, uploaded to the ``CB_Metalls_consolidate`` sheet and
    written back to the workbook.

    Only the download and parsing are retried. When every attempt fails the
    last error is printed and nothing is uploaded or saved. Errors raised
    while uploading or saving propagate to the caller; they are no longer
    retried and silently dropped.
    """
    current_year = datetime.now().year
    max_tries = 20

    # NOTE(review): the range ends on 30.12 while cb_currencies uses 31.12 - confirm intent
    url = (
        "https://www.cbr.ru/hd_base/metall/metall_base_new/?UniDbQuery.Posted"
        f"=True&UniDbQuery.From=01.01.2022&UniDbQuery.To=30.12.{current_year}&UniDbQuery.Gold"
        "=true&UniDbQuery.Silver=true&UniDbQuery.Platinum=true&UniDbQuery.Palladium"
        "=true&UniDbQuery.so=1"
    )

    df = None
    last_error = None

    for _ in range(max_tries):
        try:
            scraper = cfscrape.create_scraper()
            scraped_data = scraper.get(url)

            # Decimal commas become dots
            preprocesed_data = scraped_data.text.replace(",", ".")

            # Wrapped in StringIO because passing literal HTML to read_html is deprecated
            parsed = pd.read_html(StringIO(preprocesed_data), header=0)[0]

            rename_list = ["Date", "Gold", "Silver", "Platinum", "Palladium"]
            parsed = parsed.rename(
                columns=dict(zip(parsed.columns.to_list(), rename_list))
            )

            # Remove spaces from text columns; non-text columns have no .str accessor
            for column in parsed.columns.to_list():
                try:
                    parsed[column] = parsed[column].str.replace(" ", "")
                except AttributeError:
                    pass

            parsed["Date"] = pd.to_datetime(parsed["Date"], dayfirst=True)
            df = parsed.sort_values(by="Date").reset_index(drop=True)
            break

        except Exception as error:
            last_error = error
            time.sleep(1)

    if df is None:
        print(f"Failed to load CB metalls after {max_tries} attempts: {last_error}")
        return

    db_frame = pd.read_excel("./data/centrobank/metalls.xlsx", index_col=0)

    result_df = pd.concat([df, db_frame]).drop_duplicates()

    google_upload(result_df, f"CB_Metalls_consolidate")

    with pd.ExcelWriter(
        f"./data/centrobank/metalls.xlsx",
        date_format="YYYY-MM-DD",
        datetime_format="YYYY-MM-DD",
    ) as writer:
        result_df.to_excel(writer, sheet_name=f"cb_metalls")

    print(result_df.tail())

In [20]:
################################################################
#########################   NBK ################################
################################################################

async def nbk_tenge_async():
    """Download the National Bank of Kazakhstan rate report and save it to Excel.

    The report from 01.01.2022 to the end of the current year is requested
    (up to 7 attempts, 15 seconds each). The first HTML table is parsed, its
    ``Unnamed: 0`` column is converted to datetimes and renamed ``date``, and
    the result is written to ``../parser_beta/data/nbk_tenge.xlsx``.

    NOTE: the coroutine contains no ``await``; the download is blocking.

    Returns:
        The parsed DataFrame, or ``None`` when the page could not be fetched
        or parsing raised ``ValueError`` (a message is printed in both cases).
    """
    # Really unreliable source; it may be better to use MS Query inside the file
    max_attempts = 7

    try:
        year = date.today().year

        begin_date = "01.01.2022"
        end_date = f"31.12.{year}"

        # Built from adjacent literals so no indentation whitespace ends up in the URL
        url = (
            "https://nationalbank.kz/ru/exchangerates/ezhednevnye-oficialnye-rynochnye-kursy-valyut"
            f"/report?rates%5B%5D=5&beginDate={begin_date}&endDate={end_date}"
        )

        page = None

        for _ in range(max_attempts):
            try:
                with time_limit(15):
                    page = requests.get(url=url)
                    break

            except (TimeoutException, requests.exceptions.RequestException):
                # Timed out or the request failed: try again
                continue

        if page is None:
            print(f"NBK_tenge: no response after {max_attempts} attempts")
            return None

        # Wrapped in StringIO because passing literal HTML to read_html is deprecated
        df = pd.read_html(StringIO(page.text))[0]
        # NOTE(review): parsed without dayfirst=True - confirm the source is not day-first
        df['Unnamed: 0'] = pd.to_datetime(df['Unnamed: 0'])
        df = df.rename(columns={'Unnamed: 0': 'date'})

        with pd.ExcelWriter(
                '../parser_beta/data/nbk_tenge.xlsx') as writer:
            df.to_excel(writer, sheet_name='tenge')

        print('NBK_tenge parsing is DONE!')

        return df
    
    except ValueError:
        print("Probably tech problem, check the source")

In [21]:
################################################################
##########################  SHMET ##############################
################################################################

async def shmet_optimized_async():
    """Fetch today's SHMET base-metal quotes and store the 'cu' rows in the history workbook.

    Rows whose ``name`` contains "cu" (case-insensitive) are stamped with
    today's date and prepended to the stored history. The history is saved,
    read back, de-duplicated once more and saved again.

    NOTE(review): the history is read from ``./data`` but written to
    ``../parser_beta/data``; these are the same file only when the working
    directory is ``parser_beta`` - confirm.

    Returns:
        The de-duplicated DataFrame read back from the workbook.
    """
    api_url = "https://en.shmet.com/api/rest/enweb/spot/getSpotPrice?code=baseMetal&size=10&currentLength=0"
    history_path = "./data/shmet_historical.xlsx"
    output_path = "../parser_beta/data/shmet_historical.xlsx"
    writer_options = {"date_format": "YYYY-MM-DD", "datetime_format": "YYYY-MM-DD"}

    payload = requests.get(api_url).json()
    quotes = pd.DataFrame(payload["data"]).assign(date=date.today())

    # Keep only the rows whose name mentions "cu"
    copper_mask = quotes["name"].str.contains("cu", case=False)
    result = (
        quotes.loc[copper_mask, ["date", "middle", "unit"]]
        .assign(date=lambda frame: pd.to_datetime(frame["date"]))
        .rename(columns={"middle": "price"})
    )

    history = pd.read_excel(history_path, index_col=0)
    merged = (
        pd.concat([result, history], axis=0)
        .reset_index(drop=True)
        .drop_duplicates()
    )

    with pd.ExcelWriter(output_path, **writer_options) as writer:
        merged.to_excel(writer, sheet_name="SHMET")

    # Second pass: re-read the workbook and drop duplicates again
    final = pd.read_excel(history_path, index_col=0).drop_duplicates()

    with pd.ExcelWriter(output_path, **writer_options) as writer:
        final.to_excel(writer, sheet_name="SHMET")

    print("SHMET is done!!!") 
    return final

In [22]:
################################################################
################## NEW_WESTMETALL###############################
################################################################

async def new_westmetall_async():
    """Download LME cash prices from westmetall.com and merge them into the Excel DB.

    For each of six metals the first 30 rows of the price table are fetched
    and reduced to the date column plus the first price column. The six
    series are left-merged on ``date`` starting from aluminium, appended to
    ``./data/LME_westmetall_db.xlsx``, de-duplicated by date (stored rows
    win), sorted and written out.

    NOTE(review): the DB is read from ``./data`` but written to
    ``../parser_beta/data``; these are the same file only when the working
    directory is ``parser_beta`` - confirm.

    NOTE: the coroutine contains no ``await``; the downloads are blocking.

    Returns:
        The updated DataFrame that was written to disk.
    """
    # westmetall field code -> output column name; aluminium is the merge base
    metals = {
        'Al': 'aluminium',
        'Cu': 'copper',
        'Pb': 'lead',
        'Ni': 'nickel',
        'Zn': 'zink',
        'Sn': 'tin',
    }

    def get_data(metall, col_name):
        """Return a (date, <col_name>) frame for one westmetall field code."""
        url = f'https://www.westmetall.com/en/markdaten.php?action=table&field=LME_{metall}_cash'
        response = requests.get(url=url)

        # Wrapped in StringIO because passing literal HTML to read_html is deprecated
        table = pd.read_html(StringIO(response.text))[0][:30]

        # Keep the first two columns, drop rows where date equals the literal 'date',
        # and copy so the assignments below never target a slice
        data = table.iloc[:, :2].query("date != 'date'").copy()

        price_col = data.columns[1]
        data['date'] = pd.to_datetime(data['date'])
        # Assigned by name so the column takes a numeric dtype
        data[price_col] = pd.to_numeric(data[price_col])

        return data.rename(columns={price_col: col_name})

    result = None
    for code, col_name in metals.items():
        frame = get_data(metall=code, col_name=col_name)
        if result is None:
            result = frame
        else:
            result = result.merge(frame, on='date', how='left')

    old_data = pd.read_excel('./data/LME_westmetall_db.xlsx', index_col=0)

    final_data = (
        pd.concat([old_data, result], axis=0)
        .drop_duplicates(subset='date')
        .sort_values(by='date')
        .reset_index(drop=True)
    )

    with pd.ExcelWriter(
        "../parser_beta/data/LME_westmetall_db.xlsx",
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD") as writer:
        final_data.to_excel(writer, sheet_name='LME_westmetall')
    
    print("WESTMETALL is done!!!") 
    return final_data

In [23]:
if __name__ == "__main__":
    await new_westmetall_async()
    await lme_selenium_async()
    await nbk_tenge_async()
    await shmet_optimized_async()
    await kitko_parser_async()
    

WESTMETALL is done!!!
LME_main is done!!!
NBK_tenge parsing is DONE!
SHMET is done!!!
KITKO_main is done!!!
