In [19]:
# Main import block
import datetime
import re
import signal
import time
import warnings
from contextlib import contextmanager
from datetime import date, datetime  # must follow `import datetime`: `datetime` is the class from here on
from io import StringIO

import cfscrape
import gspread
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Upload to google sheets
#import df2gspread as d2g
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm

warnings.filterwarnings('ignore')


In [20]:
##### Helper functions: timeouts, date conversion, Google Sheets upload, proxy sessions #####

# Timeout class for reattempting connection
class TimeoutException(Exception):
    """Signals that a block guarded by `time_limit` ran past its deadline."""


@contextmanager
def time_limit(seconds):
    """Raise `TimeoutException` if the enclosed block runs longer than `seconds`.

    The limit is enforced with SIGALRM, so `seconds` is a whole number of
    seconds (it is passed to `signal.alarm`). On exit the pending alarm is
    cancelled and the SIGALRM handler that was active before entering the
    block is put back, so the timeout handler does not leak into later code.

    Args:
        seconds: time budget for the `with` body.

    Raises:
        TimeoutException: from inside the `with` body once the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # `signal.signal` reports None for handlers not installed from Python;
        # such a handler cannot be re-installed, so it is left alone.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


# Function to convert date from json
def date_format(date_raw):
    """Turn a millisecond Unix timestamp into a 'YYYY-MM-DD HH:MM:SS' string.

    The timestamp is interpreted in the local time zone of the machine
    (`datetime.fromtimestamp` without a tz argument).
    """
    seconds = date_raw / 1000
    return datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")


def date_format_reverse():
    """Return today's local midnight as an integer millisecond Unix timestamp."""
    today = date.today()
    # A naive datetime at 00:00:00 of the current day, same as formatting the
    # date with a time part and parsing it back.
    midnight = datetime(today.year, today.month, today.day)
    return int(midnight.timestamp() * 1000)


# Function for uploading dataframes into the google docs
def google_upload(df, sheet_name,
                  spreadsheet_key='1WhLiXRcdlkG7NCvHac9unC8ROt4lcbY7GxrOEdezZ9s',
                  keyfile='macro-parser-lme-c2f2972b48fc.json',
                  pause=3):
    """Upload a dataframe to one worksheet of a Google spreadsheet.

    Args:
        df: dataframe to upload; its index is not sent (the frame is rebuilt
            from its values with a default index before uploading).
        sheet_name: name of the target worksheet.
        spreadsheet_key: id of the target Google document. Defaults to the
            document this notebook has always written to.
        keyfile: path to the service-account JSON key. Defaults to the key
            file name used so far.
        pause: seconds to sleep after the upload (default 3, as before).
    """
    # Params used to connect to google api
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        keyfile, scope)  # security token
    # NOTE(review): the authorised client is not used afterwards; the call is
    # kept because the original performed it before every upload.
    gspread.authorize(credentials)

    # Rebuild the frame from raw values so only the data columns are uploaded.
    df_new = pd.DataFrame(df.to_numpy(), columns=df.columns)
    d2g.upload(df_new, spreadsheet_key, sheet_name, credentials=credentials)
    time.sleep(pause)


# Session creation via proxy
def get_session(url):
    """Fetch `url` through a randomly chosen free HTTPS proxy.

    A proxy list is scraped from free-proxy-list.net; proxies are then picked
    at random and tried one after another, each attempt being limited to
    7 seconds, until one of them answers.

    Args:
        url: address to request.

    Returns:
        The `requests.Response` of the first attempt that succeeded.

    Raises:
        ConnectionError: if no HTTPS proxy was found or every attempt failed
            (previously this surfaced as an unrelated UnboundLocalError /
            ValueError).
    """

    # Free proxy function
    def get_free_proxies():
        """Return a Series of 'ip:port' strings for HTTPS-capable proxies."""
        listing_url = "https://free-proxy-list.net/"
        soup = BeautifulSoup(requests.get(listing_url).content, "html.parser")

        proxies = dict()
        for table_row in soup.find('table').find_all('tr')[1:]:  # skip header
            cells = table_row.find_all('td')
            try:
                if cells[6].text == 'yes':  # taking only https
                    # Keyed by the text of the 4th cell, so a later row with
                    # the same key replaces an earlier one (as before).
                    # NOTE(review): presumably the country column - confirm.
                    proxies[cells[3].text] = f'{cells[0].text}:{cells[1].text}'
            except IndexError:
                continue  # malformed / short row

        return pd.Series(proxies)

    # create session
    session = requests.Session()

    # random proxy
    proxy = get_free_proxies()
    if proxy.empty:
        raise ConnectionError(
            'No HTTPS proxies available from free-proxy-list.net')

    # Same attempt budget as the original `counter <= len(proxy)` loop.
    max_attempts = len(proxy) + 1
    for attempt in range(1, max_attempts + 1):
        try:
            with time_limit(7):
                random_proxy = proxy.sample().values[0]
                session.proxies = {"https": random_proxy}
                return session.get(url)

        except OSError:
            # requests' own exceptions derive from OSError: try another proxy.
            continue

        except TimeoutException:
            print(f'Attempt {attempt + 1}')

    raise ConnectionError(
        f'All {max_attempts} proxy attempts failed for {url}')

### The main scraper functions follow ###

In [21]:
################################################################
##########################   LME ###############################
################################################################

def lme_selenium():
    """Scrape today's LME non-ferrous prices and append them to the history.

    For aluminium, copper, zinc, nickel and lead the LME summary page is
    rendered in Chrome and the price is read from the page; the quote date is
    read from the aluminium page. The new row is appended to
    './data/LME_db.xlsx', exact duplicate rows are dropped, the file is
    rewritten and the result is uploaded to the 'LME_non_ferrous' sheet.

    A metal whose page cannot be loaded or parsed is reported on stdout and
    stored as a missing value in the new row (best-effort, as before).

    Returns:
        The updated history dataframe, or None if a TypeError occurred while
        building or saving it (the error is printed).
    """
    def scrape(metall, parse):
        """Render the LME page of `metall` and return `parse(soup)`.

        Any error while loading or parsing is printed together with the metal
        name and None is returned. The browser is always shut down.
        """
        url = f'https://www.lme.com/en/Metals/Non-ferrous/LME-{metall}#Summary'

        service = Service()
        driver = webdriver.Chrome(service=service)

        try:
            driver.maximize_window()
            driver.get(url=url)

            time.sleep(2)

            # Scroll down two screens; the page is re-read after a pause.
            # NOTE(review): presumably this triggers lazy-loaded content.
            for _ in range(2):
                driver.execute_script(
                    "window.scrollTo(0, window.scrollY + window.innerHeight);")

            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            return parse(soup)

        except Exception as error:
            # Name the metal so a failure can be traced to its page.
            print(f'LME-{metall}: {error}')
            return None

        finally:
            # quit() alone closes every window and ends the driver session;
            # a failing close() can no longer prevent it from running.
            driver.quit()

    def parse_price(soup):
        """Price = second cell with class 'data-set-table__cell'."""
        cells = soup.find_all('td', class_='data-set-table__cell')
        return pd.to_numeric(cells[1].text)

    def parse_date(soup):
        """Date = fourth span with class 'data-set-tabs__content-detail'."""
        details = soup.find_all('span', class_='data-set-tabs__content-detail')
        return pd.to_datetime(details[3].text.strip())

    def get_price(metall=''):
        return scrape(metall, parse_price)

    def get_date(metall='Aluminium'):
        return scrape(metall, parse_date)

    try:
        date = get_date('Aluminium')
        al_price = get_price('Aluminium')
        cu_price = get_price('Copper')
        zn_price = get_price('Zinc')
        nk_price = get_price('Nickel')
        ld_price = get_price('Lead')

        data_row = pd.DataFrame({
            'date': date,
            'aluminium': al_price,
            'copper': cu_price,
            'lead': ld_price,
            'nickel': nk_price,
            'zink': zn_price
        }, index=[0])

        historical_data = pd.read_excel(
            './data/LME_db.xlsx', index_col=0)

        historical_data = (
            pd.concat([historical_data, data_row])
            .reset_index(drop=True)
            .drop_duplicates()
        )

        with pd.ExcelWriter('./data/LME_db.xlsx', date_format="YYYY-MM-DD", datetime_format="YYYY-MM-DD") as writer:
            historical_data.to_excel(writer, sheet_name='LME_non_ferrous')

        google_upload(historical_data, 'LME_non_ferrous')

        return historical_data

    except TypeError as e:
        print(e)
    
    

In [22]:
################################################################
#########################   KITCO ##############################
################################################################

def kitco_db():
    """Append the latest KITCO London Fix row to the local history and upload it.

    The first price row of the London Fix page is cut out of the HTML, turned
    into a one-row dataframe (Date, Gold, Silver, Platinum, Palladium),
    appended to './data/kitko_db.xlsx' and de-duplicated. The file is
    rewritten in place and the result is uploaded to the 'KITCO' sheet.

    Returns:
        The updated history dataframe.

    Raises:
        IndexError: if the expected price row is not present on the page.
    """
    url = 'https://www.kitco.com/price/fixes/london-fix'
    # Read and write the same file; the original wrote to
    # '../parser_beta/data/kitko_db.xlsx', which only matched the file it
    # read from when the working directory was named 'parser_beta'.
    db_path = './data/kitko_db.xlsx'

    scraper = cfscrape.create_scraper()
    scraped_data = scraper.get(url)

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(scraped_data.text, 'html.parser')

    temp_row = soup.find_all(
        'div', class_='grid grid-cols-5 border-b border-gray-200 text-center py-4 border-[rgba(0,0,0,0)]')
    if not temp_row:
        raise IndexError(
            'KITCO: London Fix price row not found - page layout may have changed')

    # The values are picked by position from the tag-split markup of the rows.
    raw_row = str(temp_row).split('<')

    row = [raw_row[2].replace('div>', ''), raw_row[6].replace('!-- -->', ''),
        raw_row[8].replace('div>', ''), raw_row[12].replace('!-- -->', ''),
        raw_row[16].replace('!-- -->', '')]

    df_row = pd.DataFrame(row).T.rename(
        columns={0: 'Date', 1: 'Gold', 2: 'Silver',
                 3: 'Platinum', 4: 'Palladium'})

    df_row['Date'] = pd.to_datetime(df_row['Date'])

    for column in df_row.columns.to_list()[1:]:
        # Drop thousands separators before converting to numbers.
        df_row[column] = pd.to_numeric(df_row[column].str.replace(',', ''))

    db_frame = pd.read_excel(db_path, index_col=0, parse_dates=['Date'])

    final_df = (
        pd.concat([db_frame, df_row])
        .reset_index(drop=True)
        .drop_duplicates()
    )

    final_df.to_excel(db_path, sheet_name='kitco_metall')

    google_upload(final_df, 'KITCO')

    return final_df


In [23]:
################################################################
#########################   LBMA (KITCO subs) ##################
################################################################

def lbma_prescious():
    """Update the local LBMA precious-metal history and upload it.

    The last four entries of the LBMA gold (PM), silver, platinum (PM) and
    palladium (PM) JSON feeds are merged by date, missing values are filled
    with 0, and the rows are appended to './data/lbme_kitco_subs.xlsx'.
    Exact duplicate rows are dropped, the file is rewritten and the result is
    uploaded to the 'LBMA' sheet.

    Returns:
        The updated history dataframe, sorted by 'Date'.
    """
    url_gold = 'https://prices.lbma.org.uk/json/gold_pm.json'
    url_silver = 'https://prices.lbma.org.uk/json/silver.json'
    url_platinum = 'https://prices.lbma.org.uk/json/platinum_pm.json'
    url_paladium = 'https://prices.lbma.org.uk/json/palladium_pm.json'

    def get_raw_data(url, metall='Default'):
        """Return the last 4 rows of one feed as columns 'Date' and `metall`."""
        scraper = cfscrape.create_scraper()
        scraped_data = scraper.get(url)

        # Wrap the payload: passing a literal JSON string is deprecated.
        raw_data = pd.read_json(StringIO(scraped_data.text))

        # Build a new frame instead of assigning into a column slice.
        return (
            raw_data[['d', 'v']]
            .assign(
                v=lambda frame: frame['v'].apply(lambda x: x[0]),  # first value of each 'v' entry
                d=lambda frame: pd.to_datetime(frame['d']),
            )
            .rename(columns={'d': 'Date', 'v': metall})
            .tail(4)
        )

    gold = get_raw_data(url_gold, metall='Gold')
    silver = get_raw_data(url_silver, metall='Silver')
    platinum = get_raw_data(url_platinum, metall='Platinum')
    paladium = get_raw_data(url_paladium, metall='Palladium')

    result_df = gold.merge(silver, on='Date', how='outer').merge(
        platinum, on='Date', how='outer').merge(paladium, on='Date', how='outer')

    result_df = (
        result_df
        .fillna(value=0)
        .sort_values(by='Date')
        .reset_index(drop=True)
    )

    historical = pd.read_excel('./data/lbme_kitco_subs.xlsx', index_col=0)

    # NOTE(review): only fully identical rows are removed, so a date stored
    # earlier with a 0 placeholder and fetched again with a real value ends
    # up twice - confirm whether de-duplicating by 'Date' is wanted.
    result = (
        pd.concat([historical, result_df], axis=0)
        .reset_index(drop=True)
        .drop_duplicates()
        .sort_values(by='Date')
    )

    result.to_excel('./data/lbme_kitco_subs.xlsx', sheet_name='lbme_metall')

    google_upload(result, 'LBMA')

    return result

In [24]:
################################################################
####################   CB_currencies ###########################
################################################################

def cb_currencies():
    """Refresh the Bank of Russia exchange-rate history for every tracked currency.

    For each currency the rate table from 01.01.2022 to the end of the
    current year is downloaded from cbr.ru (up to 20 attempts), combined with
    './data/centrobank/<name>.xlsx', de-duplicated, uploaded to the sheet
    named after the currency and written back to the same file. A currency
    whose download keeps failing is reported and skipped.
    """
    current_year = datetime.now().year

    dict_of_currencies = {
        'USD': 'R01235',
        'EUR': 'R01239',
        'Australian_Dollar': 'R01010',
        'China_Yuan': 'R01375',
        'British_Pound': 'R01035',
        'Kazakhstan_Tenge': 'R01335',
        'Japanese_Yen': 'R01820',
        'Swiss_Franc': 'R01775'
    }
    max_tries = 20

    def get_data(currency):
        """Return the rate table (Date, Nominal, Value per unit) or None."""
        url = (
            'https://www.cbr.ru/currency_base/dynamics/?UniDbQuery.Posted=True'
            '&UniDbQuery.so=1&UniDbQuery.mode=1&UniDbQuery.date_req1='
            f'&UniDbQuery.date_req2=&UniDbQuery.VAL_NM_RQ={currency}'
            f'&UniDbQuery.From=01.01.2022&UniDbQuery.To=31.12.{current_year}'
        )

        for _ in range(max_tries):
            try:
                scraper = cfscrape.create_scraper()
                scraped_data = scraper.get(url)

                # Decimal commas -> dots so the numbers parse as floats.
                preprocesed_data = scraped_data.text.replace(',', '.')

                # Wrap the markup: passing literal HTML is deprecated.
                df = pd.read_html(StringIO(preprocesed_data), header=1)[0]

                rename_list = ['Date', 'Nominal', 'Value']
                df = df.rename(
                    columns=dict(zip(df.columns.to_list(), rename_list)))
                df['Value'] = df['Value'] / df['Nominal']  # rate per single unit
                df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

                return df.sort_values(by='Date').reset_index(drop=True)

            except Exception:
                # Best-effort retry; unlike a bare `except:` this lets
                # KeyboardInterrupt / SystemExit through.
                time.sleep(1)

        return None

    for key, value in dict_of_currencies.items():
        table = get_data(value)
        if table is None:
            print(f'{key}: no data from cbr.ru after {max_tries} attempts, skipped')
            continue

        db_frame = pd.read_excel(f'./data/centrobank/{key}.xlsx', index_col=0)

        new_frame = pd.concat([table, db_frame]).drop_duplicates()

        print(key, new_frame.tail())

        google_upload(new_frame, f'{key}')

        with pd.ExcelWriter(f'./data/centrobank/{key}.xlsx', date_format="YYYY-MM-DD", datetime_format="YYYY-MM-DD") as writer:
            new_frame.to_excel(writer, sheet_name=f'{key}')


In [25]:
################################################################
####################   CB_metalls ##############################
################################################################

def cb_metalls():
    """Refresh the Bank of Russia precious-metal price history.

    The gold/silver/platinum/palladium table from 01.01.2022 to the end of
    the current year is downloaded from cbr.ru (up to 20 attempts), combined
    with './data/centrobank/metalls.xlsx', de-duplicated, uploaded to the
    'CB_Metalls_consolidate' sheet and written back to the same file.

    Only the download and parsing are retried. Errors while reading the
    history, uploading or saving are raised instead of being silently retried.
    """
    current_year = datetime.now().year
    max_tries = 20

    # End date is 31.12 (was 30.12), matching the range used by cb_currencies.
    url = (
        'https://www.cbr.ru/hd_base/metall/metall_base_new/?UniDbQuery.Posted=True'
        f'&UniDbQuery.From=01.01.2022&UniDbQuery.To=31.12.{current_year}'
        '&UniDbQuery.Gold=true&UniDbQuery.Silver=true&UniDbQuery.Platinum=true'
        '&UniDbQuery.Palladium=true&UniDbQuery.so=1'
    )

    for _ in range(max_tries):
        try:
            scraper = cfscrape.create_scraper()
            scraped_data = scraper.get(url)

            # Decimal commas -> dots before parsing the table.
            preprocesed_data = scraped_data.text.replace(',', '.')

            # Wrap the markup: passing literal HTML is deprecated.
            df = pd.read_html(StringIO(preprocesed_data), header=0)[0]

            rename_list = ['Date', 'Gold', 'Silver', 'Platinum', 'Palladium']
            df = df.rename(columns=dict(zip(df.columns.to_list(), rename_list)))

            for column in df.columns.to_list():
                try:
                    # Strip spaces inside text values; non-text columns have
                    # no `.str` accessor and are left as they are.
                    df[column] = df[column].str.replace(' ', '')
                except AttributeError:
                    pass

            df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
            df = df.sort_values(by='Date').reset_index(drop=True)
            break

        except Exception:
            # Best-effort retry of the download; Ctrl-C is no longer swallowed.
            time.sleep(1)
    else:
        print(f'CB_metalls: no data from cbr.ru after {max_tries} attempts, nothing updated')
        return

    db_frame = pd.read_excel('./data/centrobank/metalls.xlsx', index_col=0)

    result_df = pd.concat([df, db_frame]).drop_duplicates()

    google_upload(result_df, f'CB_Metalls_consolidate')

    with pd.ExcelWriter(f'./data/centrobank/metalls.xlsx', date_format="YYYY-MM-DD", datetime_format="YYYY-MM-DD") as writer:
        result_df.to_excel(writer, sheet_name=f'cb_metalls')

    print(result_df.tail())
    

In [26]:
################################################################
#########################   NBK ################################
################################################################

def nbk_tenge():
    """Download the National Bank of Kazakhstan rate report and store it.

    The report from 01.01.2022 to the end of the current year is requested
    (up to 7 attempts of 15 seconds each), its first table is written to
    '../parser_beta/data/nbk_tenge.xlsx' and uploaded to the 'nbk_tenge' sheet.

    Returns:
        The downloaded table with its first column renamed to 'date'.

    Raises:
        TimeoutException: if every attempt timed out (previously this ended
            in an UnboundLocalError on `page`).
    """
    # Really unreliable source; an MS Query inside the workbook may serve better.
    year = date.today().year

    begin_date = '01.01.2022'
    end_date = f'31.12.{year}'

    # Built from adjacent literals: the former backslash continuation put the
    # next line's indentation (8 spaces) into the middle of the URL.
    url = (
        'https://nationalbank.kz/ru/exchangerates/ezhednevnye-oficialnye-rynochnye-kursy-valyut'
        f'/report?rates%5B%5D=5&beginDate={begin_date}&endDate={end_date}'
    )

    max_attempts = 7
    page = None

    for _ in range(max_attempts):
        try:
            with time_limit(15):
                page = requests.get(url=url)
                break

        except TimeoutException:
            continue

    if page is None:
        raise TimeoutException(
            f'NBK_tenge: no response after {max_attempts} attempts')

    # Wrap the markup: passing literal HTML is deprecated.
    df = pd.read_html(StringIO(page.text))[0]
    # NOTE(review): dates are parsed without dayfirst=True (it was commented
    # out in the original) - confirm against the report's date format.
    df['Unnamed: 0'] = pd.to_datetime(df['Unnamed: 0'])
    df = df.rename(columns={'Unnamed: 0': 'date'})

    with pd.ExcelWriter(
            '../parser_beta/data/nbk_tenge.xlsx') as writer:
        df.to_excel(writer, sheet_name='tenge')

    google_upload(df, 'nbk_tenge')

    return df

In [27]:
################################################################
##########################  SHMET ##############################
################################################################

def shmet_optimized():
    """Append today's SHMET copper spot price to the local history and upload it.

    The base-metal spot list is requested from the SHMET API, rows whose
    'name' contains "cu" are kept and stamped with today's date, then
    prepended to './data/shmet_historical.xlsx'. The file is written, read
    back, de-duplicated once more and written again before the result is
    uploaded to the 'SHMET' sheet.

    Returns:
        The history dataframe as re-read from the file and de-duplicated.
    """
    url = 'https://en.shmet.com/api/rest/enweb/spot/getSpotPrice?code=baseMetal&size=10&currentLength=0'
    # One path for reading and writing. The function re-reads what it has
    # just written, which only worked while './data' and
    # '../parser_beta/data' happened to be the same directory.
    db_path = './data/shmet_historical.xlsx'

    response = requests.get(url)

    day_df = pd.DataFrame(response.json()['data'])
    day_df['date'] = date.today()
    copper_rows = day_df[day_df['name'].str.contains("cu", case=False)]

    # Build a new frame instead of assigning into a slice of `day_df`.
    result = (
        copper_rows[['date', 'middle', 'unit']]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .rename(columns={'middle': 'price'})
    )

    hist_data = pd.read_excel(db_path, index_col=0)

    new_df = (
        pd.concat([result, hist_data], axis=0)
        .reset_index(drop=True)
        .drop_duplicates()
    )

    with pd.ExcelWriter(
            db_path,
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD") as writer:
        new_df.to_excel(writer, sheet_name='SHMET')

    # Second pass over the file contents, kept from the original.
    # NOTE(review): presumably the Excel round trip normalises value types so
    # that today's row compares equal to an already stored one - confirm.
    final = pd.read_excel(db_path, index_col=0).drop_duplicates()

    with pd.ExcelWriter(
            db_path,
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD") as writer:
        final.to_excel(writer, sheet_name='SHMET')

    google_upload(final, 'SHMET')

    return final

In [28]:
################################################################
################## NEW_WESTMETALL###############################
################################################################

def new_westmetall():
    """Update the local westmetall LME cash-price history and upload it.

    For aluminium, copper, lead, nickel and zinc the first 30 rows of the
    westmetall table are downloaded and joined on 'date' (left join on the
    aluminium dates). The rows are appended to './data/LME_westmetall_db.xlsx',
    keeping the first row per date, sorted, written back to the same file and
    uploaded to the 'LME_westmetall' sheet.

    Returns:
        The updated history dataframe.
    """
    # One path for reading and writing; the original wrote to
    # '../parser_beta/data/...', which only matched the file it read from
    # when the working directory was named 'parser_beta'.
    db_path = './data/LME_westmetall_db.xlsx'

    def get_data(metall, col_name):
        """Return columns 'date' and `col_name` for one metal."""
        url = f'https://www.westmetall.com/en/markdaten.php?action=table&field=LME_{metall}_cash'
        response = requests.get(url=url)

        # Wrap the markup: passing literal HTML is deprecated.
        df = pd.read_html(StringIO(response.text))[0][:30]

        # First two columns only; drop rows that merely repeat the header.
        # `.copy()` makes the frame independent before columns are assigned.
        data = df.iloc[:, :2].query("date != 'date'").copy()

        value_col = data.columns[1]
        data['date'] = pd.to_datetime(data['date'])
        # Assign by label so the column really becomes numeric.
        data[value_col] = pd.to_numeric(data[value_col])

        return data.rename(columns={value_col: col_name})

    al = get_data(metall='Al', col_name='aluminium')
    cu = get_data(metall='Cu', col_name='copper')
    pb = get_data(metall='Pb', col_name='lead')
    ni = get_data(metall='Ni', col_name='nickel')
    zn = get_data(metall='Zn', col_name='zink')

    result = pd.merge(al, cu, on='date', how='left').merge(pb, on='date', how='left').merge(
        ni, on='date', how='left').merge(zn, on='date', how='left')

    old_data = pd.read_excel(db_path, index_col=0)

    # Stored rows come first, so for a repeated date the stored row wins.
    final_data = (
        pd.concat([old_data, result], axis=0)
        .drop_duplicates(subset='date')
        .sort_values(by='date')
        .reset_index(drop=True)
    )

    with pd.ExcelWriter(
            db_path,
            date_format="YYYY-MM-DD",
            datetime_format="YYYY-MM-DD") as writer:
        final_data.to_excel(writer, sheet_name='LME_westmetall')

    google_upload(final_data, 'LME_westmetall')

    return final_data

In [29]:
# Refresh the westmetall LME cash-price history (Excel file + 'LME_westmetall' sheet).
new_westmetall()

Unnamed: 0,date,aluminium,copper,lead,nickel,zink
0,2022-01-04,2815.5,9660.0,2327.0,20730.0,3602.0
1,2022-01-05,2866.0,9778.0,2343.0,20900.0,3660.0
2,2022-01-06,2912.5,9565.0,2291.0,20480.0,3590.0
3,2022-01-07,2919.5,9615.0,2342.0,20725.0,3602.5
4,2022-01-10,2923.0,9665.0,2303.0,21045.0,3576.5
...,...,...,...,...,...,...
510,2024-01-12,2175.0,8289.0,2058.0,16075.0,2480.0
511,2024-01-15,2153.0,8238.5,2078.0,16200.0,2535.0
512,2024-01-16,2158.5,8280.0,2086.0,15880.0,2518.5
513,2024-01-17,2142.0,8230.0,2033.0,15765.0,2487.0


In [30]:
# Refresh the cbr.ru exchange-rate files; prints the tail of each currency table.
cb_currencies()

USD           Date  Nominal    Value
497 2024-01-13        1  88.1324
498 2024-01-16        1  87.6772
499 2024-01-17        1  87.6457
500 2024-01-18        1  88.3540
501 2024-01-19        1  88.6610
EUR           Date  Nominal    Value
497 2024-01-13        1  96.7517
498 2024-01-16        1  96.0682
499 2024-01-17        1  95.6007
500 2024-01-18        1  96.0531
501 2024-01-19        1  96.5882
Australian_Dollar           Date  Nominal    Value
497 2024-01-13        1  59.0840
498 2024-01-16        1  58.6823
499 2024-01-17        1  57.9776
500 2024-01-18        1  58.0574
501 2024-01-19        1  58.1439
China_Yuan           Date  Nominal    Value
497 2024-01-13        1  12.2428
498 2024-01-16        1  12.1520
499 2024-01-17        1  12.1217
500 2024-01-18        1  12.1935
501 2024-01-19        1  12.2490
British_Pound           Date  Nominal     Value
497 2024-01-13        1  112.0515
498 2024-01-16        1  111.8849
499 2024-01-17        1  111.5817
500 2024-01-18       

In [31]:
# Refresh the cbr.ru precious-metal history; prints the tail of the combined table.
cb_metalls()

          Date     Gold  Silver Platinum Palladium
496 2024-01-12   5785.3   65.55  2654.59   2845.84
497 2024-01-13  5749.64   65.26  2618.17   2833.52
498 2024-01-16  5794.64   64.99  2615.93   2807.61
499 2024-01-17  5776.36   65.40  2581.17   2713.61
500 2024-01-18  5789.66   65.48  2567.94   2681.57


In [32]:
# Append the latest KITCO London Fix row to the history and upload it to the 'KITCO' sheet.
kitco_db()


Unnamed: 0,Date,Gold,Silver,Platinum,Palladium
0,2023-01-03,1843.25,24.295,1082.0,1795.0
1,2023-01-04,1857.30,24.290,1080.0,1736.0
2,2023-01-05,1834.00,23.410,1062.0,1783.0
3,2023-01-06,1852.20,23.455,1073.0,1784.0
4,2023-01-09,1878.85,23.850,1092.0,1793.0
...,...,...,...,...,...
258,2024-01-11,2029.15,23.030,924.0,1000.0
259,2024-01-12,2055.65,23.060,928.0,996.0
260,2024-01-15,2049.90,23.210,916.0,963.0
261,2024-01-16,2038.15,23.050,904.0,944.0


In [33]:
# Append the latest LBMA prices to the history; metals without a price on a date are stored as 0.
lbma_prescious()

Unnamed: 0,Date,Gold,Silver,Platinum,Palladium
0,1968-01-02,0.00,2.173,0.0,0.0
1,1968-01-03,0.00,2.225,0.0,0.0
2,1968-01-04,0.00,2.171,0.0,0.0
3,1968-01-05,0.00,2.157,0.0,0.0
4,1968-01-08,0.00,2.163,0.0,0.0
...,...,...,...,...,...
14171,2024-01-12,2055.65,23.055,928.0,996.0
14172,2024-01-15,2049.90,23.210,916.0,963.0
14173,2024-01-16,2038.15,23.050,904.0,944.0
14174,2024-01-17,2011.75,22.805,897.0,927.0


In [34]:
# Download the National Bank of Kazakhstan report and upload it to the 'nbk_tenge' sheet.
nbk_tenge()

Unnamed: 0,date,Числовое значение,ДОЛЛАР США
0,2022-01-01,1,431.80
1,2022-01-02,1,431.80
2,2022-01-03,1,431.80
3,2022-01-04,1,431.80
4,2022-01-05,1,431.80
...,...,...,...
744,2024-01-15,1,451.33
745,2024-01-16,1,451.37
746,2024-01-17,1,451.25
747,2024-01-18,1,452.95


In [35]:
# Add today's SHMET copper price to the history and upload it to the 'SHMET' sheet.
shmet_optimized()


Unnamed: 0,date,price,unit
0,2024-01-19,67640,Yuan/MT
1,2024-01-18,67960,Yuan/MT
2,2024-01-17,68020,Yuan/MT
3,2024-01-16,68020,Yuan/MT
4,2024-01-15,68300,Yuan/MT
...,...,...,...
791,2020-01-17,48930,Yuan/MT
792,2020-01-16,48950,Yuan/MT
793,2020-01-15,49060,Yuan/MT
794,2020-01-14,48990,Yuan/MT


In [36]:
# Scrape LME prices with Selenium; a metal that fails to parse is only printed
# and ends up as a missing value in the new row.
lme_selenium()

list index out of range


Unnamed: 0,date,aluminium,copper,lead,nickel,zink
0,2022-01-04,2815.5,9660.0,2327.0,20730,3602.0
1,2022-01-05,2866.0,9778.0,2343.0,20900,3660.0
2,2022-01-06,2912.5,9565.0,2291.0,20480,3590.0
3,2022-01-07,2919.5,9615.0,2342.0,20725,3602.5
4,2022-01-10,2923.0,9665.0,2303.0,21045,3576.5
...,...,...,...,...,...,...
510,2024-01-13,2175.0,8289.0,2058.0,16075,2480.0
511,2024-01-16,2153.0,8238.5,2078.0,16200,2535.0
512,2024-01-16,2158.5,8280.0,2086.0,15880,2518.5
513,2024-01-17,2142.0,,2033.0,15765,2487.0
