In [36]:
import requests
import pandas as pd
import numpy as np
import re

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

### Parse methods

In [2]:
def parseGuitarLinks(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    hrefs = [a['href'] for a in soup.find_all('a', class_='catalog-card__name')]
    hrefs = ['https://www.muztorg.ru' + i for i in hrefs]

    return hrefs

In [30]:
def parseGuitarData(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    name = None
    brand = None
    fret_amount = None
    string_amount = None
    scale = None
    body = None
    body_material = None

    price_data_1 = soup.find('div', class_='mt-product-price__default-value')
    price_data_2 = soup.find('div', class_='mt-product-price__discounted-value')
    if price_data_1 == None and price_data_2 == None:
        price = None
    elif price_data_1 == None:
        price = price_data_2.text
        price = int(price.replace(' ₽', '').replace(' ', '').replace('\xa0', '')) / 1000
    else:
        price = price_data_1.text
        price = int(price.replace(' ₽', '').replace(' ', '').replace('\xa0', '')) / 1000

    tmp_name = soup.find('h1', class_='title-1')
    if tmp_name:
        name = tmp_name.text
        brand = name.split()[0]

    characteristics = soup.find('div', class_='mt-product-info__list')
    if characteristics:
        divs = characteristics.find_all('div')
        text_characteristics = [divs[i].text for i in range(len(divs))]

        for i in range(len(text_characteristics)):
            if 'Количество ладов (диапазон)' in text_characteristics[i]:
                fret_amount = int(divs[i].find('span').text)
                
            if 'Количество струн' in text_characteristics[i]:
                string_amount = int(divs[i].find('span').text)
        
            if 'Материал корпуса' in text_characteristics[i]:
                body_material = divs[i].find('span').text
        
            if 'Форма корпуса' in text_characteristics[i]:
                body = divs[i].find('span').text
                
            if 'Мензура, дюймы' in text_characteristics[i]:
                scale = float(divs[i].find('span').text)

    return name, brand, price, fret_amount, string_amount, scale, body, body_material

In [31]:
def fetch_guitar_data(href):
    return parseGuitarData(href)

In [4]:
def parseCurrGuitarData(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    frets = 0
    strings = 0
    bodies = ''
    scales = 0.0

    # parse main div
    guitar_data = soup.find_all('div', class_='panel-group visible-xs')

    if guitar_data:
        panel_body = guitar_data[0].find('div', id='mobile-characteristics')
        
        if panel_body:
            text = panel_body.find_all('li')
            text = [str(text[i]) for i in range(len(text))]
            
            # postprocessing
            keys_to_extract = ["Количество ладов (диапазон)", "Количество струн", "Форма корпуса", "Мензура, дюймы"]
            filtered_data = [item for item in text if any(key in item for key in keys_to_extract)]
            pattern = r':\s*(.+?)</li>'
            
            extracted_values = []
            for item in filtered_data:
                match = re.search(pattern, item)
                if match:
                    extracted_values.append(match.group(1))
            
            if len(extracted_values) >= 4:
                frets = int(extracted_values[0])
                strings = int(extracted_values[1])
                bodies = extracted_values[2]
                scales = float(extracted_values[3])

    return frets, strings, bodies, scales

 brands = []
    prices = []
    names = []
    links = set()
    
    #parse main div
    guitar_div = soup.find_all('div', class_='thumbnail-list grid-3')

    
    #parse required data from guitar_divs
    for div in guitar_div:
        meta_tags_brands = div.find_all('meta', {'itemprop': 'brand'})
        for meta_tag in meta_tags_brands:
            brand = meta_tag.get('content')
            brands.append(brand)

    for div in guitar_div:
        p_tags = div.find_all('p', class_='price')
        for price_tag in p_tags:
            price = price_tag.get_text()
            prices.append(price)
            
    for div in guitar_div:
        meta_tags = div.find_all('meta', {'itemprop': 'name'})
        for meta_tag in meta_tags:
            content = meta_tag.get('content')
            words = re.findall(r'\b[\w-]+\b', content)
            name = ' '.join(words)
            names.append(name)

    for div in guitar_div:
        a_tags_links = soup.find_all('a', href=True)
        for link in a_tags_links:
            href = link.get('href')
            if '/product/' in href:
                links.add('https://www.muztorg.ru' + href + '?view_tab=characteristics')
            

    #postprocess data (if required)
    prices = [re.sub(r'\D', '', prices[i]) for i in range(len(prices))]
    for i in range(len(prices)):
        prices[i] = int(prices[i])/1000

    links = list(links)

    
    return brands, prices, names, links

IndentationError: unindent does not match any outer indentation level (<string>, line 39)

### Generate urls for parsing and parse all guitar links

In [32]:
links = [f'https://www.muztorg.ru/category/elektrogitary?all-stock=1&page={page}' for page in range(1, 80)]

In [9]:
HREFS = []
for i in range(len(links)):
    HREFS.extend(parseGuitarLinks(links[i]))

len(HREFS)

2528

###  Initialize global arrays for storing data

In [33]:
NAMES = []
BRANDS = []
PRICES = []
FRET_AMOUNTS = []
STRING_AMOUNTS = []
SCALES = []
BODIES = []
BODY_MATERIALS = []

### Append parsed data to required global arrays

In [34]:
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(fetch_guitar_data, HREFS), total=len(HREFS), desc="Fetching guitar data"))

for result in results:
    NAMES.append(result[0])
    BRANDS.append(result[1])
    PRICES.append(result[2])
    FRET_AMOUNTS.append(result[3])
    STRING_AMOUNTS.append(result[4])
    SCALES.append(result[5])
    BODIES.append(result[6])
    BODY_MATERIALS.append(result[7])

Fetching guitar data: 100%|████████████████| 2528/2528 [14:01<00:00,  3.00it/s]


In [35]:
len(NAMES), len(BRANDS), len(PRICES), len(FRET_AMOUNTS), len(STRING_AMOUNTS), len(SCALES), len(BODIES), len(BODY_MATERIALS)

(2528, 2528, 2528, 2528, 2528, 2528, 2528, 2528)

### Paste data to dataset

In [46]:
data = {
    'name': NAMES,
    'brand': BRANDS,
    'price': PRICES,
    'fret-amount': FRET_AMOUNTS,
    'string-amount': STRING_AMOUNTS,
    'scale': SCALES,
    'body': BODIES,
    'body-material': BODY_MATERIALS
}

In [47]:
df = pd.DataFrame(data)

In [63]:
df

Unnamed: 0,name,brand,price,fret-amount,string-amount,scale,body,body-material
0,IBANEZ GRG121DX-BKF,IBANEZ,27.60,22,6,25.5,Modern stat,тополь
1,ROCKDALE Stars Black Limited Edition HSS BK,ROCKDALE,13.30,24,6,24.5,Stratocaster,тополь
2,IBANEZ GRX70QA-TRB,IBANEZ,23.94,24,6,25.5,Superstrat,тополь
3,ROCKDALE Stars HSS BK,ROCKDALE,12.20,24,6,25.5,Stratocaster,тополь
4,IBANEZ GRG121DX-WNF,IBANEZ,27.90,22,6,25.5,Superstrat,красное дерево
...,...,...,...,...,...,...,...,...
2523,"GRETSCH G2655 Streamliner C/Block Jr. DC, V-St...",GRETSCH,84.00,22,6,26.5,Superstrat,тополь
2524,"GRETSCH G2622 Streamliner C/Block DC, V-Stopta...",GRETSCH,86.00,24,6,26.5,Modern stat,ольха
2525,"GRETSCH G2622 Streamliner C/Block DC, V-Stopta...",GRETSCH,86.00,24,6,24.5,Stratocaster,тополь
2526,"GRETSCH G2655 Streamliner C/Block Jr. DC, V-St...",GRETSCH,86.00,22,6,26.5,Stratocaster,красное дерево


In [64]:
df.to_csv('guitar-data.csv', index=False, encoding='utf-8')