In [91]:
import pandas as pd
import requests
from sqlalchemy import create_engine
import random
from datetime import datetime
import json
from postgre_cred import database_url

In [92]:
engine = create_engine(database_url)

In [93]:
def generate_url(full_slug: str, cat_id: int):
    url = f'https://veli.store/_next/data/O6jvrOse3zlGpiC7AWBmz/ka/category/{full_slug}/{cat_id}.json?'
    slug_parts = full_slug.split('/')
    slug_parts = list(map(lambda x: f'type={x}&', slug_parts))
    slug_parts = ''.join(slug_parts)
    url = f'{url}{slug_parts}type={cat_id}'
    return url

In [94]:
from functools import cache

@cache
def get_all_sub_cat_dataframes(url):
    tech_sub_cat_data = requests.get(url).json()
    
    # If it's a leaf category with no subcategories
    if len(tech_sub_cat_data['pageProps']['data']['categories']) == 0:
        print(f"{tech_sub_cat_data['pageProps']['data']['headline']} IS A LEAF CATEGORY!")
        return [extract_data(url)]  # Return as a list containing this df
    else:
        tech_sub_cat_data = tech_sub_cat_data['pageProps']['data']['categories']
        tech_sub_cat_df = pd.DataFrame(tech_sub_cat_data)
        tech_sub_cat_df = tech_sub_cat_df[['id', 'full_slug']]
        
        urls = tech_sub_cat_df.apply(lambda row: generate_url(row['full_slug'], row['id']), axis=1)
        
        result = []
        for sub_url in urls:
            result.extend(get_all_sub_cat_dataframes(sub_url))
        return result

In [95]:
def extract_data(url):
    page = 1
    products = []
    slug = 'tmp'
    while True:
        resp = requests.get(f'{url}&page_size=1000&page={page}')
        resp = resp.json()
        if page == 1:
            slug = resp['pageProps']['data']['fullSlug']
            slug = slug.lower().replace('-', '_').replace(' ', '-')
        current_products = resp['pageProps']['data']['products']
        if not len(current_products):
            break
        products.extend(current_products)
        page += 1
    
    
    df = pd.DataFrame(products)
    print('DATA EXTRACTION SUCCESS!')
    return [df, slug]

In [103]:
def clean_data(df: pd.DataFrame):
    try:
        df['price'] = df['stock'].apply(lambda x: x['price'])
        df['old_price'] = df['stock'].apply(lambda x: x['start_price'])
        df = df[['price', 'old_price', 'headline', 'image', 'slug']]
        df.rename(columns={'headline': 'name', 'slug': 'url'}, inplace=True)
        print('DATA CLEANING SUCCESS!')
    except:
        print('ERROR ON THIS CATEGORY!')
    return df

In [98]:
def export_data(df: pd.DataFrame, db_name):
    df.to_sql(db_name, con=engine, if_exists='replace', index=True)
    print(f'{db_name} DATA EXPORT SUCCESS!')

In [101]:
def final():
    main_url = 'https://veli.store/_next/data/O6jvrOse3zlGpiC7AWBmz/ka/category/teqnika/774.json?type=teqnika&type=774'
    dfs_n_names = get_all_sub_cat_dataframes(main_url)
    cleaned_dfs = list(map(lambda x: clean_data(x[0]), dfs_n_names))
    for i in range(len(cleaned_dfs)):
        try:
            export_data(cleaned_dfs[i], f'veli_{dfs_n_names[i][1]}')
        except:
            print(f'Error on {dfs_n_names[i][1]}')

In [104]:
final()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'headline': 'name', 'slug': 'url'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'headline': 'name', 'slug': 'url'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'headline': 'name', 'slug': 'url'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
ERROR ON THIS CATEGORY!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEANING SUCCESS!
DATA CLEAN