In [1]:
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect
# Read the .env file next to the notebook (the path is relative to the working
# directory). As the last expression of the cell, the call's return value is
# what the cell displays.
load_dotenv('./.env')

True

In [2]:
# Database connection settings are taken from environment variables.
DB_IP = os.environ.get("DB_IP")
DB_PORT = os.environ.get("DB_PORT")
DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None".
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("DB_IP", DB_IP),
        ("DB_PORT", DB_PORT),
        ("DB_NAME", DB_NAME),
        ("DB_USER", DB_USER),
        ("DB_PASSWORD", DB_PASSWORD),
    )
    if setting_value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing database environment variables: " + ", ".join(missing_settings)
    )

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password cannot be misread as URL delimiters.
# NOTE(review): assumes the environment holds the raw values, not values that
# are already percent-encoded - confirm against the .env file.
url = (
    f"postgresql://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{DB_IP}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(url)
inspector = inspect(engine)

In [3]:
# Read the whole ml_house.basis_with_metrics table into a DataFrame and print
# how long the read took, in minutes.
start = time.time()
basis = pd.read_sql_table(
    table_name="basis_with_metrics",
    con=engine,
    schema="ml_house",
)
elapsed_minutes = (time.time() - start) / 60
print(elapsed_minutes)

0.9396559874216716


# posts_flags

In [4]:
# Read the whole parse.posts_flags table into a DataFrame and print the
# elapsed time in minutes.
start = time.time()
flags = pd.read_sql_table(
    table_name="posts_flags",
    con=engine,
    schema="parse",
)
elapsed_minutes = (time.time() - start) / 60
print(elapsed_minutes)

3.7246795376141866


In [5]:
# Print the (rows, columns) of the flags frame, then display one randomly
# drawn row as a preview (no seed is set, so the row differs between runs).
flags_dimensions = flags.shape
print(flags_dimensions)
flags.sample(n=1)

(17544153, 13)


Unnamed: 0,id,is_post,silent,noforwards,pinned,fwd_from_flag,photo,document,web,audio,voice,video,gif
14238017,5913976,True,False,False,False,False,False,True,False,False,False,True,False


In [6]:
# Inner-join the flag columns onto the basis rows through the shared `id`
# column, then display the dimensions of the result.
basis_with_flags = pd.merge(basis, flags, how="inner", on="id")

basis_with_flags.shape

(177373, 58)

# posts_metadata

In [8]:
# Read parse.posts_metadata and print the elapsed time in minutes.
# Only `id`, `channel_id` and `post_date` are kept by the next cell, so only
# those columns are requested from the database instead of the full table.
start = time.perf_counter()  # monotonic clock, suited to measuring durations
posts_metadata = pd.read_sql_table(
    "posts_metadata",
    schema="parse",
    con=engine,
    columns=["id", "channel_id", "post_date"],
)
print((time.perf_counter() - start) / 60)

6.740217089653015


In [9]:
# Keep only the metadata columns that are carried forward, then inner-join
# them onto the flagged basis rows through the shared `id` column.
# (The spelling of `basis_with_meatadata` is kept as-is because the channels
# merge further down refers to it by that name.)
posts_metadata = posts_metadata.loc[:, ["id", "channel_id", "post_date"]]

basis_with_meatadata = pd.merge(
    basis_with_flags,
    posts_metadata,
    how="inner",
    on="id",
)

basis_with_meatadata.shape

(177373, 60)

# channels

In [26]:
# Read the whole parse.channels table into a DataFrame and print the elapsed
# time in minutes.
start = time.time()
channels = pd.read_sql_table(
    table_name="channels",
    con=engine,
    schema="parse",
)
elapsed_minutes = (time.time() - start) / 60
print(elapsed_minutes)

0.06086671749750773


In [27]:
# Display the channels frame as loaded (all columns) for a visual check.
channels

Unnamed: 0,id,name,title,participants,last_pinned_msg_id,about
0,1097289882,medicalksu,Medical –ö—Å—é,12026,5335.0,–ö–∞–Ω–∞–ª –æ —Ü–∏—Ñ—Ä–æ–≤—ã—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è—Ö –≤ –∑–¥—Ä–∞–≤–æ–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏...
1,1203719412,ngised,Design Porn,10048,,
2,1336205252,TolmacChannel,Tolma√ß Channel,445,124.0,Group of @TolmacBot\nChat @tolmacchat\nContact...
3,1742423071,kobzevii,#–∫–æ–±–∑–µ–≤–Ω–∞—Å–≤—è–∑–∏,54831,9078.0,–û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫–∞–Ω–∞–ª –ì—É–±–µ—Ä–Ω–∞—Ç–æ—Ä–∞ –ò—Ä–∫—É—Ç—Å–∫–æ–π –æ–±–ª–∞—Å—Ç...
4,1362285551,vlgsud,–û–±—ä–µ–¥–∏–Ω–µ–Ω–Ω–∞—è –ø—Ä–µ—Å—Å-—Å–ª—É–∂–±–∞ —Å—É–¥–æ–≤ –í–æ–ª–≥–æ–≥—Ä–∞–¥—Å–∫–æ–π ...,1659,2520.0,
...,...,...,...,...,...,...
13189,1901789788,Dekretnyeistorii,–ü–æ —Å(–¥)–µ–∫—Ä–µ—Ç—É –≤—Å–µ–º—É —Å–≤–µ—Ç—Éüôà,784,,–ü—Ä–∏–≤–µ—Ç‚úåüèº—Ç—ã –∑–∞—à–ª–∞ –≤ –ú–æ–π –º–∞–ª–µ–Ω—å–∫–∏–π –ª–∏—á–Ω—ã–π –¥–Ω–µ–≤–Ω–∏...
13190,1488393404,tre_kyrsk,–¢—Ä–µ–≤–æ–∂–Ω—ã–π –ö—É—Ä—Å–∫ | –°—É–¥–∂–∞,38208,6708.0,–ù–æ–≤–æ—Å—Ç–∏ –ö—É—Ä—Å–∫–∞ –∏ –ö—É—Ä—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏.\n\n–ü—Ä–µ–¥–ª–æ–∂–∏—Ç...
13191,1239513509,naufortelegram,–ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–∞—è –∞—Å—Å–æ—Ü–∏–∞—Ü–∏—è —É—á–∞—Å—Ç–Ω–∏–∫–æ–≤ —Ñ–æ–Ω–¥–æ–≤–æ–≥–æ —Ä...,1871,,–ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–∞—è –∞—Å—Å–æ—Ü–∏–∞—Ü–∏—è —É—á–∞—Å—Ç–Ω–∏–∫–æ–≤ —Ñ–æ–Ω–¥–æ–≤–æ–≥–æ —Ä...
13192,1962245308,Kuturchin,–ë–∞–∑–∞ –æ—Ç–¥—ã—Ö–∞ –ö—É—Ç—É—Ä—á–∏–Ω-–¶–µ–Ω—Ç—Ä,232,742.0,


In [34]:
# Narrow the channel table to its key plus the two attributes to attach.
channel_columns = ["id", "participants", "about"]
channels = channels[channel_columns]

# Inner-join channel attributes onto each row via channel_id -> channels.id.
# Both frames have an `id` column: the left one keeps its name (empty suffix)
# and the channel key is carried along as `id_channels`.
basis_with_channels = basis_with_meatadata.merge(
    channels,
    how="inner",
    left_on="channel_id",
    right_on="id",
    suffixes=("", "_channels"),
)

basis_with_channels.shape

(177373, 63)

# save

In [37]:
# Write the assembled frame to ml_house.final_basis and print the elapsed
# time in minutes.
# if_exists="replace": with the pandas default ("fail") this cell raises as
# soon as the table exists, i.e. on every re-run of the notebook. Replacing
# drops and recreates the table, so the cell can be run repeatedly.
# The DataFrame index is still written as a column (pandas default), exactly
# as before.
start = time.perf_counter()  # monotonic clock, suited to measuring durations
basis_with_channels.to_sql(
    "final_basis",
    schema="ml_house",
    con=engine,
    if_exists="replace",
)
print((time.perf_counter() - start) / 60)

14.239390444755553
