In [1]:
import calendar
import datetime
import logging
import os
from urllib.parse import quote

import clickhouse_connect
import numpy as np
import pandahouse as ph
import pandas as pd
import sqlalchemy

In [2]:
# Route INFO-and-above log records to py_log.log, one "<time> <level> <message>" line each.
# filemode="w" opens the file for writing, so the previous run's log is overwritten.
logging.basicConfig(level=logging.INFO, filename="py_log.log",filemode="w",
                    format="%(asctime)s %(levelname)s %(message)s")

# Connections

In [3]:
# PostgreSQL source connection.
# The password is read from the environment instead of being committed with the
# notebook: set PG_PASSWORD before running (KeyError otherwise). The remaining
# parts default to the values this notebook has always used and can be overridden
# with PG_USER / PG_HOST / PG_PORT / PG_DATABASE.
# User and password are percent-encoded so characters such as '@', ':' or '/'
# cannot break the URL.
engine = sqlalchemy.create_engine(
    'postgresql://{user}:{password}@{host}:{port}/{database}'.format(
        user=quote(os.environ.get('PG_USER', 'postgres'), safe=''),
        password=quote(os.environ['PG_PASSWORD'], safe=''),
        host=os.environ.get('PG_HOST', 'db.mpkazantsev.ru'),
        port=os.environ.get('PG_PORT', '5432'),
        database=os.environ.get('PG_DATABASE', 'postgres'),
    )
)

In [4]:
# ClickHouse client used for DDL / maintenance commands.
# The password is read from the environment instead of being committed with the
# notebook: set CH_PASSWORD before running (KeyError otherwise). Host, port and
# user default to the values this notebook has always used and can be overridden
# with CH_HOST / CH_PORT / CH_USER.
client = clickhouse_connect.get_client(host=os.environ.get('CH_HOST', 'db.mpkazantsev.ru'),
                                       port=int(os.environ.get('CH_PORT', '8123')),
                                       username=os.environ.get('CH_USER', 'sergey'),
                                       password=os.environ['CH_PASSWORD'])

# Wrap a value in single quotes so it can be used as a SQL string literal

In [5]:
def make_db_string(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Parameters
    ----------
    value : object
        The value to render. Strings are used as-is; any other non-``None``
        value is converted with ``str()`` first (e.g. a ``datetime.date``
        becomes ``'YYYY-MM-DD'``).

    Returns
    -------
    str
        ``value`` wrapped in single quotes, with every embedded single quote
        doubled so it cannot terminate the literal early. ``None`` is rendered
        as the bare keyword ``NULL``.
    """
    if value is None:
        return "NULL"
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    text = value if isinstance(value, str) else str(value)
    # Doubling the quote is the standard SQL way to escape it inside a literal.
    return "'" + text.replace("'", "''") + "'"

# List of days in YYYY-MM-DD format

In [23]:
def get_load_period_day_by_day(year=2018, month=5):
    """List every day of the given month, formatted for use in SQL.

    Each calendar day of ``year``/``month`` is formatted as ``YYYY-MM-DD`` and
    passed through ``make_db_string``; the results are returned in
    chronological order.
    """
    # monthrange returns (weekday of the 1st, number of days); only the latter is needed.
    _, days_in_month = calendar.monthrange(year, month)
    quoted_days = []
    for day_number in range(1, days_in_month + 1):
        iso_day = datetime.date(year, month, day_number).strftime("%Y-%m-%d")
        quoted_days.append(make_db_string(iso_day))
    return quoted_days

In [24]:
# Days to migrate: called with the defaults, i.e. every day of May 2018.
days = get_load_period_day_by_day()

# Clear sl_from_pg table

In [8]:
def clear_table(ch_client, table):
    """Delete every row of ``maindb.<table>`` via an ``ALTER TABLE ... DELETE``.

    ``ch_client`` must expose a ``command(sql)`` method; ``table`` is inserted
    into the statement verbatim, so it must be a trusted identifier.
    """
    statement_template = 'ALTER TABLE maindb.{} DELETE WHERE 1=1'
    delete_all_rows = statement_template.format(table)
    ch_client.command(delete_all_rows)

In [9]:
def drop_table(ch_client, table):
    """Drop ``maindb.<table>``.

    ``ch_client`` must expose a ``command(sql)`` method; ``table`` is inserted
    into the statement verbatim, so it must be a trusted identifier.
    """
    statement_template = 'DROP TABLE maindb.{}'
    drop_statement = statement_template.format(table)
    ch_client.command(drop_statement)

In [None]:
#drop_table(client, 'sl_from_pg')

# Get PG types

In [11]:
# Column names and PostgreSQL data types of the source table `ontime`.
# ORDER BY ordinal_position pins the rows to the table's column order; without it
# the row order is unspecified, and the CREATE TABLE cell relies on the first row
# being the first column.
# NOTE(review): there is no table_schema filter, so a table named `ontime` in
# another schema would contribute rows too - confirm only one exists.
types = pd.read_sql("""SELECT column_name, data_type
                       FROM information_schema.columns
                       WHERE table_name = 'ontime'
                       ORDER BY ordinal_position
                    """, engine)
# Distinct PostgreSQL types that need a ClickHouse equivalent (displayed as cell output).
types.data_type.unique()

array(['bigint', 'date', 'text'], dtype=object)

# Map PostgreSQL column types to ClickHouse types

In [12]:
# PostgreSQL data_type -> ClickHouse column type, as (source, target) pairs.
mapping_dict = dict([
    ('bigint', 'Int64'),
    ('date', 'Date'),
    ('text', 'String'),
])

In [13]:
def mapping_types(pg_type):
    """Return the ClickHouse type registered for a PostgreSQL type name.

    Parameters
    ----------
    pg_type : str
        A PostgreSQL ``data_type`` value, looked up in ``mapping_dict``.

    Returns
    -------
    str
        The corresponding ClickHouse type name.

    Raises
    ------
    KeyError
        If ``pg_type`` has no entry in ``mapping_dict``. The message names the
        offending type and the types that are supported.
    """
    try:
        return mapping_dict[pg_type]
    except KeyError:
        # Same exception type as a plain lookup, but the message says what to fix.
        raise KeyError(
            "No ClickHouse type mapping for PostgreSQL type {!r}; known types: {}".format(
                pg_type, sorted(mapping_dict))
        ) from None

In [14]:
# Add a clickhouse_type column by translating each row's PostgreSQL data_type.
# NOTE(review): this writes into `types` in place and reads its data_type column,
# so it cannot be re-run after `types` has been reduced to other columns.
types['clickhouse_type'] = types['data_type'].apply(mapping_types)

In [15]:
# Keep only the column name and its ClickHouse type; data_type is dropped here.
types = types[['column_name', 'clickhouse_type']]

# Create table query for sl_from_pg table

In [17]:
# Build the CREATE TABLE statement for maindb.sl_from_pg from the mapped column types.
# The first row of `types` is skipped and replaced by a non-nullable `index` Int64
# column, which also serves as the MergeTree sorting key.
# NOTE(review): this assumes the first row of `types` is the source `index` column - confirm.
column_defs = ["`index` Int64"]
column_defs += [
    # Columns are read by name rather than by position, and identifiers are
    # backtick-quoted so names that clash with keywords stay valid.
    "`{}` {} NULL".format(column_name, clickhouse_type)
    for column_name, clickhouse_type in zip(types['column_name'].iloc[1:],
                                            types['clickhouse_type'].iloc[1:])
]
query = "CREATE TABLE maindb.sl_from_pg ( {}) ENGINE = MergeTree ORDER BY `index`".format(
    ", ".join(column_defs))

In [18]:
# Run the CREATE TABLE statement held in `query` on ClickHouse.
client.command(query)

''

# Additional connection to ClickHouse (pandahouse), used to load DataFrames as is

In [20]:
# Connection settings passed to pandahouse for the DataFrame inserts.
# The password is read from the environment instead of being committed with the
# notebook: set CH_PASSWORD before running (KeyError otherwise). Host, port and
# user default to the values this notebook has always used and can be overridden
# with CH_HOST / CH_PORT / CH_USER. The database stays fixed because the DDL
# statements elsewhere in this notebook address `maindb` explicitly.
connection = {'host': 'http://{}:{}'.format(os.environ.get('CH_HOST', 'db.mpkazantsev.ru'),
                                            os.environ.get('CH_PORT', '8123')),
              'database': 'maindb',
              'user': os.environ.get('CH_USER', 'sergey'),
              'password': os.environ['CH_PASSWORD']}

In [21]:
# ClickHouse table that receives the rows inserted through pandahouse.
table_name = 'sl_from_pg'

In [None]:
# Mark the start of the migration in the log.
logging.info("Starting load from PostgreSQL and insert into ClickHouse")

# Migration from PG to CH (data from 1 month, day by day)

In [22]:
# Copy the source table one day at a time: read that day's rows from PostgreSQL
# into a DataFrame, then insert the frame into ClickHouse.
# Each element of `days` is spliced into the SQL text as-is, so it must already
# be a quoted date literal (e.g. '2018-05-01').
# NOTE(review): re-running this cell inserts the same rows again - clear the
# target table first if a repeat load is intended.
for day in days:
    try:
        df = pd.read_sql("""SELECT *
                            FROM ontime
                            WHERE "FlightDate" = {}::date
                        """.format(day), engine)
        logging.info("df from %s created", day)
        if df.empty:
            # Nothing to send for this day; skip the insert round-trip.
            logging.info("df from %s is empty, nothing to insert", day)
            continue
        ph.to_clickhouse(df, table_name, connection=connection, index=False)
        logging.info("df from %s inserted", day)
    except Exception:
        # Record which day failed (with traceback) before the error stops the run,
        # so the log shows where to resume.
        logging.exception("Migration failed for day %s", day)
        raise

In [None]:
# Mark the end of the migration in the log.
logging.info("End of migration")