In [None]:
import pandas as pd
import pandahouse as ph
import sqlalchemy
import clickhouse_connect
import logging

# Logging conf

In [None]:
# Route INFO-and-above records to dataframe_logs.log; filemode "w" starts the
# file afresh each time this configuration takes effect.
logging.basicConfig(
    filename="dataframe_logs.log",
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

# Connections

In [None]:
# SQLAlchemy engine for the PostgreSQL source database.
# NOTE(review): the connection URL embeds the user and password in plain text;
# consider reading it from an environment variable before sharing this notebook.
engine = sqlalchemy.create_engine('postgresql://postgres:5555@db.mpkazantsev.ru:5432/postgres')

In [None]:
# clickhouse_connect client, used below for DDL and ALTER ... DELETE commands.
# NOTE(review): username and password are hardcoded; consider loading them from
# the environment before sharing this notebook.
client = clickhouse_connect.get_client(
    host='db.mpkazantsev.ru',
    port=8123,
    username='sergey',
    password='sergey',
)

# Additional connection to ClickHouse, used to load the DataFrame as is

In [None]:
# Connection settings passed to pandahouse (`ph.to_clickhouse`) as `connection=`.
# This is a plain dict of settings, not a live connection object.
# NOTE(review): credentials are hardcoded; consider loading them from the environment.
connection = dict(
    host='http://db.mpkazantsev.ru:8123',
    database='maindb',
    user='sergey',
    password='sergey',
)

# Clear sl_from_pg table

In [None]:
def clear_table(ch_client, table):
    """Delete every row of ``maindb.<table>`` and log the action.

    Args:
        ch_client: Object exposing ``command(sql)``; the statement is sent through it.
        table: Table name inside the ``maindb`` database (interpolated as is).
    """
    statement = f'ALTER TABLE maindb.{table} DELETE WHERE 1=1'
    ch_client.command(statement)
    logging.info(f"Clear table {table}")

In [None]:
def drop_table(ch_client, table):
    """Drop ``maindb.<table>`` and log the action.

    Args:
        ch_client: Object exposing ``command(sql)``; the statement is sent through it.
        table: Table name inside the ``maindb`` database (interpolated as is).
    """
    statement = f'DROP TABLE maindb.{table}'
    ch_client.command(statement)
    logging.info(f"Drop table {table}")

In [None]:
#drop_table(client, 'sl_from_pg')

In [None]:
#clear_table(client, 'sl_from_pg')

# Get PG types

In [None]:
# Column names and PostgreSQL data types of the source table `ontime`.
# ORDER BY ordinal_position pins the rows to the table's own column order:
# without it the row order is unspecified, and the CREATE TABLE cell further
# down skips "the first row" by position.
# NOTE(review): there is no table_schema filter, so a same-named table in
# another schema would contribute rows too — confirm this cannot happen.
types = pd.read_sql("""SELECT column_name, data_type 
                       FROM information_schema.columns
                       WHERE table_name = 'ontime'
                       ORDER BY ordinal_position
                    """, engine)
# Displayed as the cell output: the distinct PG types that need a mapping.
types.data_type.unique()

# Mapping function for PG -> ClickHouse types

In [None]:
# PostgreSQL `data_type` name -> ClickHouse column type.
# Only the types listed here are supported; others are not mapped.
mapping_dict = dict(
    bigint='Int64',
    date='Date',
    text='String',
)

In [None]:
def mapping_types(pg_type):
    """Translate a PostgreSQL type name into its ClickHouse type.

    Args:
        pg_type: Key to look up in the module-level ``mapping_dict``.

    Returns:
        The value stored in ``mapping_dict`` for ``pg_type``.

    Raises:
        KeyError: If ``pg_type`` has no entry in ``mapping_dict``.
    """
    clickhouse_type = mapping_dict[pg_type]
    return clickhouse_type

In [None]:
# Add the ClickHouse type for each column by translating its PG type element-wise.
types['clickhouse_type'] = types['data_type'].map(mapping_types)

In [None]:
# Keep only the two columns needed to build the CREATE TABLE statement.
types = types.loc[:, ['column_name', 'clickhouse_type']]

# Create table query for sl_from_pg table

In [None]:
# Assemble the CREATE TABLE statement for maindb.sl_from_pg: a fixed `index`
# Int64 column followed by one nullable column per remaining row of `types`.
# The first row of `types` is skipped because `index` is declared explicitly
# (presumably that row is the source table's own `index` column — confirm).
# Values are read by column label: positional `row[0]` / `row[1]` on a
# label-indexed Series is deprecated in pandas 2.x.
pg_columns = types.iloc[1:]
column_defs = ["`index` Int64"]
column_defs += [
    "{} {} NULL".format(column_name, clickhouse_type)
    for column_name, clickhouse_type in zip(pg_columns['column_name'],
                                            pg_columns['clickhouse_type'])
]
query = ("CREATE TABLE maindb.sl_from_pg ( "
         + ", ".join(column_defs)
         + ') ENGINE = MergeTree ORDER BY tuple()')

In [None]:
# Run the CREATE TABLE statement held in `query` against ClickHouse,
# logging before and after the call.
# NOTE(review): the statement has no IF NOT EXISTS, so re-running this cell
# presumably fails once maindb.sl_from_pg already exists — confirm.
logging.info("Start creating table")
client.command(query)
logging.info("Table created")

# Migration from PG to CH (one month of data)

In [None]:
def get_increment_as_df(table, increment, dfrom, dto, engine):
    """Read one increment of a PostgreSQL table into a DataFrame.

    Selects every row of ``table`` whose ``increment`` column lies between
    ``dfrom`` and ``dto``, both cast to ``date`` (SQL ``BETWEEN`` includes
    both bounds).

    Args:
        table: Name of the source table.
        increment: Name of the column that bounds the increment.
        dfrom: Lower bound as a date string, e.g. ``'2017-05-01'``.
        dto: Upper bound as a date string.
        engine: Connection object handed to ``pd.read_sql``.

    Returns:
        The resulting DataFrame, or ``False`` when the read fails. Callers
        must check for ``False`` before using the result.
    """
    logging.info("Start migration from pg to ch")
    logging.info("Start reading PG {} with {} field from {} to {}".format(table, increment, dfrom, dto))
    # NOTE(review): table, column and dates are interpolated straight into the
    # SQL text; only pass trusted values.
    try:
        df = pd.read_sql("""SELECT *
                            FROM {} 
                            WHERE "{}" BETWEEN '{}'::date AND '{}'::date
                        """.format(table, increment, dfrom, dto), engine)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and record the traceback so the cause of the failure is visible.
        logging.exception("Load failed")
        return False
    else:
        logging.info("DataFrame ready")
        return df

In [None]:
def push_increment_to_clickhouse(df, table, increment, dfrom, dto, client, conn):
    """Replace one increment of a ClickHouse table with the rows of ``df``.

    First deletes the rows of ``maindb.<table>`` whose ``increment`` column is
    between ``dfrom`` and ``dto``, then inserts ``df`` through pandahouse.

    Args:
        df: DataFrame to load. Anything else (e.g. the ``False`` returned by a
            failed ``get_increment_as_df``) aborts before any data is deleted.
        table: Target table name inside the ``maindb`` database.
        increment: Name of the column that bounds the increment.
        dfrom: Lower bound of the increment (string interpolated into SQL).
        dto: Upper bound of the increment (string interpolated into SQL).
        client: Object exposing ``command(sql)``, used for the DELETE.
        conn: pandahouse connection settings passed as ``connection=``.

    Returns:
        None. A failed insert is logged, not raised; an error from the DELETE
        command propagates to the caller.
    """
    # Never delete the existing increment when there is nothing to replace it with.
    if not isinstance(df, pd.DataFrame):
        logging.error("Load failed: no DataFrame to push, existing records left untouched")
        return
    logging.info("Delete existing records in ClickHouse")
    client.command("""ALTER TABLE maindb.{} DELETE WHERE "{}" BETWEEN '{}' AND '{}'""".format(table, increment, dfrom, dto))
    logging.info("Deleted")
    logging.info("Load increment to ClickHouse")
    try:
        ph.to_clickhouse(df, table, connection=conn, index=False)
    except Exception:
        # Still best-effort, but keep the traceback so the failure is diagnosable.
        logging.exception("Load failed")
    else:
        logging.info("Loaded")
        logging.info("End of migration from pg to ch")

In [None]:
# Pull the `ontime` rows with FlightDate between 2017-05-01 and 2017-06-01 from PostgreSQL.
df = get_increment_as_df(
    table='ontime',
    increment='FlightDate',
    dfrom='2017-05-01',
    dto='2017-06-01',
    engine=engine,
)

In [None]:
# Replace the same date range in maindb.sl_from_pg with the rows just read.
push_increment_to_clickhouse(
    df=df,
    table='sl_from_pg',
    increment='FlightDate',
    dfrom='2017-05-01',
    dto='2017-06-01',
    client=client,
    conn=connection,
)

In [None]:
# Release resources at the end of the run.
# `connection` is a plain dict of pandahouse settings and has no close()
# method, so only the ClickHouse client and the SQLAlchemy engine are released.
client.close()
engine.dispose()