## Import modules

In [5]:
import os

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

## Get database credentials

In [6]:
# Read the PostgreSQL connection settings from the process environment so
# that no credentials live in the notebook itself. A variable that is not
# set raises KeyError.
HOST, PORT, USER, PASS, DB = (
    os.environ[variable]
    for variable in (
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_USER',
        'POSTGRES_PASS',
        'POSTGRES_DB',
    )
)

### Build the SQL query that solves the task, send it to the server, and load the result into a pd.DataFrame

In [7]:
def create_connection(db_host, db_port, db_user, db_password, db_name):
    """Open a PostgreSQL connection with psycopg2.

    Args:
        db_host: Server host name or address.
        db_port: Server port.
        db_user: Role to authenticate as.
        db_password: Password for ``db_user``.
        db_name: Name of the database to connect to.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        psycopg2.OperationalError: If the connection cannot be established.
            The error is printed before it is re-raised.
    """
    try:
        return psycopg2.connect(
            host=db_host,
            port=db_port,
            user=db_user,
            password=db_password,
            database=db_name,
        )
    except psycopg2.OperationalError as err:
        # Report the failure and re-raise it. Falling through to a `return`
        # of a local that was never assigned would replace the real cause
        # with an UnboundLocalError.
        print(err)
        raise

In [8]:
# Open the single connection shared by the query cells below.
connection = create_connection(
    db_host=HOST,
    db_port=PORT,
    db_user=USER,
    db_password=PASS,
    db_name=DB,
)

In [9]:
def execute_query(connection, query):
    """Run ``query`` on ``connection`` and return every row of its result.

    Args:
        connection: An open database connection exposing ``cursor()``.
        query: SQL text to execute; it must produce a result set.

    Returns:
        The list of rows from ``cursor.fetchall()``, or ``None`` when the
        query fails with ``psycopg2.OperationalError`` (the error is printed).
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    except psycopg2.OperationalError as err:
        print(err)
        return None
    finally:
        # Release the cursor on success and on failure alike.
        cursor.close()

In [10]:
# Raw queries: every column of each table is fetched.
get_sessions_query = 'SELECT * FROM sessions;'
get_communications_query = 'SELECT * FROM communications;'

# Column labels applied, by position, to the fetched rows.
# NOTE(review): because the queries use `SELECT *`, these lists presumably
# mirror the tables' column order; confirm against the schema.
sessions_columns = ['visitor_session_id', 'site_id', 'visitor_id', 'date_time', 'campaign_id']
communications_columns = ['communication', 'site_id', 'visitor_id', 'date_time']

In [11]:
# Download both tables in full over the shared connection.
sessions = execute_query(connection=connection, query=get_sessions_query)
communications = execute_query(connection=connection, query=get_communications_query)

In [12]:
# Wrap the fetched rows in DataFrames, labelling the columns by position.
sessions_df = pd.DataFrame(sessions, columns=sessions_columns)
communications_df = pd.DataFrame(communications, columns=communications_columns)

In [13]:
# Pair each communication with every session of the same visitor. The left
# join keeps communications whose visitor has no session; columns present in
# both frames get the `_com` / `_ses` suffixes.
merged_df = communications_df.merge(
    sessions_df,
    how='left',
    on='visitor_id',
    suffixes=['_com', '_ses'],
)

In [14]:
# Keep only pairs where the session happened on the communication's site.
# NOTE(review): rows that the left join filled with a missing `site_id_ses`
# never compare equal, so communications without any session are dropped
# here as well. Confirm that is intended.
merged_df = merged_df.query(expr='site_id_ses == site_id_com')

In [15]:
# One group of merged rows per communication.
grouped_by_communication = merged_df.groupby(by='communication')

In [16]:
# Count the merged rows of each communication and attach that count to
# every row of the communication. The counter Series is itself named
# `communication`, which clashes with the frame's column; the following
# cells refer to the resulting `communication_x` / `communication_y` names.
session_counter = grouped_by_communication['communication'].count()
merged_df = merged_df.merge(
    session_counter,
    left_on='communication',
    right_index=True,
)

In [17]:
# Discard the duplicated communication column and the session-side site id
# (equal to the communication-side one after the site filter).
merged_df = merged_df.drop(columns=['communication_x', 'site_id_ses'])

In [18]:
# Give the columns their final names.
# NOTE(review): `visitor_id_x` does not appear to be produced by the merges
# above; `rename` ignores labels that are absent, so that entry is
# presumably a no-op.
merged_df = merged_df.rename(
    columns={
        'communication': 'communication_id',
        'site_id_com': 'site_id',
        'visitor_id_x': 'visitor_id',
        'date_time_com': 'communication_date_time',
        'date_time_ses': 'session_date_time',
        'communication_y': 'row_n',
    }
)

In [28]:
# Build the final table: one row per communication, taken from the sessions
# that started strictly before the communication.
prior_sessions = merged_df.query('communication_date_time > session_date_time')

picked_rows = []
# sort=False keeps the communications in first-appearance order.
for _, comm_sessions in prior_sessions.groupby('communication_id', sort=False):
    if comm_sessions['row_n'].sum() != 1:
        # Most recent prior session of this communication.
        picked = comm_sessions.sort_values('session_date_time', ascending=False).head(1)
    else:
        # A sum of exactly 1 means a single row whose row_n is 1.
        # NOTE(review): the previous expression here
        # (`row_n - 1 if row_n != 1 else pass`) was a syntax error. The
        # recorded output shows row_n == 0 on the blanked rows, so 0 is
        # assigned. Confirm that value, and that blanking the only prior
        # session is intended at all.
        picked = comm_sessions.assign(
            visitor_session_id=np.nan,
            # NaT rather than NaN so the column stays datetime-typed.
            session_date_time=pd.NaT,
            campaign_id=np.nan,
            row_n=0,
        )
    picked_rows.append(picked)

# Concatenate once instead of growing the frame inside the loop; fall back to
# an empty frame with the same columns when no communication qualified.
result_df = pd.concat(picked_rows) if picked_rows else prior_sessions.iloc[:0].copy()
# Nullable integer dtype keeps the ids integral next to missing values.
result_df['visitor_session_id'] = result_df['visitor_session_id'].astype('Int64')
        

In [25]:
# Dimensions of the final table as (rows, columns).
result_df.shape

(24, 8)

In [30]:
# Show the final table ordered by communication id.
result_df.sort_values(by='communication_id')

Unnamed: 0,communication_id,site_id,visitor_id,communication_date_time,visitor_session_id,session_date_time,campaign_id,row_n
20,2056231,16649,1311935382,2018-04-01 00:21:07,,NaT,,0
19,2056235,16649,1311935382,2018-04-01 00:21:49,,NaT,,0
9,2061609,16649,848021975,2018-04-02 09:20:20,2027105873.0,2018-04-02 09:15:08,116083.0,2
22,2067161,16649,1318710746,2018-04-02 22:09:49,2032035085.0,2018-04-02 22:06:33,116083.0,2
24,2067481,30374,1318950710,2018-04-02 23:56:24,,NaT,,0
29,2079051,16649,1325560636,2018-04-04 18:11:59,2043066145.0,2018-04-04 18:01:59,116083.0,2
32,2083191,30374,1328012720,2018-04-05 12:52:27,,NaT,,0
33,2083197,30374,1328012720,2018-04-05 12:53:23,,NaT,,0
34,2084037,30374,1328492094,2018-04-05 14:52:35,,NaT,,0
36,2086624,16649,1329874720,2018-04-05 22:05:46,,NaT,,0
