In [1]:
from cassandra.io.libevreactor import LibevConnection
from cassandra.cluster import Cluster
import pandas as pd
from IPython.display import display
# Widen pandas' display limits so wide/long frames render in full in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Keyspace: v10_stg for staging and v10_prod for prod.
KEYSPACE = 'v10_prod'

# Tables dumped to "<table>.csv" in the current working directory.
tables = ["accounts", "users", "parties", "user_sessions", "party_user", "blocks", "block_user"]

cluster = Cluster(connect_timeout=30)
cluster.connection_class = LibevConnection
try:
    session = cluster.connect()
    session.set_keyspace(KEYSPACE)

    for table in tables:
        # Table names come from the fixed list above, so interpolating them is safe here.
        query = f"SELECT * FROM {table}"
        # .all() materialises the whole result set in memory before building the frame.
        rows = session.execute(query).all()
        df = pd.DataFrame(rows)
        # The DataFrame index is written as an unnamed first column (pandas default);
        # it comes back as "Unnamed: 0" when the file is re-read with read_csv.
        df.to_csv(f"{table}.csv", encoding="utf-8")
finally:
    # Always release the driver's connections, even if a query or a CSV write fails,
    # so a failed run does not leave a live cluster object behind in the kernel.
    cluster.shutdown()

In [2]:
import numpy as np


def _load_csv(path, **read_kwargs):
    """Read one exported table from `path` as UTF-8, forwarding extra options to pandas."""
    return pd.read_csv(path, encoding="utf-8", **read_kwargs)


# Tables without timestamp parsing.
accounts = _load_csv("accounts.csv")
block_user = _load_csv("block_user.csv")
blocks = _load_csv("blocks.csv")
users = _load_csv("users.csv")
user_sessions = _load_csv("user_sessions.csv")

# Tables whose timestamp columns are parsed to datetimes on load.
parties = _load_csv("parties.csv", parse_dates=["begin_time", "end_time", "start_time", "updated_at"])
party_user = _load_csv("party_user.csv", parse_dates=["created_at", "updated_at"])


In [3]:
# List the distinct values present in the parties' status column.
parties["status"].unique()

array(['ended', 'coming', 'ongoing'], dtype=object)

In [62]:
# Show the first five session rows to sanity-check the loaded columns.
display(user_sessions.head(n=5))

Unnamed: 0.1,Unnamed: 0,id,account_id,device_uid,expires_at,user_id
0,0,5cac0722-331b-11ec-9922-5adc50ad8e5b,12b5eb10-331b-11ec-bed2-9a010fca4d91,5bd1fda9e6078acd,2040-10-26 20:15:15.354,30ecc4be-331b-11ec-8b7c-4f667ee55425
1,1,5c0d4952-407c-11ec-ad47-a273009342b9,e5d1182c-323e-11ec-9992-5adc50ad8e5b,f5518b5d5a965c7b,2040-11-12 20:52:20.613,4b96e0f6-323f-11ec-b1ea-94bbbdaee87f
2,2,c1dc3922-3321-11ec-bed2-9a010fca4d91,c1d93510-3321-11ec-bed2-9a010fca4d91,5c91d5f7f73f4172,2040-10-26 21:01:02.100,
3,3,66444430-4c3d-11ec-acc9-9a58024cb2de,1b0f6a64-3313-11ec-b92a-9a010fca4d91,c7dddaf2e30c33c3,2040-11-27 19:51:53.410,3d918522-3313-11ec-8329-feef32fc5b19
4,4,cdc30dc2-4bd3-11ec-ba2f-c2804d2a32b1,aa734a86-4bd2-11ec-9500-c687132219c8,739FDFBD-626A-4B30-A774-2658C8853299,2040-11-27 07:16:00.393,26b303f2-4bd3-11ec-87c1-e517be6df39b


In [4]:

# Party-side columns, prefixed so they stay distinguishable after the join.
party_side = parties[
    ["id", "begin_time", "end_time", "creator_id", "start_time", "status", "updated_at"]
].rename(columns={
    "id": "party_id",
    "begin_time": "party_begin_time",
    "end_time": "party_end_time",
    "start_time": "party_start_time",
    "status": "party_status",
    "updated_at": "party_updated_at",
})

# Membership-side columns; the join key gets a trailing underscore so both keys survive the merge.
member_side = party_user[
    ["party_id", "user_id", "attendance_status", "created_at", "role", "updated_at"]
].rename(columns={
    "party_id": "party_id_",
    "created_at": "user_created_at",
    "updated_at": "user_updated_at",
})

# Outer join: keeps parties with no membership rows and membership rows with no party.
dff = party_side.merge(member_side, how="outer", left_on="party_id", right_on="party_id_")

In [5]:
# User lookup; its key is renamed so it can be dropped after the join.
user_lookup = users[["id", "account_id", "username"]].rename(columns={"id": "user_id_"})

# Left join keeps every party/membership row, adding account_id and username where the user exists.
# NOTE(review): for membership rows that matched no party, party_id_ holds the only
# party identifier -- confirm that dropping it here is intended.
dff_ = (
    dff.merge(user_lookup, how="left", left_on="user_id", right_on="user_id_")
    .drop(columns=["party_id_", "user_id_"])
)


In [6]:
# Seconds elapsed between a party's begin time and its last update.
# NOTE(review): uses party_updated_at rather than party_end_time as the end point -- confirm this is intended.
elapsed = dff_["party_updated_at"] - dff_["party_begin_time"]
dff_["party_duration"] = elapsed.dt.total_seconds()

In [7]:

# Add the calendar date on which each party began, then order rows per user and day.
sort_keys = ["username", "start_date", "party_updated_at", "role", "user_updated_at"]
dff_ = (
    dff_
    .assign(start_date=dff_["party_begin_time"].dt.date)
    .sort_values(by=sort_keys)
)

In [8]:
# Add each account's contact details; the accounts key column "id" remains in the result.
# NOTE: this cell rebinds dff_, so running it twice joins the accounts columns a second time.
account_contacts = accounts[["id", "email", "phone"]]
dff_ = dff_.merge(account_contacts, how="left", left_on="account_id", right_on="id")

# Persist the combined frame (the DataFrame index is written as the first column).
dff_.to_csv("dff_.csv", encoding="utf-8")