In [1]:
# Importing Necessary dependencies
import pandas as pd
import snowflake.connector as sf
import numpy as np
import time
import boto3
import os
import zipfile
import smtplib, ssl
import shutil
import csv
from snowflake.connector.pandas_tools import write_pandas

In [3]:
#Create Log for review
start_time = time.time()
claimslog = []

In [4]:
try:
    #Create Local Directory to store files in temporarily
    os.makedirs('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
    os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
    root_directory = os.getcwd()
    claimslog.append('Successfully created CTL File Temporary Folder at ' + time.strftime('%Y-%m-%d %H:%M:%S',
                                                                                          time.localtime(time.time())))
except Exception as e:
    claimslog.append('There was an error with creating the temporary CTL File - ' + str(e))
    print('Successfully created CTL File Temporary Folder at ' + time.strftime('%Y-%m-%d %H:%M:%S',
                                                                               time.localtime(time.time())))

Successfully created CTL File Temporary Folder at 2022-11-18 09:21:55


In [5]:
try:
    #Create Connection Object for Connecting to AWS
    s3 = boto3.resource(
        service_name='s3',
        region_name='us-east-1',
        aws_access_key_id=os.getenv('aws_access_key_id'),
        aws_secret_access_key=os.getenv('aws_secret_access_key'))
    print('AWS connection object created at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    claimslog.append(
        'AWS connection object created at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
except Exception as e:
    print('There was an error with creating the AWS connection object - ' + str(e))
    claimslog.append('There was an error with creating the AWS connection object - ' + str(e))

AWS connection object created at 2022-11-18 09:21:59


In [6]:
#Create the file name format for locating the proper CTL files to parse

filename_format_list = ['CL_VESTA_Claims_Monthly_202211081555.txt', 'CL_VESTA_Enrollment_202211081449.txt', 'CL_VESTA_Pharmacy_Monthly_202211081603.txt']
claimslog.append('Looking for CTL Files that start like ' + " and ".join(filename_format_list) + ' at ' + time.strftime(
    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
print('Looking for CCA Files that start like ' + " and ".join(filename_format_list) + ' at ' + time.strftime(
    '%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

Looking for CCA Files that start like CL_VESTA_Claims_Monthly_202211081555.txt and CL_VESTA_Enrollment_202211081449.txt and CL_VESTA_Pharmacy_Monthly_202211081603.txt at 2022-11-18 09:45:09


In [7]:
print(filename_format_list)

['CL_VESTA_Claims_Monthly_202211081555.txt', 'CL_VESTA_Enrollment_202211081449.txt', 'CL_VESTA_Pharmacy_Monthly_202211081603.txt']


In [8]:
#Creating a list to store all the keys (file names) to download

key_list = []

try:
    #Searching the S3 bucket for the most current Ping Files
    for obj in s3.Bucket('hometeam-clinical-data').objects.all():
        for filename_format in filename_format_list:
            if filename_format in str(obj):
                #print(obj.key)
                key_list.append(obj.key)

except Exception as e:
    claimslog.append('There was an error while looking for most CCA Files - ' + str(e))

In [9]:
try:
    #Downloading each of the files found in the key list
    for file in key_list:
        s3.Bucket('hometeam-clinical-data').download_file(file, file.split('/')[2])
        print(
            'Files were successfully downloaded at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    claimslog.append(
        'Files were successfully downloaded at ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
except Exception as e:
    claimslog.append('There was an error while trying to download the CCA Files - ' + str(e))

Files were successfully downloaded at 2022-11-18 09:46:22
Files were successfully downloaded at 2022-11-18 09:46:34
Files were successfully downloaded at 2022-11-18 09:47:03


In [10]:
#On local computer, change directory and set directory for unzipping of files.
os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
root_directory = os.getcwd()

In [13]:
%time
#Create dictionary to store dataframes as they are created
df_dict = {}
error_dict = {}

#Set current directory
cwd = os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')

#Loop through all txt files in the directory
for i, file in enumerate(os.listdir(cwd)):
    if '.txt' in file:

        #empty lists to story the data while cleaning
        df_list = []
        df_error_list = []

        #open the txt file
        with open(file, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')

            #read through each line and find any rows with errors
            for i, row in enumerate(reader):
                if i == 0:
                    #capture the correct number of columns for the dataframe
                    correct_columns = len(row)

                df_list.append(row)

                #create list of rows with errors
                if len(row) < correct_columns:
                    df_error_list.append(i)

            #Check to see if the list row in the data frame is an empty row, if so, drop it
            if len(df_list[df_error_list[-1]]) == 0:
                df_error_list.pop()

            #The error exists between two rows, so looking at the second occurance of an error
            #and deleting the first item should fix the error
            for i, error in enumerate(df_error_list):
                if i % 2 != 0:
                    df_list[error].pop(0)

            #Loop back through the error list and join first errors to second errors to make a complete row
            for i, error in enumerate(df_error_list):
                if i % 2 == 0:
                    df_list[error] = df_list[error] + df_list[error + 1]

            #Loop back through the entire data frame list to delete those rows that are smaller than the correct
            #number of columns
            for i, item in enumerate(df_list):
                if len(item) < correct_columns:
                    del df_list[i]

            df = pd.DataFrame(df_list[1:])
            df.columns = df_list[0]
            df = df.rename(columns={df.columns[0]: df.columns[0][3:]})
            df = df.astype(str)
            df.columns = map(lambda x: str(x).upper(), df.columns)
            df_dict[csvfile.name.replace(' ', '_').split(".")[0]] = df
            error_dict[csvfile.name.replace(' ', '_').split(".")[0]] = df_error_list

        csvfile.close()

for key, value in df_dict.items():
    print(key)

print(error_dict)

CPU times: total: 0 ns
Wall time: 0 ns


IndexError: list index out of range

In [None]:
#Delete all contents in the temporary CCA Folder
os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta')
shutil.rmtree('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
claimslog.append('Successfully Deleted all contents in temporary CTL Folder at ' + time.strftime('%Y-%m-%d %H:%M:%S',
                                                                                                 time.localtime(
                                                                                                     time.time())))

In [None]:
df_dict[]

In [6]:


# Create Connection Object for Connecting to AWS
def AWSConnection():
    try:
        s3 = boto3.resource(
            service_name='s3',
            region_name='us-east-1',
            aws_access_key_id=os.getenv('aws_access_key_id'),
            aws_secret_access_key=os.getenv('aws_secret_access_key'))
        claimslog.append('AWS connection object created at '
                         + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        return s3
    except Exception as e:
        claimslog.append('There was an error with creating the AWS connection object - ' + str(e))

    # Search S3 Bucket and Download Files Locally


def S3SearchAndDownload(connection, filename_list):
    #Creating a list to store all the keys (file names) to download
    key_list = []

    try:
        #Searching the S3 bucket for the most current Ping Files
        for obj in connection.Bucket('hometeam-clinical-data').objects.all():
            for filename_format in filename_format_list:
                if filename_format in str(obj):
                    #print(obj.key)
                    key_list.append(obj.key)

    except Exception as e:
        claimslog.append('There was an error while looking for most CCA Files - ' + str(e))

    try:
        #Downloading each of the files found in the key list
        for file in key_list:
            s3.Bucket('hometeam-clinical-data').download_file(file, file.split('/')[2])
        claimslog.append('Files were successfully downloaded at '
                         + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    except Exception as e:
        claimslog.append('There was an error while trying to download the CTL Files - ' + str(e))


# Unzipping files 
def UnzipFiles():
    # On local computer, change directory and set directory for unzipping of files.
    os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
    root_directory = os.getcwd()

    # Locate only Zipped Files
    files_to_unzip = []
    for filename in os.listdir(root_directory):
        if 'zip' in filename:
            files_to_unzip.append(filename)
    try:
        # Unzip each file in the Zipped files list
        for zipped_file in files_to_unzip:
            with zipfile.ZipFile(root_directory + "\\" + zipped_file, 'r') as zip_ref:
                # print(zipped_file)
                zip_ref.extractall(root_directory)
        shutil.unpack_archive(root_directory + "\\" + zipped_file, root_directory + "\\" + zipped_file.split('.')[0])
        claimslog.append('Successfully Unzipped each file at '
                         + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    except Exception as e:
        claimslog.append('There was an error while trying to unzip each file - ' + str(e))


# CTL Files Dictionary
def CreateDFDict():
    # Create dictionary to store dataframes as they are created 
    df_dict = {}
    error_dict = {}

    # Set current directory
    cwd = os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')

    # Loop through all txt files in the directory
    for i, file in enumerate(os.listdir(cwd)):
        if '.txt' in file:

            # Empty lists to story the data while cleaning
            df_list = []
            df_error_list = []

            # Open the txt file
            with open(file, 'r') as csvfile:
                reader = csv.reader(csvfile, delimiter='|')

                # Read through each line and find any rows with errors
                for i, row in enumerate(reader):
                    if i == 0:
                        # Capture the correct number of columns for the dataframe
                        correct_columns = len(row)

                    df_list.append(row)

                    # Create list of rows with errors
                    if len(row) < correct_columns:
                        df_error_list.append(i)

                # Review error list
                if df_error_list:

                    # Check to see if the list row in the data frame is an empty row, if so, drop it
                    if len(df_list[df_error_list[-1]]) == 0:
                        df_error_list.pop()

                    # The error exists between two rows, so looking at the second occurance of an error 
                    # and deleting the first item should fix the error
                    for i, error in enumerate(df_error_list):
                        if i % 2 != 0:
                            df_list[error].pop(0)

                    # Loop back through the error list and join first errors to second errors to make a complete row
                    for i, error in enumerate(df_error_list):
                        if i % 2 == 0:
                            df_list[error] = df_list[error] + df_list[error + 1]

                # Loop back through the entire data frame list to delete those rows that are smaller than the correct
                # number of columns
                for i, item in enumerate(df_list):
                    if len(item) < correct_columns:
                        del df_list[i]

                df = pd.DataFrame(df_list[1:])
                df.columns = df_list[0]
                df = df.astype(str)
                df.columns = map(lambda x: str(x).upper(), df.columns)
                df_dict[csvfile.name.replace(' ', '_').split(".")[0]] = df
                error_dict[csvfile.name.replace(' ', '_').split(".")[0]] = df_error_list

            csvfile.close()

    for key, value in df_dict.items():
        print(key)

    print(error_dict)

    return df_dict


# Delete all files in local folder
def FolderDeletion():
    os.chdir('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
    shutil.rmtree('C:\\Users\\Jad Driggers\\Documents\\Vesta\\CTLFILES')
    claimslog.append('Successfully Deleted all contents in temporary CTL Folder at ' +
                     time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))


# Create a dictionary that stores the columns in the data frame and the max length of the values in those columns
# in order to create table in snowflake to minimize table size
def SQLTableStructure():
    #Create two dictionaries to store the columns and the max len of values in those columns
    max_col_len = {}
    col_dict = {}

    #Vectorizing the lenth function
    measurer = np.vectorize(len)

    #Looping through df_dictionary to capture column names and max len of values in those columns
    max_col_len = {}
    for key, value in df_dict.items():
        col_len = measurer(df_dict[key].astype(str)).max(axis=0)
        max_col_len[key] = col_len
        col_dict[key] = df_dict[key].columns.tolist()

    #Function for joining the two dictionaries with similar keys (claim files)    
    def common_entries(*dcts):
        if not dcts:
            return
        for i in set(dcts[0]).intersection(*dcts[1:]):
            yield (i,) + tuple(d[i] for d in dcts)

    mylist = list(common_entries(col_dict, max_col_len))

    #Creating new dictionary and zipping the column names with respective max len of values in those columns
    sql_dict = {}
    for x in mylist:
        sql_dict[x[0]] = list(zip(x[1], x[2]))

    #Iterating through the list values to prep for SQL to Snowflake
    sql_script_dict_table = {}
    for key, value in sql_dict.items():
        script_string_table = ''
        for (col, max_len) in sql_dict[key]:
            script_string_table += str(col) + ' VARCHAR(' + str(max_len + 10) + '),'
        sql_script_dict_table[key] = "(" + script_string_table[:-1] + ")"

    return sql_script_dict_table


# Create Snowflake Object
def SnowflakeConnection():
    #Creating of parameters for securing connection to Snowflake
    username = os.getenv('Snowflake_User')
    password = os.getenv('Snowflake_password')
    account = os.getenv('Snowflake_account')


    #Define parameters if neccessary
    warehouse = 'DEVELOPER_STANDARD'
    database = 'VESTA_DEVELOPMENT'
    schema = 'ANALYST_SANDBOX'

    #Create connection object for Snowflake connection
    conn = sf.connect(user=username, password=password, account=account, warehouse=warehouse)
    return conn


# Snowflake execution function
def execute_query(connection, query):
    cursor = connection.cursor()
    cursor.execute(query)
    cursor.close

In [None]:
#Create Log for review
start_time = time.time()
claimslog = []

# Create local folder
FolderCreation()

# Create AWS connection object
s3 = AWSConnection()

# Create the filename format for locating the proper CTL files to parse
filename_format_list = ['CL_VESTA_Claims_Monthly_202203081839.txt',
                        'CL_Vesta_Pharmacy_Catchup_202203081328.txt',
                        'CL_VESTA_Enrollment_202203081316.txt']

# Search AWS for files and download locally
S3SearchAndDownload(s3, filename_format_list)

# Not neccessary with CTL
# UnzipFiles()

# Create dataframe dictionary of CTL files
df_dict = CreateDFDict()

# Delete all local CTL files and local CTL folder
FolderDeletion()

# Detemine the table structure and size for Snowflake
sql_table = SQLTableStructure()

# # Creating of parameters for securing connection to Snowflake
# username =
# password =
# account =

# # Define parameters if neccessary
# warehouse =
# database =
# schema =

# # Create connection object for Snowflake connection
# conn = sf.connect(user=username, password=password, account=account, warehouse=warehouse)

# # Define Database to use in Snowflake
# sql = 'USE DATABASE {}'.format(database)
# execute_query(conn,sql)

# # Define Schema to use in Snowflake
# sql = 'USE SCHEMA {}.{}'.format(database,schema)
# execute_query(conn,sql)

# # Define Warehouse to use in Snowflake
# sql = 'USE WAREHOUSE {}'.format(warehouse)
# execute_query(conn,sql)

# # Create CTL_ENROLL_RAW
# try:
#     sql = 'CREATE TABLE IF NOT EXISTS CTL_ENROLL_RAW_TEST ' + sql_table['CL_VESTA_Enrollment_202203081316']
#     execute_query(conn,sql)
#     success, nchucks, nrows, _ = write_pandas(conn,df_dict['CL_VESTA_Enrollment_202203081316'],'CTL_ENROLL_RAW_TEST')
# except Exception as e:
#         print(e)  




In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
df_dict['CL_VESTA_Claims_Monthly_202203081839'][df_dict['CL_VESTA_Claims_Monthly_202203081839']['PROCEDURE_CODE'].str.contains('0013A')]

In [None]:
claimslog

In [None]:
my_string = '\n'.join(claimslog)

port = 465  # For SSL
smtp_server = "smtp.gmail.com"
sender_email = "VestaPingLog@gmail.com"  # Enter your address
receiver_email_list = ["jdriggers@vestahealthcare.com", "john@vestahealthcare.com",
                       'joe@vestahealthcare.com']  # Enter receiver address
password = os.getenv('Vesta_Ping_Log_Email')
message = "Subject: Ping Logs \n" + '''
             
''' + my_string

context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
    server.login(sender_email, password)
    for receiver_email in receiver_email_list:
        server.sendmail(sender_email, receiver_email, message)