In [1]:
import os
from datetime import datetime, timezone
from urllib.parse import quote

import pandas as pd
from dotenv import dotenv_values
from sqlalchemy import create_engine

In [2]:
def get_latest_file(folder_path):
    """Return the path of the most recently modified regular file in a folder.

    Sub-directories are ignored: ``os.listdir`` returns them as well, and a
    recently touched sub-folder would otherwise be picked as the "latest file".

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        ``os.path.join(folder_path, name)`` of the file with the greatest
        modification time, or ``None`` when the folder contains no files.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be listed.
    """
    candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    files = [path for path in candidates if os.path.isfile(path)]

    # `default=None` preserves the "nothing found" contract for an empty list.
    return max(files, key=os.path.getmtime, default=None)

def read_latest_file_into_dataframe(folder_path):
    """Load the newest file of a folder into a pandas DataFrame.

    The file is chosen by ``get_latest_file``. Files ending in ``.csv`` are
    read with ``pd.read_csv``; files ending in ``.xls`` / ``.xlsx`` with
    ``pd.read_excel`` (the extension check is case-insensitive).

    Args:
        folder_path: Directory to look in.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when the
        folder yields no file or the newest file has another extension.
    """
    newest_path = get_latest_file(folder_path)

    # Guard: nothing to read.
    if not newest_path:
        print(f"No files found in {folder_path}")
        return None

    extension_probe = newest_path.lower()

    if extension_probe.endswith('.csv'):
        return pd.read_csv(newest_path)

    if extension_probe.endswith(('.xls', '.xlsx')):
        return pd.read_excel(newest_path)

    print(f"Unsupported file format for {newest_path}")
    return None

In [3]:
# Folder that is scanned for the newest repayment extract
folder_path = r"C:\Users\Temidayo\Desktop\Test_Question\Repayment_Folder"
latest_df = read_latest_file_into_dataframe(folder_path)

# The loader returns None when the folder has no file or the newest file is
# not CSV/Excel. Stop here with a clear message instead of letting the
# preview below fail with "'NoneType' object has no attribute 'head'".
if latest_df is None:
    raise FileNotFoundError(f"No CSV/Excel file could be loaded from {folder_path}")

print("DataFrame created from the latest file: File Available")

latest_df.head(10)

DataFrame created from the latest file: File Available


Unnamed: 0,loan_id(fk),payment_id(pk),Amount_paid,Date_paid
0,32u09wekjbfje,3434r409kmPAID123456,03/31/2021,100790.3333
1,32u09wekjbfje,3434r409kmPAID123457,03/31/2021,100790.3333
2,32u09wekjbfje,3434r409kmPAID123458,04/27/2021,100790.3333
3,32u09wekjbfje,3434r409kmPAID123459,05/27/2021,100790.3333
4,32u09wekjbfje,3434r409kmPAID123460,06/27/2021,100790.3333
5,32u09wekjbfje,3434r409kmPAID123461,07/31/2021,100790.3333
6,32u09wekjbfje,3434r409kmPAID123462,10/31/2021,100790.3333
7,32u09wekjbfje,3434r409kmPAID123463,10/31/2021,100790.3333
8,32u09wekjbfje,3434r409kmPAID123464,10/27/2021,100790.3333
9,32u09wekjbfje,3434r409kmPAID123465,11/27/2021,100790.3333


In [4]:
def transformation(latest_df):
    """Return a cleaned copy of the raw repayment frame.

    Steps:
      1. Rename the source headers to ``loan_id``, ``payment_id``,
         ``date_paid`` and ``amount_paid``.
      2. Parse ``date_paid`` (``MM/DD/YYYY`` strings) to datetimes normalised
         to midnight. Values that do not match the format become ``NaT``
         (``errors='coerce'``).

    The input frame is not modified; a new DataFrame is returned, so the cell
    that calls this function can be re-run against the untouched raw data.

    Args:
        latest_df: Raw DataFrame as read from the repayment file.

    Returns:
        A new DataFrame with renamed columns and a datetime ``date_paid``.
        If the conversion raises, the error is printed and ``date_paid`` is
        left as it was after the rename.
    """
    # Step 1: Rename columns.
    # The 'Amount_paid' <-> 'Date_paid' swap is intentional: in the source
    # preview the column headed 'Amount_paid' holds the dates and the column
    # headed 'Date_paid' holds the amounts.
    renamed_df = latest_df.rename(columns={
        'loan_id(fk)': 'loan_id',
        'payment_id(pk)': 'payment_id',
        'Amount_paid': 'date_paid',
        'Date_paid': 'amount_paid'
    })

    # Step 2: Convert 'date_paid' to datetime with the time set to 00:00:00
    if 'date_paid' in renamed_df.columns:
        try:
            parsed_dates = pd.to_datetime(renamed_df['date_paid'], format='%m/%d/%Y', errors='coerce')
            renamed_df['date_paid'] = parsed_dates.dt.normalize()

            print("Column 'date_paid' converted to datetime with time component.")
        except Exception as e:
            print(f"Error converting 'date_paid': {str(e)}")

    return renamed_df

# Run the transformation on the loaded frame
transformed_df = transformation(latest_df)

# Preview: first ten values of 'date_paid'
print(transformed_df.loc[:, 'date_paid'].head(10))

# Dtype of 'date_paid' after conversion
print(transformed_df['date_paid'].dtypes)

# Number of null entries in 'date_paid'
print(transformed_df['date_paid'].isna().sum())

# First value formatted with millisecond precision (last 3 microsecond digits trimmed)
print(transformed_df['date_paid'].iat[0].strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])

Column 'date_paid' converted to datetime with time component.
0   2021-03-31
1   2021-03-31
2   2021-04-27
3   2021-05-27
4   2021-06-27
5   2021-07-31
6   2021-10-31
7   2021-10-31
8   2021-10-27
9   2021-11-27
Name: date_paid, dtype: datetime64[ns]
datetime64[ns]
0
2021-03-31 00:00:00.000


In [5]:
# Print the column names, non-null counts and dtypes of the transformed frame
transformed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   loan_id      84 non-null     object        
 1   payment_id   84 non-null     object        
 2   date_paid    84 non-null     datetime64[ns]
 3   amount_paid  84 non-null     float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 2.8+ KB


In [6]:
# Preview the first five rows of the transformed frame
transformed_df.head()

Unnamed: 0,loan_id,payment_id,date_paid,amount_paid
0,32u09wekjbfje,3434r409kmPAID123456,2021-03-31,100790.3333
1,32u09wekjbfje,3434r409kmPAID123457,2021-03-31,100790.3333
2,32u09wekjbfje,3434r409kmPAID123458,2021-04-27,100790.3333
3,32u09wekjbfje,3434r409kmPAID123459,2021-05-27,100790.3333
4,32u09wekjbfje,3434r409kmPAID123460,2021-06-27,100790.3333


In [7]:
# Adding the ingestion_date column with the current UTC timestamp.
# `datetime.utcnow()` is deprecated since Python 3.12. The expression below
# takes the aware UTC "now" and drops the tzinfo, which gives the same naive
# UTC value, so the column stays timezone-naive as before.
transformed_df['ingestion_date'] = datetime.now(timezone.utc).replace(tzinfo=None)

# Display the updated DataFrame
transformed_df.head()

Unnamed: 0,loan_id,payment_id,date_paid,amount_paid,ingestion_date
0,32u09wekjbfje,3434r409kmPAID123456,2021-03-31,100790.3333,2024-08-06 19:37:06.368842
1,32u09wekjbfje,3434r409kmPAID123457,2021-03-31,100790.3333,2024-08-06 19:37:06.368842
2,32u09wekjbfje,3434r409kmPAID123458,2021-04-27,100790.3333,2024-08-06 19:37:06.368842
3,32u09wekjbfje,3434r409kmPAID123459,2021-05-27,100790.3333,2024-08-06 19:37:06.368842
4,32u09wekjbfje,3434r409kmPAID123460,2021-06-27,100790.3333,2024-08-06 19:37:06.368842


In [8]:
# Step 3: Load environment variables from .env file
# `env_dir` is the directory that contains the `.env` file; it is passed to
# load_data_to_sql further down.
# NOTE(review): `env_values` is not referenced again in the visible cells --
# get_db_credentials reads the same file itself. Confirm before removing.
env_dir = r'C:\Users\Temidayo\Desktop\Test_Question\Credentials'
env_values = dotenv_values(os.path.join(env_dir, '.env'))

In [9]:
# Function to load the database credentials from the .env file
def get_db_credentials(env_dir):
    """Read the SQL Server connection settings from ``<env_dir>/.env``.

    Args:
        env_dir: Directory containing the ``.env`` file.

    Returns:
        dict with the keys ``sql_server``, ``sql_database``, ``sql_username``
        and ``sql_password`` (read from the file) plus ``sql_driver`` (fixed
        to ``'ODBC Driver 17 for SQL Server'``).

    Raises:
        ValueError: If any of the four keys is absent from the file. Only the
            key names are reported, never their values.
    """
    env_values = dotenv_values(os.path.join(env_dir, '.env'))

    required_keys = ("sql_server", "sql_database", "sql_username", "sql_password")

    # Fail here with a readable message; otherwise a missing key would reach
    # the connection URL as the literal text "None".
    missing_keys = [key for key in required_keys if env_values.get(key) is None]
    if missing_keys:
        raise ValueError(
            f"Missing keys in {os.path.join(env_dir, '.env')}: {', '.join(missing_keys)}"
        )

    credentials = {key: env_values[key] for key in required_keys}
    credentials['sql_driver'] = 'ODBC Driver 17 for SQL Server'
    return credentials

In [10]:
# Function to load DataFrame to SQL database
def load_data_to_sql(transformed_df, table_name, env_dir, schema_name='autocheck'):
    """Append a DataFrame to a SQL Server table through SQLAlchemy/pyodbc.

    Credentials come from ``get_db_credentials(env_dir)``. The rows are written
    with ``if_exists='append'`` and without the DataFrame index.

    Any exception is caught and printed, so the function always returns
    ``None``; check the printed message to tell success from failure.

    NOTE(review): because the write is an append, calling this twice with the
    same frame sends the same rows twice. Whether the database rejects them
    depends on table constraints not visible here.

    Args:
        transformed_df: Frame to write (must provide ``to_sql``).
        table_name: Target table name.
        env_dir: Directory containing the ``.env`` file with credentials.
        schema_name: Target schema; defaults to ``'autocheck'``.
    """
    engine = None
    try:
        credentials = get_db_credentials(env_dir)

        # Percent-encode the parts that may contain URL-reserved characters
        # ('@', ':', '/', spaces, ...). Unescaped, a password such as "p@ss"
        # makes the URL parser split the host at the wrong '@'.
        username = quote(credentials['sql_username'], safe='')
        password = quote(credentials['sql_password'], safe='')
        driver = quote(credentials['sql_driver'], safe='')

        connection_string = (
            f"mssql+pyodbc://{username}:{password}"
            f"@{credentials['sql_server']}/{credentials['sql_database']}"
            f"?driver={driver}"
        )
        engine = create_engine(connection_string)

        # Append DataFrame to SQL table
        transformed_df.to_sql(table_name, engine, schema=schema_name, if_exists='append', index=False)

        print(f"Data appended to Azure SQL Database table {schema_name}.{table_name} successfully.")
    except Exception as e:
        print(f"Error writing to SQL Database: {str(e)}")
    finally:
        # Release pooled connections; the engine is not reused after this call.
        if engine is not None:
            engine.dispose()

# Check if DataFrame is available before attempting to load it to SQL.
# NOTE: load_data_to_sql writes with if_exists='append', so every run of this
# cell issues another append of the same frame.
if transformed_df is not None:
    table_name = 'repayment'  # Target table; the schema comes from load_data_to_sql's default
    load_data_to_sql(transformed_df, table_name, env_dir)

Data appended to Azure SQL Database table autocheck.repayment successfully.
