In [1]:
import pandas as pd
import chardet
import ibm_db
import os
import json


def get_csv_file_size(file_path):
    """Report the on-disk size of a file in three units.

    Args:
        file_path: Path of the file to measure.

    Returns:
        A ``(bytes, kilobytes, megabytes)`` tuple, where the kilobyte and
        megabyte figures are floats obtained by successive division by 1024.
        ``None`` if the size could not be determined (the error is printed).
    """
    try:
        size_in_bytes = os.path.getsize(file_path)
    except Exception as e:
        print(f"Error getting file size: {e}")
        return None

    # Derive the larger units step by step: bytes -> KB -> MB.
    size_in_kb = size_in_bytes / 1024
    size_in_mb = size_in_kb / 1024
    return size_in_bytes, size_in_kb, size_in_mb
    

def get_file_encoding(file_path):
    """Guess the text encoding of a file using ``chardet``.

    The entire file is read in binary mode and handed to
    ``chardet.detect``; only the ``'encoding'`` entry of its result is used.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value of ``result['encoding']`` from chardet, or ``None`` if any
        error occurs while reading or detecting (the error is printed).
    """
    try:
        with open(file_path, 'rb') as handle:
            # Whole-file read: detection sees every byte, not just a sample.
            raw_bytes = handle.read()
        detection = chardet.detect(raw_bytes)
        return detection['encoding']
    except Exception as e:
        print(f"Error detecting file encoding: {e}")
        return None


def get_csv_metadata(file_path):
    """Load a CSV with pandas and summarise its structure.

    Args:
        file_path: Path of the CSV file to describe.

    Returns:
        A dict with the keys ``column_names``, ``data_types``, ``encoding``,
        ``file_size``, ``num_rows`` and ``num_columns``; ``encoding`` and
        ``file_size`` come from ``get_file_encoding`` and
        ``get_csv_file_size``. ``None`` if anything fails (the error is
        printed).
    """
    try:
        frame = pd.read_csv(file_path)
        row_count, column_count = frame.shape

        # Built inline so the helper calls run in the same order as the keys.
        return {
            'column_names': frame.columns.tolist(),
            'data_types': frame.dtypes.tolist(),
            'encoding': get_file_encoding(file_path),
            'file_size': get_csv_file_size(file_path),
            'num_rows': row_count,
            'num_columns': column_count,
        }
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None



# Describe the sample CSV and report whether it could be parsed.
csv_file_path = "data.csv"
metadata = get_csv_metadata(csv_file_path)
if not metadata:
    # get_csv_metadata returns None when reading the file fails.
    print("CSV file is corrupted or cannot be read.")
else:
    print("CSV file is valid. Metadata:")
    print(metadata)

CSV file is valid. Metadata:
{'column_names': ['year', 'industry_code_ANZSIC', 'industry_name_ANZSIC', 'rme_size_grp', 'variable', 'value', 'unit'], 'data_types': [dtype('int64'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O')], 'encoding': 'UTF-8-SIG', 'file_size': (1492633, 1457.6494140625, 1.4234857559204102), 'num_rows': 17028, 'num_columns': 7}


In [None]:
def get_config_object(file_name='config.json'):
    """Load a JSON configuration file and return its parsed content.

    Args:
        file_name: Path of the JSON file to read. Defaults to
            ``'config.json'`` in the current working directory.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Open the path the caller asked for; the parameter used to be ignored
    # in favour of a hard-coded 'config.json'.
    with open(file_name, 'r') as file:
        config = json.load(file)
    return config

In [None]:
def store_df_to_db2(df, table_name):
    """Append a DataFrame to a DB2 table using settings from the config file.

    Connection settings are read from the ``'database'`` section of the
    object returned by ``get_config_object()`` (keys ``database_name``,
    ``host``, ``port``, ``username``, ``password``).

    Args:
        df: Object exposing ``to_sql`` (a pandas DataFrame).
        table_name: Name of the target table; rows are appended.

    Raises:
        Whatever ``ibm_db.connect`` or ``df.to_sql`` raise. The connection is
        closed before an insert error propagates.
    """
    config = get_config_object()
    db_settings = config['database']

    # Connection parameters
    database_name = db_settings['database_name']
    hostname = db_settings['host']
    port = db_settings['port']
    protocol = "TCPIP"
    user = db_settings['username']
    password = db_settings['password']

    # Connection string (doubled braces render as literal { } around the driver name)
    dsn = (
        "DRIVER={{IBM DB2 ODBC DRIVER}};"
        "DATABASE={0};"
        "HOSTNAME={1};"
        "PORT={2};"
        "PROTOCOL={3};"
        "UID={4};"
        "PWD={5};"
    ).format(database_name, hostname, port, protocol, user, password)

    # Establish the database connection
    conn = ibm_db.connect(dsn, "", "")

    # Check if the connection is successful
    if not conn:
        print("Failed to connect to the database")
        return

    print("Connected to the database")
    try:
        # NOTE(review): pandas documents `con` as a SQLAlchemy connectable or
        # a sqlite3 connection; a raw ibm_db handle is passed here — confirm
        # this works, or switch to a SQLAlchemy engine for DB2.
        df.to_sql(table_name, conn, if_exists='append', index=False)
    finally:
        # Always release the connection, even when the insert fails.
        ibm_db.close(conn)
        print("Connection closed")

In [None]:
from ftplib import FTP
import pandas as pd
import io


def read_csv_from_ftp_and_delete(host, username, password, directory):
    """Read every file in an FTP directory as CSV, then delete it remotely.

    Each name returned by ``nlst()`` is downloaded into memory, parsed with
    ``pd.read_csv``, printed, and then deleted from the server.

    Args:
        host: FTP server host name.
        username: Login user name.
        password: Login password.
        directory: Remote directory to change into before listing.

    Returns:
        A list of DataFrames, one per file, in listing order. ``None`` if any
        error occurs (the error is printed); files processed before the error
        have already been deleted from the server.
    """
    try:
        df_list = []
        # The context manager ends the FTP session even when a step below
        # raises, so a failed download or parse no longer leaks the connection.
        with FTP(host) as ftp:
            ftp.login(username, password)
            ftp.cwd(directory)

            for filename in ftp.nlst():
                # Download the file into an in-memory buffer.
                csv_data = io.BytesIO()
                ftp.retrbinary(f'RETR {filename}', csv_data.write)
                csv_data.seek(0)  # Rewind so pandas reads from the start

                df = pd.read_csv(csv_data)
                df_list.append(df)

                # Process the data as needed
                print(df)

                # Only delete remotely once the file has been parsed.
                ftp.delete(filename)
                print(f"File '{filename}' deleted successfully.")

        return df_list

    except Exception as e:
        print(f"Error: {e}")
        return None


# Example usage (placeholder values — replace before running; prefer reading
# real credentials from the environment rather than hard-coding them):
host = 'ftp.example.com'
username = 'your_username'
password = 'your_password'
# The function changes into this directory and reads every file in it, so a
# directory path is required here, not a single file name.
directory = '/path/to/directory'

read_csv_from_ftp_and_delete(host, username, password, directory)

In [None]:
# Concatenate along rows (axis=0); ignore_index=True renumbers the combined rows.
# NOTE(review): df1 and df2 are not defined in any cell visible here — confirm
# they are created before this cell runs on a fresh kernel.
result = pd.concat([df1, df2], ignore_index=True)

In [None]:
from ftplib import FTP


def delete_files_in_ftp_directory(host, username, password, directory):
    """Delete every entry that ``nlst()`` reports in a remote FTP directory.

    Args:
        host: FTP server host name.
        username: Login user name.
        password: Login password.
        directory: Remote directory to change into before listing.

    Returns:
        None. Any error is printed rather than raised; entries deleted before
        the error stay deleted.
    """
    try:
        # The context manager ends the FTP session even if a delete fails,
        # so an error part-way through no longer leaks the connection.
        with FTP(host) as ftp:
            ftp.login(username, password)
            ftp.cwd(directory)

            # Delete each file in the directory
            for file in ftp.nlst():
                print(f"Deleting file: {file}")
                ftp.delete(file)

    except Exception as e:
        print(f"Error: {e}")


# Example usage with placeholder connection settings.
directory = '/path/to/directory'
host = 'ftp.example.com'
username = 'your_username'
password = 'your_password'

# Deletes every file listed in `directory` on the remote server.
delete_files_in_ftp_directory(host, username, password, directory)

In [None]:
from ftplib import FTP
import pandas as pd
import os
import tempfile


def convert_files_to_dataframes(host, username, password, directory):
    """Download each file in an FTP directory and print it as a DataFrame.

    Every name returned by ``nlst()`` is downloaded into a temporary
    directory, parsed with ``pd.read_csv``, printed, and the local copy is
    removed. Remote files are left untouched.

    Args:
        host: FTP server host name.
        username: Login user name.
        password: Login password.
        directory: Remote directory to change into before listing.

    Returns:
        None. Any error is printed rather than raised.
    """
    try:
        # Both context managers clean up on any exit path: the FTP session is
        # ended and the temporary directory (plus leftovers) is removed.
        with FTP(host) as ftp, tempfile.TemporaryDirectory() as temp_dir:
            ftp.login(username, password)
            ftp.cwd(directory)

            # Loop through each file in the directory
            for file_name in ftp.nlst():
                print(f"Downloading file: {file_name}")
                # Keep only the final path component so a listing entry such
                # as "sub/file.csv" or "../x" cannot escape or miss temp_dir.
                local_file_path = os.path.join(temp_dir, os.path.basename(file_name))
                with open(local_file_path, 'wb') as f:
                    ftp.retrbinary(f'RETR {file_name}', f.write)

                # Read the file into a pandas DataFrame
                # Assuming CSV format, adjust as needed
                df = pd.read_csv(local_file_path)
                print(df)  # Do whatever processing you need with the DataFrame

                # Drop the local copy right away to keep disk usage low.
                os.remove(local_file_path)

    except Exception as e:
        print(f"Error: {e}")


# Example usage with placeholder connection settings.
directory = '/path/to/directory'
host = 'ftp.example.com'
username = 'your_username'
password = 'your_password'

# Downloads and prints every file found in `directory`.
convert_files_to_dataframes(host, username, password, directory)