# Part 1

In [4]:
import pandas as pd
import requests
import os

def fetch_data(source, is_local=True, timeout=30):
    """
    Fetches data from a local file or remote URL.

    The format (CSV or JSON) is inferred from the file extension, ignoring
    case. For URLs the extension is looked up on the path component first, so
    a trailing query string (e.g. ``data.csv?token=abc``) does not hide it.

    Parameters:
        source (str): File path or URL to the data source.
        is_local (bool): True if fetching from a local file, False for a URL.
        timeout (float): Seconds to wait for the remote server before giving
            up. Only used when is_local is False.

    Returns:
        pd.DataFrame: The data loaded into a DataFrame.

    Raises:
        FileNotFoundError: is_local is True and no file exists at source.
        ValueError: The source does not end in .csv or .json.
        ConnectionError: The URL responded with a status code other than 200.
    """
    # Imported locally so this function does not depend on imports made in
    # other notebook cells.
    from io import StringIO
    from urllib.parse import urlparse

    def _format_of(*candidates):
        # Return the first supported extension that one of the candidates ends with.
        for candidate in candidates:
            lowered = candidate.lower()
            for extension in ('.csv', '.json'):
                if lowered.endswith(extension):
                    return extension
        return None

    if is_local:
        # Read from a local file
        if not os.path.isfile(source):
            raise FileNotFoundError("File not found at the specified path.")

        file_format = _format_of(source)
        if file_format == '.csv':
            return pd.read_csv(source)
        if file_format == '.json':
            return pd.read_json(source)
        raise ValueError("Unsupported file format. Only CSV and JSON are supported.")

    # Download from a URL. Without a timeout a stalled server would hang the
    # notebook cell indefinitely.
    response = requests.get(source, timeout=timeout)
    if response.status_code != 200:
        raise ConnectionError(f"Failed to fetch data from URL. Status code: {response.status_code}")

    # Prefer the URL path (ignores "?query"), then fall back to the raw string
    # so URLs that only carry the extension at the very end keep working.
    file_format = _format_of(urlparse(source).path, source)

    # pandas readers expect a path or a file-like object, so the downloaded
    # text is wrapped in an in-memory buffer (pd.compat.StringIO is gone from
    # current pandas, and passing literal JSON to read_json is deprecated).
    if file_format == '.csv':
        return pd.read_csv(StringIO(response.text))
    if file_format == '.json':
        return pd.read_json(StringIO(response.text))
    raise ValueError("Unsupported URL format. Only CSV and JSON are supported.")


# Part 2

In [5]:
import json
import pandas as pd
import sqlite3
import csv
from io import StringIO

def convert_data(input_data, input_type, output_type, sql_table_name=None, db_conn=None):
    """
    Convert data between JSON, CSV and SQL-table representations.

    The input is first loaded into a pandas DataFrame and then serialised
    into the requested output format.

    Parameters:
        input_data: A JSON/CSV string, or any object accepted by
            pd.DataFrame(...) when it is not a string. Ignored for 'sql' input.
        input_type (str): 'json', 'csv' or 'sql'.
        output_type (str): 'json', 'csv' or 'sql'.
        sql_table_name (str, optional): Table to read from / write to.
        db_conn (optional): Database connection used for SQL input/output.

    Returns:
        str: JSON text (records orientation), CSV text (no index column), or
        a confirmation message after writing the SQL table.

    Raises:
        ValueError: Unknown input/output type, or SQL requested without both
            a connection and a table name.
    """
    sql_args_missing = db_conn is None or sql_table_name is None

    # Stage 1: load whatever we were given into a DataFrame.
    if input_type in ('json', 'csv'):
        if isinstance(input_data, str):
            # Text payloads are parsed through an in-memory buffer.
            parse = pd.read_json if input_type == 'json' else pd.read_csv
            frame = parse(StringIO(input_data))
        else:
            # Already-structured data goes straight to the constructor.
            frame = pd.DataFrame(input_data)
    elif input_type == 'sql':
        if sql_args_missing:
            raise ValueError("You must provide a database connection and table name for SQL input")
        frame = pd.read_sql(f"SELECT * FROM {sql_table_name}", db_conn)
    else:
        raise ValueError("Unsupported input type")

    # Stage 2: serialise the DataFrame into the requested format.
    if output_type == 'json':
        return frame.to_json(orient='records')

    if output_type == 'csv':
        return frame.to_csv(index=False)

    if output_type == 'sql':
        if sql_args_missing:
            raise ValueError("You must provide a database connection and table name for SQL output")
        # An existing table with the same name is replaced.
        frame.to_sql(sql_table_name, db_conn, if_exists='replace', index=False)
        return f"Data written to SQL table '{sql_table_name}'"

    raise ValueError("Unsupported output type")

# Part 3

In [None]:
def modify_dataframe(df, action, column_name=None, data=None):
    """
    Modify the DataFrame by adding or removing columns.

    The DataFrame passed in is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): The DataFrame to modify.
    action (str): The action to perform. 'add' to add a column, 'remove' to remove a column.
    column_name (str): The name of the column to add or remove.
    data (list or pd.Series or other data type, optional): Data for the new column (required if action is 'add').
        List-like data must have one entry per row; a scalar (number, string, ...)
        is broadcast to every row.

    Returns:
    pd.DataFrame: The modified DataFrame (the same object as df).

    Raises:
    ValueError: Unknown action, missing arguments, list-like data whose length
        does not match the DataFrame, or removing a column that does not exist.
    """

    if action == 'add':
        if column_name is None or data is None:
            raise ValueError("To add a column, you must provide both a column name and data.")

        # Only list-like data has a meaningful length to validate. Scalars
        # (which have no len(), or in the case of strings a misleading one)
        # are broadcast by pandas to every row.
        if pd.api.types.is_list_like(data) and len(data) != len(df):
            raise ValueError("The length of the data must match the number of rows in the DataFrame.")

        # Add the column
        df[column_name] = data
        print(f"Column '{column_name}' added.")

    elif action == 'remove':
        if column_name is None:
            raise ValueError("To remove a column, you must provide the column name.")
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

        # Remove the column in place so callers holding a reference to df see the change.
        df.drop(columns=[column_name], inplace=True)
        print(f"Column '{column_name}' removed.")

    else:
        raise ValueError("Action must be either 'add' or 'remove'.")

    return df

# Controller Code

In [14]:
# Module-level slot for the loaded data. get_option() inspects it to decide
# which menu to show; it starts as None, meaning no data is available yet.
df = None

def run():
    """Run one menu round: ask for an option, echo the choice, then execute it."""
    selected = get_option()
    print("You selected: ", selected)
    do(selected)

def get_option():
    """
    Show the menu that matches the current state and return the user's choice.

    When the module-level ``df`` is None only options 1 and 6 are offered.
    Otherwise the full menu is shown, and choosing option 1 (which would
    overwrite the loaded data) requires an explicit Y confirmation; any other
    answer returns the user to the option prompt.

    Returns:
        str: The selected option, one of the strings in the displayed menu.
    """
    print("Validating dataframe...")

    # Compare against None explicitly: the truth value of a DataFrame is
    # ambiguous, so `not df` raises ValueError as soon as data is loaded.
    if df is None:
        print("There is currently no dataframe")
        print("ETL Pipeline: What would you like to do?")
        valid_options = ['1', '6']
        print("""Available Options:
            1: Fetch or Upload Data (CSV, JSON, SQL DB via upload or API call) 
            6: Exit program
        """)

        while True:
            print("Waiting for input...")
            option = input("Select an option: ")

            if option in valid_options:
                return option

            print("Invalid option! Valid options are: ", valid_options)

    else:
        print("Existing data detected!")
        print("ETL Pipeline: What would you like to do?")
        print("""Available Options:
        1: Fetch or Upload Data (Overwrite existing data) 
        2: Convert file types (CSV, JSON, SQL DB)
        3: Modify dataframe (Data upload required)
        4: Export/Store Data to disk or database
        5: Summarize current dataframe
        6: Exit program
        """)
        valid_options = ['1', '2', '3', '4', '5', '6']

        while True:
            print("Waiting for input...")
            option = input("Select an option: ")

            if option not in valid_options:
                print("Invalid option! Valid options are: ", valid_options)
                continue

            # input() returns a string, so the comparison must be against '1';
            # comparing with the integer 1 never matches and would skip the warning.
            if option == '1':
                print("Warning: If existing data has not been exported, it will be lost. Do you wish to continue (Y/N)")
                print("Waiting for input...")
                y_n = input("Do you wish to continue (Y/N)")
                # Accept "y" / " Y " as well; anything else goes back to the menu prompt.
                if y_n.strip().upper() != 'Y':
                    continue

            return option
    

def do(option):
    """
    Execute the selected menu option and print the current data.

    Parameters:
        option (str or int): The menu choice; converted with int().

    Side effects:
        For option 1 the module-level ``df`` is replaced with the result of
        option_1(). The current ``df`` is printed afterwards in every case.
    """
    # Without this declaration the assignment below would create a local
    # variable: the loaded data would be discarded on return, and print(df)
    # would raise UnboundLocalError for every option other than 1.
    global df

    option = int(option)
    if option == 1:
        df = option_1()

    print(df)

def option_1():
    """
    Ask where the data should come from and load it with fetch_data().

    The user chooses between a local file (looked up inside the input_data
    folder) and a remote URL, then supplies the file name or URL.

    Returns:
        Whatever fetch_data() returns for the chosen source.
    """
    print("Would you like to get your data locally or via API \n 1: Locally \n 2: API")

    valid_options = ['1', '2']
    while True:
        print("Waiting for input...")
        option = input("Select an option: ")

        if option in valid_options:
            break

        print("Invalid option! Valid options are: ", valid_options)

    if option == '1':
        print("You selected: Locally. Please put the file into the input_data folder and provide the file name, incuding its extension (e.g. data.csv, data.json, etc)")
        print("Waiting for input...")
        file_name = input("Enter file name: ").strip()
        source = f"input_data/{file_name}"

        return fetch_data(source)

    # Option 2: the menu offers it, so it must load something instead of
    # silently falling through and returning None.
    print("You selected: API. Please provide the full URL of the data, including its extension (e.g. https://example.com/data.csv)")
    print("Waiting for input...")
    url = input("Enter URL: ").strip()

    return fetch_data(url, is_local=False)

# Start the interactive menu when this cell is executed (blocks on input()).
run()

Validating dataframe...
There is currently no dataframe
ETL Pipeline: What would you like to do?
Available Options:
            1: Fetch or Upload Data (CSV, JSON, SQL DB via upload or API call) 
            6: Exit program
        
Waiting for input...
You selected:  1
Would you like to get your data locally or via API
1: Locally 
 2: API
Waiting for input...
You selected: Locally. Please put the file into the input_data folder and provide the file name, incuding its extension (e.g. data.csv, data.json, etc)
Waiting for input...
      application_id  gender  international   gpa       major      race  \
0                  1  Female          False  3.30    Business     Asian   
1                  2    Male          False  3.28  Humanities     Black   
2                  3  Female           True  3.30    Business       NaN   
3                  4    Male          False  3.47        STEM     Black   
4                  5    Male          False  3.35        STEM  Hispanic   
...           