# Part 1

In [33]:
import io
import os
import zipfile

import pandas as pd
import requests

def fetch_data(source, is_local=True, timeout=30):
    """
    Fetches data from a local file or remote URL.

    Parameters:
        source (str): File path or URL to the data source.
        is_local (bool): True if fetching from a local file, False for a URL.
        timeout (float): Seconds to wait on the remote server before giving up.
            Only used when is_local is False.

    Returns:
        pd.DataFrame: The data loaded into a DataFrame.

    Raises:
        FileNotFoundError: If is_local is True and no file exists at source.
        ValueError: If the file, the URL response, or the ZIP contents are
            not CSV or JSON.
        ConnectionError: If the server answers with a status other than 200.
            Network-level failures (including timeouts) surface as the
            exceptions raised by requests itself.
    """
    if is_local:
        return _load_local_file(source)

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(source, timeout=timeout)
    if response.status_code != 200:
        raise ConnectionError(f"Failed to fetch data from URL. Status code: {response.status_code}")

    # The Content-Type header decides how the payload is parsed.
    content_type = response.headers.get('Content-Type', '')

    if 'zip' in content_type:
        return _load_first_table_from_zip(response.content)

    if 'application/json' in content_type:
        return pd.json_normalize(response.json())

    # CSV is recognised either by its Content-Type or by the URL suffix.
    if 'text/csv' in content_type or source.endswith('.csv'):
        return pd.read_csv(io.StringIO(response.text))

    raise ValueError("Unsupported URL format. Only CSV, JSON, and ZIP are supported.")


def _load_local_file(path):
    """Read a local CSV or JSON file into a DataFrame, chosen by file extension."""
    if not os.path.isfile(path):
        raise FileNotFoundError("File not found at the specified path.")
    if path.endswith('.csv'):
        return pd.read_csv(path)
    if path.endswith('.json'):
        return pd.read_json(path)
    raise ValueError("Unsupported file format. Only CSV and JSON are supported.")


def _load_first_table_from_zip(payload):
    """Load the first CSV or JSON member of an in-memory ZIP archive into a DataFrame."""
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        for member in archive.namelist():
            if member.endswith('.csv'):
                with archive.open(member) as handle:
                    return pd.read_csv(handle)
            if member.endswith('.json'):
                with archive.open(member) as handle:
                    return pd.read_json(handle)
    raise ValueError("No CSV or JSON files found in the ZIP archive.")


# Part 2

In [5]:
import json
import pandas as pd
import sqlite3
import csv
from io import StringIO

def convert_data(input_data, input_type, output_type, sql_table_name=None, db_conn=None):
    """
    Convert tabular data between JSON, CSV and SQL representations.

    The input is first loaded into a pandas DataFrame and then emitted in the
    requested output representation.

    Parameters:
        input_data: For 'json'/'csv', either the text to parse (str) or any
            object accepted by the pd.DataFrame constructor. Not read when
            input_type is 'sql'.
        input_type (str): One of 'json', 'csv' or 'sql'.
        output_type (str): One of 'json', 'csv' or 'sql'.
        sql_table_name (str, optional): Table to read from / write to;
            required whenever either side is 'sql'.
        db_conn (optional): Database connection handed to pandas; required
            whenever either side is 'sql'.

    Returns:
        str: JSON text (records orientation), CSV text (no index column), or
        a confirmation message after writing to the SQL table.

    Raises:
        ValueError: On an unsupported type, or when an SQL side is requested
            without both a connection and a table name.
    """
    sql_ready = db_conn is not None and sql_table_name is not None

    # Stage 1: load the input into a DataFrame.
    if input_type in ('json', 'csv'):
        if isinstance(input_data, str):
            parse_text = pd.read_json if input_type == 'json' else pd.read_csv
            frame = parse_text(StringIO(input_data))
        else:
            frame = pd.DataFrame(input_data)
    elif input_type == 'sql':
        if not sql_ready:
            raise ValueError("You must provide a database connection and table name for SQL input")
        # NOTE(review): the table name is interpolated into the statement
        # unquoted, so it must come from a trusted source.
        frame = pd.read_sql(f"SELECT * FROM {sql_table_name}", db_conn)
    else:
        raise ValueError("Unsupported input type")

    # Stage 2: emit the DataFrame in the requested representation.
    if output_type == 'json':
        return frame.to_json(orient='records')

    if output_type == 'csv':
        return frame.to_csv(index=False)

    if output_type == 'sql':
        if not sql_ready:
            raise ValueError("You must provide a database connection and table name for SQL output")
        # An existing table of the same name is replaced.
        frame.to_sql(sql_table_name, db_conn, if_exists='replace', index=False)
        return f"Data written to SQL table '{sql_table_name}'"

    raise ValueError("Unsupported output type")

# Part 3

In [55]:
def modify_dataframe(df, action, column_name=None, data=None):
    """
    Modify the DataFrame by adding or removing columns.

    The DataFrame passed in is changed in place, and the same object is
    returned for convenience.

    Parameters:
    df (pd.DataFrame): The DataFrame to modify.
    action (str): The action to perform. 'add' to add a column, 'remove' to remove a column.
    column_name (str): The name of the column to add or remove.
    data (list or pd.Series or other data type, optional): Data for the new column (required if action is 'add').
        Sized values (anything with a length) must have one entry per row;
        unsized values such as a single number are repeated on every row.

    Returns:
    pd.DataFrame: The modified DataFrame.

    Raises:
    ValueError: If a required argument is missing, the data length does not
        match the row count, the column to remove does not exist, or the
        action is not 'add' or 'remove'.
    """

    if action == 'add':
        if column_name is None or data is None:
            raise ValueError("To add a column, you must provide both a column name and data.")
        # Only sized values can be length-checked; calling len() on a plain
        # number would raise TypeError even though pandas can broadcast it.
        if hasattr(data, '__len__') and len(data) != len(df):
            raise ValueError("The length of the data must match the number of rows in the DataFrame.")

        # Add the column (an existing column of the same name is overwritten)
        df[column_name] = data
        print(f"Column '{column_name}' added.")

    elif action == 'remove':
        if column_name is None:
            raise ValueError("To remove a column, you must provide the column name.")
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

        # Remove the column from the caller's DataFrame
        df.drop(columns=[column_name], inplace=True)
        print(f"Column '{column_name}' removed.")

    else:
        raise ValueError("Action must be either 'add' or 'remove'.")

    return df

# Controller Code

In [35]:
import json
import pandas as pd
import sqlite3
import csv
from io import StringIO

# Shared slot for the DataFrame the menu functions operate on.
# None means no data has been loaded yet; re-running this cell discards any loaded data.
df = None


In [71]:
def run():
    """Prompt for one menu option, echo it back, and carry it out."""
    selected = get_option()
    print("You selected: ", selected)
    do(selected)

def get_option():
    """
    Show the menu that fits the current state and return the user's choice.

    When the global ``df`` is None only options 1 and 6 are offered and the
    menu is printed once. Otherwise the full menu is reprinted before every
    attempt, and choosing option 1 must be confirmed with 'Y' (any case)
    because it overwrites the loaded data; any other reply restarts the
    prompt.

    Returns:
        str: The selected option, as the digit string the user typed.
    """
    print("Validating dataframe...")

    if df is not None:
        print("Existing data detected!")
        print("ETL Pipeline: What would you like to do?")
        full_menu = (
            "Available Options:\n"
            "            1: Fetch or Upload Data (Overwrite existing data) \n"
            "            2: Convert file types (CSV, JSON, SQL DB)\n"
            "            3: Modify dataframe (Data upload required)\n"
            "            4: Export/Store Data to disk or database\n"
            "            5: Summarize current dataframe\n"
            "            6: Exit program\n"
            "            "
        )
        accepted = False
        while not accepted:
            print(full_menu)
            allowed = ['1','2','3','4','5','6']
            print("Waiting for input...")
            choice = input("Select an option: ")
            if choice not in allowed:
                print("Invalid option! Valid options are: ", allowed)
            elif choice == '1':
                # Loading new data replaces the current frame, so ask first.
                print("WARNING: If existing data has not been exported, it will be lost. Do you wish to continue (Y/N)")
                print("Waiting for input...")
                reply = input("Do you wish to continue (Y/N)")
                accepted = reply.upper() == 'Y'
            else:
                accepted = True
        return choice

    print("There is currently no dataframe")
    print("ETL Pipeline: What would you like to do?")
    allowed = ['1', '6']
    print(
        "Available Options:\n"
        "            1: Fetch or Upload Data (CSV, JSON, SQL DB via upload or API call) \n"
        "            6: Exit program\n"
        "        "
    )

    choice = None
    while choice not in allowed:
        print("Waiting for input...")
        choice = input("Select an option: ")
        if choice not in allowed:
            print("Invalid option! Valid options are: ", allowed)

    return choice
    

def do(option):
    """
    Carry out the menu option chosen by the user.

    Options 1 and 3 replace the global ``df`` with the frame returned by the
    matching handler and print its head between divider lines. Options 4 and
    5 delegate to their handlers. Any other number (including 2 and 6) is
    accepted but triggers no action.

    Parameters:
        option (str or int): The selected option; converted with int().
    """
    global df
    selected = int(option)
    divider = "---------------------------------------------------------------------------------------------------------"

    if selected in (1, 3):
        if selected == 1:
            df = option_1()
            print("SUCESSFULLY UPLOADED DATAFRAME. Printing head...")
        else:
            df = option_3()
            print("SUCESSFULLY MODIFIED DATAFRAME. Printing head...")
        print(divider)
        print(df.head())
        print(divider)
    elif selected == 4:
        option_4()
    elif selected == 5:
        option_5()

def option_1():
    """
    Ask where the data should come from and load it with fetch_data.

    Choice '1' reads a file name and loads it from the input_data folder;
    choice '2' reads a URL and downloads from it. Invalid choices are
    prompted again.

    Returns:
        Whatever fetch_data returns for the chosen source.
    """
    print("Would you like to get your data locally or via API \n 1: Locally \n 2: API")

    allowed = ['1', '2']
    while True:
        print("Waiting for input...")
        raw_choice = input("Select an option: ")
        if raw_choice in allowed:
            break
        print("Invalid option! Valid options are: ", allowed)

    if int(raw_choice) == 1:
        print("You selected: Locally. Please put the file into the input_data folder and provide the file name, incuding its extension (e.g. data.csv, data.json, etc)")
        print("Waiting for input...")
        file_name = input("Enter file name: ")
        return fetch_data("input_data/" + file_name)

    # The only other accepted choice is 2: download from a URL.
    print("You selected: via API. Please provide the URL for the API to retrieve the data. Note, the URL must link diirectly to JSON data, or a CSV or JSON file or zip archive containing the CSV/JSON file")
    print("Waiting for input...")
    url = input("input URL: ")
    return fetch_data(url, False)


def option_3():
    """
    Ask whether to add or remove a column and apply it to the global ``df``.

    For an addition the column values are read as one comma-separated line
    and passed on as a list of strings, exactly as typed between the commas.

    Returns:
        Whatever modify_dataframe returns for the requested change.
    """
    print("How would you like to modify the dataframe? \n 1: Add column \n 2: Remove column")

    allowed = ['1', '2']
    choice = None
    while choice not in allowed:
        print("Waiting for input...")
        choice = input("Select an option: ")
        if choice not in allowed:
            print("Invalid option! Valid options are: ", allowed)

    if choice == '2':
        print("You selected: Remove column. Please provide the name of the column you'd like to remove")
        print("Waiting for input...")
        target = input("Enter name of column to remove")
        return modify_dataframe(df, 'remove', target)

    # The only other accepted choice is '1': add a column.
    print("You selected: Add column. Please provide the name of the column you'd like to add")
    print("Waiting for input...")
    new_name = input("Enter name of new column")
    print("Please provide the data you'd like to add, separated by commas")
    raw_values = input("Enter data separated by commas")
    values = raw_values.split(',')
    return modify_dataframe(df, 'add', new_name, values)

def option_4():
    """
    Ask where the current DataFrame should be stored and validate the answer.

    Only the prompt is implemented: once a valid choice ('1' or '2') has been
    entered the function returns without writing anything.
    TODO: act on the validated choice (write to disk or to a SQL database).

    Returns:
        None
    """
    print("How would you like the dataframe to be stored? Options: \n 1: Write to local disk as CSV or JSON \n 2: Write to SQL database")

    valid_options = ['1', '2']
    while True:
        # Printed once per attempt, matching the other option prompts.
        print("Waiting for input...")
        option = input("Select an option: ")

        if option in valid_options:
            break

        print("Invalid option! Valid options are: ", valid_options)


def option_5():
    """
    Print a summary of the global ``df``: its info() report followed by
    its describe() statistics.
    """
    print("Generating summary of dataframe...")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print(df.describe())

# Start one interactive pass: prompt for a single menu option and carry it out.
run()

Validating dataframe...
Existing data detected!
ETL Pipeline: What would you like to do?
Available Options:
            1: Fetch or Upload Data (Overwrite existing data) 
            2: Convert file types (CSV, JSON, SQL DB)
            3: Modify dataframe (Data upload required)
            4: Export/Store Data to disk or database
            5: Summarize current dataframe
            6: Exit program
            
Waiting for input...
You selected:  5
Generating summary of dataframe...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         6194 non-null   object 
 1   international  6194 non-null   bool   
 2   gpa            6194 non-null   float64
 3   major          6194 non-null   object 
 4   race           4352 non-null   object 
 5   gmat           6194 non-null   float64
 6   work_exp       6194 non-null   float64
 7   wor

In [67]:
# Show the first rows of the global DataFrame; df must already hold loaded data (it starts out as None).
print(df.head())

  Unnamed: 0    Unnamed: 1 Unnamed: 2 Unnamed: 3    Unnamed: 4 Totals  \
0         Rk        Player      Class        Pos        School      G   
1          1  Amaree Abram         SO          G  Georgia Tech     10   

  Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9  ... Unnamed: 12 Unnamed: 13  \
0         MP        TRB        AST        STL  ...          PF         PTS   
1        108         17         11          1  ...          10          34   

  Shooting Unnamed: 15 Unnamed: 16 Unnamed: 17 Advanced Unnamed: 19  \
0      FG%         2P%         3P%         FT%      PER          WS   
1    0.262       0.318         0.2       0.615      4.1        -0.1   

  Unnamed: 20    bd  
0         BPM   pls  
1        -6.4  work  

[2 rows x 22 columns]
