# todo

slim_dataframe

list-based or dictionary-based...

list-based...

using indexes as rows...

list of lists?
for each list, follow index...


In [1]:
import json


class PyPhrame:
    """
    A python-dictionary based minimal pandas emulator.

    Data lives in ``self.data``: a dict whose keys are column names and whose
    values are lists of column data. Every column added through
    ``add_column`` has the same length, so position ``i`` across the lists
    forms row ``i``.
    """

    def __init__(self):
        """
        Initialize an empty dictionary to store the data.
        Keys are column names, and values are lists of column data.
        """
        self.data = {}

    def _require_column(self, column_name):
        """
        Return the stored list for a column.

        Args:
          column_name: The name of the column to look up.

        Returns:
          list - The list held in ``self.data`` for that column.

        Raises:
          KeyError: If the column name is not found.
        """
        if column_name not in self.data:
            raise KeyError(f"Column '{column_name}' not found")
        return self.data[column_name]

    def _require_cell(self, row_index, column_name):
        """
        Return the stored list for a column after validating a row index.

        Args:
          row_index: int - The row index to validate.
          column_name: The name of the column to look up.

        Returns:
          list - The list held in ``self.data`` for that column.

        Raises:
          KeyError: If the column name is not found.
          IndexError: If the row index is negative or past the last row.
        """
        values = self._require_column(column_name)
        # Negative indices are rejected rather than wrapping around.
        if row_index < 0 or row_index >= len(values):
            raise IndexError("Row index out of range")
        return values

    def add_column(self, column_name, data):
        """
        Add a new column to the dataframe.

        Args:
          column_name: The name of the new column.
          data: iterable - The data values for the column. The values are
            copied into a new list, so later changes to the object passed in
            do not affect the dataframe.

        Raises:
          ValueError: If the column already exists, if a value is not an
            int, float, or str, or if the number of values differs from the
            length of the columns already present.
        """
        # Check for duplicate column names.
        if column_name in self.data:
            raise ValueError("Column already exists")

        # Materialize once: validation below must not exhaust a one-shot
        # iterable, and the frame must not alias the caller's list.
        values = list(data)

        # Validate data types, only int, float, or str are allowed.
        if not all(isinstance(d, (int, float, str)) for d in values):
            raise ValueError("Data must be int, float, or str")

        # All columns must describe the same set of rows.
        if self.data:
            expected = len(next(iter(self.data.values())))
            if len(values) != expected:
                raise ValueError(
                    f"Column length {len(values)} does not match "
                    f"existing row count {expected}"
                )

        # Add the new column to the data dictionary.
        self.data[column_name] = values

    def get_column(self, column_name):
        """
        Retrieve data for a specified column.

        Args:
          column_name: The name of the column to retrieve.

        Returns:
          list - The data of the specified column, or an empty list if the
          column doesn't exist.
        """
        return self.data.get(column_name, [])

    def get_value(self, row_index, column_name):
        """
        Retrieve a value at a specified row and column.

        Args:
          row_index: int - The row index of the value.
          column_name: The column name of the value.

        Returns:
          The value at the specified row and column.

        Raises:
          IndexError: If the row index is out of range.
          KeyError: If the column name is not found.
        """
        return self._require_cell(row_index, column_name)[row_index]

    def set_value(self, row_index, column_name, value):
        """
        Set a value at a specified row and column.

        Args:
          row_index: int - The row index where the value will be set.
          column_name: The column name where the value will be set.
          value: The new value to be set.

        Raises:
          IndexError: If the row index is out of range.
          KeyError: If the column name is not found.
        """
        self._require_cell(row_index, column_name)[row_index] = value

    def rows_with_value(self, column_name, value):
        """
        Find rows where the column has an exact match to the specified value.

        Args:
          column_name: The name of the column.
          value: The value to match (compared with ``==``).

        Returns:
          list of int - A list of row indices where the value matches.

        Raises:
          KeyError: If the column name is not found.
        """
        values = self._require_column(column_name)
        return [i for i, v in enumerate(values) if v == value]

    def rows_with_substring(self, column_name, substring):
        """
        Find rows where the column contains the specified substring.

        Each cell is converted with ``str()`` before the test, so numeric
        cells are matched against their printed form.

        Args:
          column_name: The name of the column.
          substring: str - The substring to search for.

        Returns:
          list of int - A list of row indices where the substring is found.

        Raises:
          KeyError: If the column name is not found.
        """
        values = self._require_column(column_name)
        return [i for i, v in enumerate(values) if substring in str(v)]

    def rows_with_number_comparison(self, column_name, number, operator):
        """
        Find rows based on a numerical comparison in a specified column.

        Cells that are not int or float are skipped, never compared.

        Args:
          column_name: The name of the column.
          number: int or float - The number to compare against.
          operator: str - The comparison operator ('<', '>', '=', '<=', '>=').

        Returns:
          list of int - A list of row indices where the comparison is true.

        Raises:
          KeyError: If the column name is not found.
          ValueError: If an invalid operator is passed.
        """
        values = self._require_column(column_name)

        valid_operators = {
            '<': lambda x: x < number,
            '>': lambda x: x > number,
            '=': lambda x: x == number,
            '<=': lambda x: x <= number,
            '>=': lambda x: x >= number,
        }

        if operator not in valid_operators:
            raise ValueError(f"Invalid operator '{operator}'. Valid operators are <, >, =, <=, >=")

        compare = valid_operators[operator]
        return [
            i for i, v in enumerate(values)
            if isinstance(v, (int, float)) and compare(v)
        ]

    def to_json(self):
        """
        Convert the dataframe to a JSON-compatible format.

        Returns:
          str - A JSON string holding a list with one object per row, each
          mapping column name to that row's value. A dataframe with no
          columns or no rows yields "[]".
        """
        # zip(*columns) walks the columns in lockstep, producing one tuple
        # per row; with no columns it produces nothing at all.
        rows_data = [
            dict(zip(self.data, row)) for row in zip(*self.data.values())
        ]
        return json.dumps(rows_data, indent=4)

    def __str__(self):
        """
        Printing: Provide a string representation of the dataframe for printing.

        Returns:
          str - A header line, a dashed separator, and one line per row, with
          cells left-justified to the column width and separated by two tabs;
          or "Empty DataFrame" when there are no columns.
        """
        # Check for an empty dataframe.
        if not self.data:
            return "Empty DataFrame"

        # Width is the longest printed cell in each column, header included.
        # default=0 keeps a column with zero rows from crashing max().
        column_widths = {
            col: max(len(str(col)), max((len(str(x)) for x in cells), default=0))
            for col, cells in self.data.items()
        }

        # str(col) so that non-string column names format like any other.
        header = "\t\t".join(str(col).ljust(column_widths[col]) for col in self.data)
        lines = [header, "-" * len(header)]

        # Zip the data values together and format each row.
        for row in zip(*self.data.values()):
            lines.append(
                "\t\t".join(
                    str(x).ljust(column_widths[col])
                    for x, col in zip(row, self.data)
                )
            )

        return "\n".join(lines)

def _people_frame():
    """Build the three-row Age/Name dataframe that every example starts from."""
    frame = PyPhrame()
    frame.add_column("Age", [25, 30, 45])
    frame.add_column("Name", ["Alice", "Bob", "Charlie"])
    return frame


# Example 1: printing a dataframe with three columns.
df = _people_frame()
df.add_column("Name2", ["Alice", "Bob", "Charlie"])
print(df, "\n")


# Example 2: reading a single cell, then overwriting it.
df = _people_frame()

this_name = df.get_value(1, "Name")
print(f"Value at row 1, column 'Name': {this_name}\n")

df.set_value(1, "Name", "Robert")

print("Updated DataFrame:")
print(df)


# Example 3: searching a column by exact value and by substring.
df = _people_frame()

bob_rows = df.rows_with_value("Name", "Bob")
print("Rows where Name is 'Bob':", bob_rows)

b_rows = df.rows_with_substring("Name", "b")
print("Rows where Name contains 'b':", b_rows)


# Example 4: numeric comparisons against the Age column.
df = _people_frame()

older_than_20 = df.rows_with_number_comparison("Age", 20, '>')
print("Rows where Age is greater than 20:", older_than_20)

at_most_30 = df.rows_with_number_comparison("Age", 30, '<=')
print("Rows where Age is less than or equal to 30:", at_most_30)


# Example 5: serializing the dataframe to JSON, one object per row.
df = _people_frame()

json_data = df.to_json()
print(json_data)


Age		Name   		Name2  
---------------------
25 		Alice  		Alice  
30 		Bob    		Bob    
45 		Charlie		Charlie 

Value at row 1, column 'Name': Bob

Updated DataFrame:
Age		Name   
------------
25 		Alice  
30 		Robert 
45 		Charlie
Rows where Name is 'Bob': [1]
Rows where Name contains 'b': [1]
Rows where Age is greater than 20: [0, 1, 2]
Rows where Age is less than or equal to 30: [0, 1]
[
    {
        "Age": 25,
        "Name": "Alice"
    },
    {
        "Age": 30,
        "Name": "Bob"
    },
    {
        "Age": 45,
        "Name": "Charlie"
    }
]
