In [73]:
import csv
import operator
import itertools as it
import datetime
import statistics

def pretty_string(string,color,length=None):
    """Wrap ``string`` in ANSI escape codes so a terminal renders it in ``color``.

    Accepted colors: red, green, yellow, blue, magenta, cyan

    Parameters
    ----------
    string : the text to colorize; it is sliced with ``string[:length]`` when
        ``length`` is given, so it must support slicing in that case.
    color : one of the accepted color names listed above.
    length : optional maximum number of leading characters to keep.
        ``None`` (the default) keeps the whole string.

    Returns
    -------
    str : ``string`` (possibly truncated) preceded by the color escape
        sequence and followed by the reset sequence ``\\033[0m``.

    Raises
    ------
    KeyError : if ``color`` is not one of the accepted names.
    """
    colors_dict = {
        'red':31,
        'green':32,
        'yellow':33,
        'blue':34,
        'magenta':35,
        'cyan':36,
    }
    if color not in colors_dict:
        # Still a KeyError (as a plain dict lookup would raise), but it now
        # tells the caller which names are valid.
        raise KeyError(f"Unknown color {color!r}; accepted colors: {', '.join(colors_dict)}")
    # Identity check: ``length=0`` must truncate to an empty string, not be
    # mistaken for "no limit".
    text = string if length is None else string[:length]
    return f"\033[{colors_dict[color]}m{text}\033[0m"

def element_wise_comparison(func, list_1, list_2):
    """Apply the binary predicate ``func`` element by element.

    ``list_1`` must be a list or tuple. ``list_2`` is either a scalar
    (int, float, str or datetime.datetime), which is compared against every
    element of ``list_1``, or a list/tuple of the same length, which is
    compared position by position. The result is always a new Python list
    holding whatever ``func`` returned for each element.

    Raises TypeError for unsupported argument types and ValueError when two
    sequences differ in length.
    """
    sequence_types = (list, tuple)
    scalar_types = (int, float, str, datetime.datetime)

    if not isinstance(list_1, sequence_types):
        raise TypeError("list_1 must be of the type 'List'")

    # Scalar right-hand side: broadcast it over every element.
    if isinstance(list_2, scalar_types):
        return [func(item, list_2) for item in list_1]

    if not isinstance(list_2, sequence_types):
        raise TypeError("Can only compare against the types 'Int,' 'Float,' 'Str,' or 'List'")

    if len(list_1) != len(list_2):
        raise ValueError("Lists have incompatible lengths")

    # Lengths are equal here, so map() pairs the elements exactly like zip().
    return list(map(func, list_1, list_2))

class ColumnProperties:
    """Per-column metadata stored as plain instance attributes (e.g. ``dtype``).

    Properties are supplied as a dict; each key becomes an attribute name and
    each value the attribute's value.
    """
    def __init__(self,data=None):
        """Create the property holder, optionally pre-populated from ``data``.

        Raises TypeError if ``data`` is neither None nor a dict.
        """
        if data is None:
            return
        # Same validation and assignment as add_properties; share one code path.
        self.add_properties(data)

    def add_properties(self, data):
        """Set (or overwrite) one attribute per ``data`` item.

        Raises TypeError if ``data`` is not a dict.
        """
        if not isinstance(data, dict):
            raise TypeError("ColumnProperties data parameter must be of the type 'Dict'")
        for attr_name, attr_val in data.items():
            setattr(self, attr_name, attr_val)

    def get_property(self, key):
        """Return the value of the property named ``key``.

        Raises ValueError if no such property has been set.
        """
        # Look the name up dynamically: ``self.key`` would read the attribute
        # literally called "key". The instance dict holds exactly what
        # setattr() stored, so methods are never reported as properties.
        try:
            return vars(self)[key]
        except (KeyError, TypeError):
            raise ValueError(f"Property {key} not found") from None

class Category:
    """Thin wrapper that marks a value as categorical.

    The wrapped value is kept in ``self.data``; representation and formatting
    are delegated to it.
    """
    def __init__(self,data):
        self.data = data

    def __repr__(self):
        # __repr__ must return a str; returning non-string data unchanged makes
        # repr() raise TypeError. For string data the result is unchanged.
        return str(self.data)

    def __format__(self,fmt):
        """Format the wrapped value with the given format spec."""
        return format(self.data, fmt)

class DataColumn:
    """
    Column of Simplistic DataFrame

    Wraps a sequence of cell values (``self.data``) and provides:

    * element-wise arithmetic (``+ - * /``) against a scalar int/float or
      another DataColumn of the same length;
    * element-wise comparisons (``== != < <= > >=``), each returning a new
      DataColumn holding the per-element results;
    * summary statistics delegated to the ``statistics`` module.

    Because ``__eq__`` is overridden to be element-wise (and ``__hash__`` is
    not defined), instances are unhashable and ``col == x`` yields a
    DataColumn rather than a bool.
    """
    def __init__(self, data):
        self.data = data

    def _binary_op(self, op, other, type_error_message):
        """Apply ``op`` element-wise against a scalar or an equal-length column.

        Raises ValueError on a length mismatch and TypeError (with
        ``type_error_message``) for unsupported operand types.
        """
        if isinstance(other, (int, float)):
            return DataColumn([op(x, other) for x in self.data])
        if isinstance(other, DataColumn):
            if len(self.data) != len(other.data):
                raise ValueError("Columns have incompatible lengths")
            return DataColumn([op(x, y) for x, y in zip(self.data, other.data)])
        raise TypeError(type_error_message)

    def _compare(self, op, other):
        """Element-wise comparison; ``other`` may be a DataColumn, sequence or scalar."""
        rhs = other.data if isinstance(other, DataColumn) else other
        return DataColumn(element_wise_comparison(op, self.data, rhs))

    def __add__(self, other):
        return self._binary_op(
            operator.add, other,
            "Operands must be of the type 'DataColumn,' 'Int,' or 'Float'")

    def __sub__(self, other):
        return self._binary_op(
            operator.sub, other,
            "Operands must be of the types 'DataColumn,' 'Int,' or 'Float'")

    def __mul__(self, other):
        # The message previously talked about division (copy-paste slip).
        return self._binary_op(
            operator.mul, other,
            "Operands must be of the types 'DataColumn,' 'Int,' or 'Float'")

    def __truediv__(self, other):
        """Element-wise true division.

        Raises ValueError instead of ZeroDivisionError when the divisor is
        zero (scalar) or contains a zero (column).
        """
        if isinstance(other, (int, float)) and other == 0:
            raise ValueError("Div by zero is not allowed")
        if isinstance(other, DataColumn):
            # Report a length mismatch before looking for zeros.
            if len(self.data) != len(other.data):
                raise ValueError("Columns have incompatible lengths")
            if 0 in other.data:
                raise ValueError("Encountered division by zero")
        return self._binary_op(
            operator.truediv, other,
            "Can only divide by the types 'DataColumn,' 'Int,' or 'Float'")

    def __eq__(self, other):
        return self._compare(operator.eq, other)

    def __lt__(self, other):
        return self._compare(operator.lt, other)

    def __le__(self, other):
        return self._compare(operator.le, other)

    def __ne__(self, other):
        return self._compare(operator.ne, other)

    def __ge__(self, other):
        return self._compare(operator.ge, other)

    def __gt__(self, other):
        return self._compare(operator.gt, other)

    def __repr__(self):
        # Return the text instead of printing it: repr() must not have side
        # effects. The visible result of print(col) is unchanged (the data on
        # one line, then "Column").
        return f"{self.data!r}\nColumn"

    def as_list(self):
        """Return the underlying sequence itself (not a copy)."""
        return self.data

    def __iter__(self):
        return iter(self.data)

    def apply(self, func):
        """Return a new DataColumn with ``func`` applied to every element."""
        return DataColumn(list(map(func,self.data)))

    def min(self):
        return min(self.data)

    def max(self):
        return max(self.data)

    def mean(self):
        return statistics.mean(self.data)

    def median(self):
        return statistics.median(self.data)

    def median_low(self):
        return statistics.median_low(self.data)

    def median_high(self):
        return statistics.median_high(self.data)

    def mode(self):
        return statistics.mode(self.data)

    def std(self):
        """Sample standard deviation (statistics.stdev)."""
        return statistics.stdev(self.data)

    def var(self):
        """Sample variance (statistics.variance)."""
        return statistics.variance(self.data)

    def pstd(self):
        """Population standard deviation (statistics.pstdev)."""
        return statistics.pstdev(self.data)

    def pvariance(self):
        """Population variance (statistics.pvariance)."""
        return statistics.pvariance(self.data)

    def cov(self,other):
        """Sample covariance with another DataColumn (statistics.covariance)."""
        if isinstance(other, DataColumn):
            return statistics.covariance(self.data,other.data)
        else:
            raise TypeError("Can only compare to another DataColumn")

    def cor(self,other):
        """Correlation with another DataColumn (statistics.correlation)."""
        if isinstance(other, DataColumn):
            return statistics.correlation(self.data,other.data)
        else:
            raise TypeError("Can only compare to another DataColumn")

    def lr(self,other):
        """Linear regression against another column.

        Regress this column on another column and return slope and intercept.
        ``other`` supplies the independent values and this column the
        dependent values passed to statistics.linear_regression.
        https://docs.python.org/3/library/statistics.html

        Returns slope, intercept
        """
        if isinstance(other, DataColumn):
            return statistics.linear_regression(other.data,self.data)
        else:
            raise TypeError("Can only compare to another DataColumn")

    def set_type(self, new_type):
        """Return a new DataColumn with every non-None value passed to ``new_type``.

        ``None`` entries (missing values) are kept as ``None``.

        Raises ValueError if a value cannot be converted.
        """
        casted_values = []
        for val in self.data:
            if val is None:
                casted_values.append(None)
                continue
            try:
                casted_values.append(new_type(val))
            except (TypeError, ValueError) as e:
                raise ValueError(f"Cannot cast {val} to {new_type}: {e}") from e
        return DataColumn(casted_values)

    #def isna(self):
        

class DataFrame:
    '''
    Simplistic DataFrame

    Properties that must be maintained as columns are added, deleted, or moved:
    self.data
    self.columns
    self.col_properties

    Functions
    ---------
    read_csv
    to_csv
    apply
    '''
    def __init__(self,data=None,dtypes=None):
        dtypes_provided = isinstance(dtypes,dict)
        self.data = []
        self.columns = []
        self.col_properties = []
        values_len = -1
        if data==None:
            pass
        elif isinstance(data,dict):
            for key, values in data.items():
                if values_len == -1:
                    values_len = len(values)
                else:
                    if len(values) != values_len:
                        raise ValueError("Columns have incompatible lengths")
                self.columns.append(key)
                self.data.append(values)
                # Check if dtypes were given:
                if dtypes_provided:
                    self.col_properties.append(ColumnProperties({'dtype':dtypes[key]}))
        else:
            raise TypeError("Data must be of the type'Dict'")
        return

    def read_csv(self, file_path):
        with open(file_path, 'r', newline='') as file:
            csv_reader = csv.reader(file,skipinitialspace=True) # https://docs.python.org/3/library/csv.html
            self.columns = next(csv_reader)
            data = []
            for row in csv_reader:
                processed_row = [None if value == '' else value for value in row]
                data.append(processed_row)
            self.data = list(zip(*data))
        
        del data;
        self.col_properties = [ColumnProperties() for i in range(len(self.columns))]
        return

    def to_csv(self, file_path):
        with open(file_path, 'w', newline='') as file: # newline????
            csv_writer = csv.writer(file) # https://docs.python.org/3/library/csv.html
            csv_writer.writerow(self.columns)
            csv_writer.writerows(self.data)

    def __getitem__(self, key):
        if isinstance(key, int):
            return DataColumn(self.data[key])
        elif isinstance(key, str):
            try:
                col_idx = self.columns.index(key)
                return DataColumn(self.data[col_idx])
            except ValueError:
                raise KeyError(f"Column '{key}' not found")

    def __setitem__(self, key, new_col_values):
        required_col_len = len(self.data[0])
        if isinstance(key, str):
            # If exists, find the column index, otherwise check if possible (corrent length) to create the column
            if key in self.columns:
                col_idx = self.columns.index(key)
            else:
                if len(new_col_values.as_list()) != required_col_len:
                    ValueError("Columns have incompatible lengths")
                else:
                    col_idx = len(self.columns) # b/c current length is 1 greater than current rightmost idn
                    self.columns.append(key)
                    self.data.append([None]*required_col_len)
        elif isinstance(key, int):
            col_idx = key
        else:
            raise TypeError("Key must be of the types 'Str' or 'Int'")
        if isinstance(new_col_values, DataColumn):
            if len(self.data[col_idx]) != len(new_col_values.as_list()):
                raise ValueError("Columns have incompatible lengths")
            self.data[col_idx] = new_col_values.as_list()
        elif isinstance(new_col_values, list):
            if len(self.data[col_idx]) != len(new_col_values):
                raise ValueError("Columns have incompatible lengths")
            self.data[col_idx] = new_col_values
        return
        
    def __len__(self):
        return len(self.data[0])
    
    def __repr__(self):
        # Human readable representation or informal, string, representation of the dataframe
        return str(self.rows(start_row=0,nrows=5,show_index=True)) #str(list(self.data))

    def __iter__(self):
        return iter(self.data)

    def rows(self,start_row=0,nrows=5,show_index=True):
        """Print nrows first rows of data
        """
        display_data = [] # each element to represent a row (instead of col as is in self.data
        col_width = 10
        prefix_extra_len = len(str(start_row+nrows))-1
        prefix_header1 = "| " 
        prefix_header2 = "| "
        prefix_line = "--"
        prefix_data = "f'| '"
        # Prepare prefix
        if show_index:
            prefix_header1 = f"{' ':>{1+prefix_extra_len}} |"
            prefix_header2 = f"{'i':>{1+prefix_extra_len}} |"
            prefix_line = "-"*(3+prefix_extra_len)
            prefix_data="f'{data_idx:>{1+prefix_extra_len}} |'"
        # Slice rows
        for col in df:
            col = list(it.islice(col,start_row,start_row+nrows))
            display_data.append(col)
        # Transpose for  printing row by row
        display_data = list(zip(*display_data))
        # Print header
        ## Row 1 (short name)
        print(prefix_header1,end=' ')
        for c in self.columns:
            print(f"{c:^{col_width}}",end = ' | ')
        ## Row 2 (dtypes)
        print("\n"+prefix_header2,end=' ')
        for c in self.col_properties:
            try:
                dtype = c.dtype
                text_to_print = ""
                if dtype==str:
                    text_to_print = pretty_string(f"{'str':>{col_width}}",'magenta')
                elif dtype==int or dtype==float:
                    text_to_print = pretty_string(f"{'num':>{col_width}}",'green')
                elif dtype==Category:
                    text_to_print = pretty_string(f"{'C':>{col_width}}",'yellow') ########################### Need to specify whether dummiefied already or not and how many cats
                else:
                    text_to_print = pretty_string(f"{'UNK':>{col_width}}",'red')
                print(text_to_print,end = ' | ')
            except:
                pass
        # Break line
        print("\n"+prefix_line+("-"*len(self.columns)*13))
        # Print rows, one col at a time
        for r in range(len(display_data)):
            data_idx = r + start_row
            print(eval(prefix_data),end=' ')
            for c in display_data[r]:
                text_to_print = "" # text to print for the current column, formatted below
                if isinstance(c,float):
                    text_to_print=f"{c:>{col_width},.1f}"
                elif c==None:
                    text_to_print = pretty_string(f"{'--':>{col_width}}",'red')
                elif isinstance(c,Category):
                    text_to_print=f"{c:>{col_width}}"
                else:
                    text_to_print=f"{c[:col_width]:>{col_width}}"
                print(text_to_print,end = ' | ')
            print('')
        # Return descriptive string
        return f"DataFrame with {len(self.columns)} columns and {len(self.data[0])} rows"

    def set_types(self,types):
        """Set each column's data type accordingly to types.

        Currently, only accepts dict.
        """
        if isinstance(types, dict):
            for col_name, col_type in types.items():
                self[col_name] = self[col_name].set_type(col_type)
                col_idx = self.columns.index(col_name)
                self.col_properties[col_idx].add_properties({'dtype':col_type})
        else:
            raise TypeError("Types param must be of the type 'Dict'")


In [74]:
# Scratch check of the cell formatting used by DataFrame.rows: slice the text
# to 7 characters, then right-align it in a 7-character field.
print(f"{'shortestsssssssl'[:7]:>7}|")

shortes|


In [75]:
# Element-wise addition of two equal-length columns via DataColumn.__add__.
col1 = DataColumn([1,3,8])
col2 = DataColumn([1,1,10])
#print(col1, col2)
col3 = col1 + col2
# print() falls back to DataColumn.__repr__ for the text shown below.
print(col3)

[2, 4, 18]
Column


In [76]:
# Start from an empty frame; the read_csv call in the next cell fills it in place.
df = DataFrame()

In [77]:
# Load the CSV into df in place: the first row becomes the column names and
# empty fields are stored as None. The path is relative to the working directory.
df.read_csv("input_test.csv")

In [78]:
# Display the frame: DataFrame.__repr__ shows the first 5 rows and a shape summary.
df

  |   col_a    |   col_b    |  col_num   | long_text_col | 
i | 
-------------------------------------------------------
0 |          a |          A |         11 | this is lo | 
1 |          b |          B |         21 | this is lo | 
2 |          c |          C |         13 | this is lo | 
3 |          d |          D |         23 | this is lo | 
4 |          e |          E |          8 | this is lo | 


DataFrame with 4 columns and 21 rows

In [84]:
# Show up to 20 rows starting at row index 10.
# NOTE(review): this cell's execution count (84) is out of sequence; its recorded
# output already shows the dtype tags and float values produced by the set_types
# cell further down. Re-run the notebook top to bottom to confirm the ordering.
df.rows(10,20)

   |   col_a    |   col_b    |  col_num   | long_text_col | 
 i | [35m       str[0m | [33m         C[0m | [32m       num[0m | 
--------------------------------------------------------
10 |          m |          A |       11.0 | this is lo | 
11 |          n |          B |       21.0 | this is lo | 
12 |          o |          C | [31m        --[0m | this is lo | 
13 |          p |          D |       23.0 | this is lo | 
14 |          q |          E |        8.0 | this is lo | 
15 |          r |          F |       12.0 | this is lo | 
16 |          s |          A |       11.0 | this is lo | 
17 |          t |          B |        2.0 | this is lo | 
18 |          u |          C |        2.0 | this is lo | 
19 |          v |          D |       82.0 | this is lo | 
20 |          x | [31m        --[0m |       13.0 | this is lo | 


'DataFrame with 4 columns and 21 rows'

In [80]:
# Convert the listed columns and record their dtypes. This mutates df in place,
# so re-running the cell converts already-converted values again.
df.set_types({'col_a':str,'col_b':Category,'col_num':float})

In [81]:
# Display the frame again; the header now carries the dtype tags set above.
df

  |   col_a    |   col_b    |  col_num   | long_text_col | 
i | [35m       str[0m | [33m         C[0m | [32m       num[0m | 
-------------------------------------------------------
0 |          a |          A |       11.0 | this is lo | 
1 |          b |          B |       21.0 | this is lo | 
2 |          c |          C |       13.0 | this is lo | 
3 |          d |          D |       23.0 | this is lo | 
4 |          e |          E |        8.0 | this is lo | 


DataFrame with 4 columns and 21 rows

In [82]:
# The returned value is the text wrapped in ANSI escape codes (34 selects blue,
# 0 resets); shown as a raw string here because the cell displays its repr.
pretty_string('this is just a test','blue')

'\x1b[34mthis is just a test\x1b[0m'