In [1]:
from sys import getsizeof
from tabulate import tabulate
from itertools import zip_longest
from IPython.core.display import display, HTML
import numpy as np
import pyarrow as pa


class Table:
    """Small column-oriented table backed by a NumPy structured array.

    Each column is a named field of ``self.data``; each row is one record.
    Columns of unequal length are padded with ``None``.

    Parameters
    ----------
    data : dict, optional
        Mapping of column name -> list of values. Falsy input (``None`` or an
        empty dict) creates an empty table.
    types : list of (name, dtype) tuples, optional
        Structured dtype description for the columns. When omitted, every
        column of ``data`` is stored with the generic object dtype ``"O"``.
    """

    __slots__ = ["data"]

    def __init__(self, data=None, types=None):
        if data:
            if types is None:
                # Without an explicit dtype NumPy would build a plain 2-D array
                # with no field names; default to one object field per column.
                types = [(name, "O") for name in data]
            self.data = np.array(list(zip_longest(*data.values())), dtype=types)
        else:
            self.data = np.array([])

    @property
    def data_types(self):
        """List of ``(name, dtype)`` descriptors, ``[]`` when there are no columns."""
        # Test for "no fields" rather than "no rows": a table may have columns
        # but zero records, and its schema must not be reported as empty.
        if self.data.dtype.names is None:
            return []
        return self.data.dtype.descr

    @property
    def length(self):
        """Number of rows."""
        return len(self.data)

    @property
    def width(self):
        """Number of columns."""
        return len(self.names)

    @property
    def names(self):
        """List of column names, ``[]`` when there are no columns."""
        field_names = self.data.dtype.names
        return list(field_names) if field_names is not None else []

    @property
    def info(self):
        """One-line summary: ``Table(shape=(<width>x<length>),bytesize=<n>)``."""
        info = (
            "Table("
            f"shape=({self.width}x{self.length}),"
            f"bytesize={getsizeof(self.data)}"
            ")"
        )
        return info

    @property
    def to_arrow(self):
        """The table converted to a ``pyarrow.Table`` (one array per column)."""
        arrow_table = pa.Table.from_arrays(
            [pa.array(self.data[name]) for name in self.names],
            names=self.names
        )
        return arrow_table

    def rows(self, start=None, stop=None):
        """Return a generator over the records in ``[start, stop)``.

        ``None`` means "from the first row" / "to the last row". An explicit
        ``stop=0`` yields nothing.
        """
        if start is None:
            start = 0
        if stop is None:
            stop = self.length
        return (row for row in self.data[start:stop])

    def indices(self, start=None, stop=None, step=None):
        """Return ``range(start, stop, step)`` over row positions.

        ``start`` defaults to 0, ``stop`` to the table length and ``step`` to 1.
        An explicit ``stop=0`` gives an empty range.
        """
        if start is None:
            start = 0
        if stop is None:
            stop = self.length
        if not step:
            # A step of 0 is invalid for range(); keep treating it as "default".
            step = 1
        return range(start, stop, step)

    def __getitem__(self, selector):
        """Read a column (``table["Col"]``) or a cell (``table["Col", row]``).

        Raises
        ------
        TypeError
            If ``selector`` is neither a string nor a ``(name, index)`` tuple.
        """
        if isinstance(selector, tuple):
            # TODO: write code for handling slices, integers, strings and more.
            name, index = selector
            return self.data[name][index]
        if isinstance(selector, str):
            return self.data[selector]
        raise TypeError(
            "Table selector must be a column name or a (name, index) tuple, "
            f"got {type(selector).__name__}"
        )

    def __setitem__(self, selector, value):
        """Write a cell (``table["Col", row] = v``) or a column (``table["Col"] = v``).

        A column that does not exist yet is created first, filled with ``None``.
        For whole-column assignment:

        * a scalar or a one-element list is broadcast to every row;
        * a list longer than the table grows the table (new cells are ``None``);
        * a list shorter than the table is padded with ``None``.

        Raises
        ------
        TypeError
            If ``selector`` is neither a string nor a ``(name, index)`` tuple.
        """
        if isinstance(selector, tuple):
            # TODO: write code for handling slices, integers, strings and more.
            name, index = selector
            if name not in self.names:
                self.new_column(name)
            self.data[name][index] = value
        elif isinstance(selector, str):
            if selector not in self.names:
                self.new_column(selector)
            column = self.value_to_list(value)
            self.align_length(column)
            if len(column) != 1 and len(column) < self.length:
                # Mirror the constructor, which pads short columns with None,
                # instead of failing with a NumPy broadcasting error.
                column = column + [None] * (self.length - len(column))
            self.data[selector] = column
        else:
            raise TypeError(
                "Table selector must be a column name or a (name, index) tuple, "
                f"got {type(selector).__name__}"
            )

    def value_to_list(self, value):
        """Wrap ``value`` in a list unless it already is one."""
        if not isinstance(value, list):
            value = [value]
        return value

    def align_length(self, value):
        """Append ``None``-filled rows until the table is as long as ``value``."""
        missing = len(value) - self.length
        if missing <= 0 or not self.names:
            return
        blank_row = tuple([None] * self.width)
        padding = np.array([blank_row] * missing, dtype=self.data.dtype)
        # Grow in a single step. In-place ndarray.resize() one row at a time is
        # quadratic and raises when the array is referenced elsewhere.
        self.data = np.concatenate([self.data, padding])

    def new_column(self, name, data_type="O"):
        """Add a column called ``name`` (dtype ``data_type``) filled with ``None``.

        Raises
        ------
        ValueError
            If a column with that name already exists.
        """
        if name in self.names:
            raise ValueError(f"column {name!r} already exists")
        columns = [list(self.data[existing]) for existing in self.names]
        columns.append([None] * self.length)
        types = self.data_types + [(name, data_type)]

        # All columns have the same length here, so plain zip() is sufficient.
        self.data = np.array(list(zip(*columns)), dtype=types)

    def __str__(self):
        """Summary line followed by a GitHub-style text table.

        The string is returned rather than printed, so ``str(table)`` has no
        side effects; ``print(table)`` produces the same text as before.
        """
        body = tabulate(
            self.data, headers="keys", showindex="always", tablefmt="github", disable_numparse=True
        )
        return f"{self.info} \n\n{body}"

    def __repr__(self):
        """Side-effect-free summary; notebooks use ``_repr_html_`` for rich output."""
        return self.info

    def _repr_html_(self):
        """HTML rendering picked up by Jupyter/IPython when the table is displayed."""
        from html import escape  # local import: only needed for rich display

        parts = ["<table>\n<tr>\n<th></th>\n"]
        for column in self.names:
            field_type = self.data.dtype[column]
            parts.append(
                f"<th>{escape(str(column))}<br>{escape(str(field_type))}</th>\n"
            )
        parts.append("</tr>\n")

        for idx, row in enumerate(self.data):
            parts.append(f"<tr>\n<td>{idx}</td>\n")
            # Escape cell text so values containing '<' or '&' cannot break the markup.
            parts.extend(f"<td>{escape(str(elem))}</td>\n" for elem in row)
            parts.append("</tr>\n")
        parts.append("</table>\n")
        parts.append(f"<p>{escape(self.info)}</p>")
        return "".join(parts)
    

In [2]:
# Demo input: column name -> values. The columns are deliberately ragged;
# Table pads the shorter ones with None.
data = {
    "ColA": [1, 2, 3, 4],
    "ColB": [1],
    "ColC": [4, 2, 7, 1],
    "ColD": [8, 3],
    "ColE": [1, 4, 3, 1, 1, 1],
    "ColF": [3, 2, 4, 2],
}
# Every column is stored with NumPy's generic object dtype ("O").
types = [(column, "O") for column in data]

In [3]:
# Build the demo table from the ragged columns and their dtype description.
table = Table(data=data, types=types)

In [4]:
# Plain-text rendering via Table.__str__: the summary line, then a
# GitHub-style table produced by tabulate.
print(table)

Table(shape=(6x6),bytesize=384) 

|    | ColA   | ColB   | ColC   | ColD   | ColE   | ColF   |
|----|--------|--------|--------|--------|--------|--------|
| 0  | 1      | 1      | 4      | 8      | 1      | 3      |
| 1  | 2      |        | 2      | 3      | 4      | 2      |
| 2  | 3      |        | 7      |        | 3      | 4      |
| 3  | 4      |        | 1      |        | 1      | 2      |
| 4  |        |        |        |        | 1      |        |
| 5  |        |        |        |        | 1      |        |


In [5]:
# "ColG" does not exist yet: Table.__setitem__ first creates it (filled with
# None) and then writes False into row 4.
table["ColG", 4] = False

In [6]:
# Fill ColH with the mean of ColA and ColF for every row where both are present.
for idx in table.indices():
    col_a, col_f = table["ColA", idx], table["ColF", idx]
    # Compare against None explicitly: a truthiness test would also skip rows
    # whose value is a legitimate 0.
    if col_a is not None and col_f is not None:
        table["ColH", idx] = (col_a + col_f) / 2

In [7]:
# Seven values for a six-row table: assigning the column makes the table
# grow by one row, with None in the other columns of the new row.
table["ColI"] = list("abcdefg")

In [8]:
# Leave the table as the cell's last expression so the notebook renders it.
table

Unnamed: 0,ColA object,ColB object,ColC object,ColD object,ColE object,ColF object,ColG object,ColH object,ColI object
0,1.0,1.0,4.0,8.0,1.0,3.0,,2.0,a
1,2.0,,2.0,3.0,4.0,2.0,,2.0,b
2,3.0,,7.0,,3.0,4.0,,3.5,c
3,4.0,,1.0,,1.0,2.0,,3.0,d
4,,,,,1.0,,False,,e
5,,,,,1.0,,,,f
6,,,,,,,,,g


Table(shape=(9x7),bytesize=600)

In [9]:
import pyarrow.parquet as pq

def _parquet_path(file_name, extension, directory=""):
    """Return the path ``<directory>/<file_name>.<extension>`` as a string.

    Uses ``os.path.join`` so ``directory`` works with or without a trailing
    separator; an empty ``directory`` gives a path relative to the working
    directory.
    """
    import os  # local import keeps this cell runnable on its own

    return os.path.join(directory, f"{file_name}.{extension}")


def table_to_parquet(table, file_name, extension, directory=""):
    """Write ``table`` to ``<directory>/<file_name>.<extension>`` as parquet.

    Parameters
    ----------
    table : Table
        Table to persist; it is converted through its ``to_arrow`` property.
    file_name : str
        File name without extension.
    extension : str
        File extension without the leading dot.
    directory : str, optional
        Target directory. Defaults to the current working directory.
    """
    file_path = _parquet_path(file_name, extension, directory)
    pq.write_table(table.to_arrow, file_path)


def table_from_parquet(file_name, extension, directory=""):
    """Read ``<directory>/<file_name>.<extension>`` and return it as a ``Table``.

    Every column is loaded with the object dtype ``"O"``, regardless of the
    type stored in the parquet schema.
    """
    file_path = _parquet_path(file_name, extension, directory)
    arrow_table = pq.read_table(file_path)
    # Iterating a pyarrow schema yields its fields; only the names are used.
    column_types = [(field.name, "O") for field in arrow_table.schema]
    return Table(arrow_table.to_pydict(), column_types)

In [10]:
# Target of the parquet round-trip below.
file_name, extension = "TestTable", "parquet"
# NOTE(review): absolute, machine-specific Windows path; the notebook only
# re-runs elsewhere if this directory exists. Consider a configurable path.
directory = "C:\\AnalyticsPy\\tests\\"

In [11]:
# Persist the demo table as a parquet file.
table_to_parquet(table, file_name, extension, directory=directory)

In [12]:
# Load the file written above back into a new Table.
tb = table_from_parquet(file_name, extension, directory=directory)

In [13]:
# Render the table that was loaded back from the parquet file.
tb

Unnamed: 0,ColA object,ColB object,ColC object,ColD object,ColE object,ColF object,ColG object,ColH object,ColI object
0,1.0,1.0,4.0,8.0,1.0,3.0,,2.0,a
1,2.0,,2.0,3.0,4.0,2.0,,2.0,b
2,3.0,,7.0,,3.0,4.0,,3.5,c
3,4.0,,1.0,,1.0,2.0,,3.0,d
4,,,,,1.0,,False,,e
5,,,,,1.0,,,,f
6,,,,,,,,,g


Table(shape=(9x7),bytesize=600)