In [None]:
from dataclasses import dataclass
import numpy as np
import pandas as pd

# Short identifiers of the tracked languages (they look like file extensions).
# Each one is a column of `repo`.
SUPPORTED_LANGUAGES = [
    "js", "html", "css", "java", "py", "cpp", "rb", "php", "go", "rs",
    "swift", "kt", "ts", "cs", "dart", "lua", "sh", "pl", "r", "scala",
    "hs", "vb", "m", "jl", "fs", "ex", "sql", "h"
]
# Extra columns appended after the language columns; currently none.
OPTIONAL_RELEASE_FEATURES = []
# Column order of `repo`: the date first, then the languages, then the optional columns.
FEATURES = ["date"] + SUPPORTED_LANGUAGES + OPTIONAL_RELEASE_FEATURES
# Columns of the repository-level frame. They mirror the field names of
# RepositoryStatistics (including the "countributor_count" spelling), so a
# rename has to be done in both places at once.
REPOSITORY_FEATURES = ["name", "file_counts", "release_count", "size", "stars", "forks", "countributor_count", "clone_count", "country"]

# Settings for the fake data below.
FAKE_SNAPSHOT_COUNT = 9
FAKE_DATA_SEED = 42

# FAKE DATA GENERATION
# Fill the DataFrame with fake data for demonstration purposes.
# A dedicated seeded generator makes the table identical on every run, and the
# frame is built in a single step instead of being grown one row at a time.
rng = np.random.default_rng(FAKE_DATA_SEED)
fake_dates = pd.date_range("2023-01-01", periods=FAKE_SNAPSHOT_COUNT, freq="D")
# One row per date, one column per language; values lie in [0, 100).
fake_counts = rng.integers(0, 100, size=(FAKE_SNAPSHOT_COUNT, len(SUPPORTED_LANGUAGES)))

repo = (
    pd.DataFrame(fake_counts, columns=SUPPORTED_LANGUAGES)
    .assign(date=fake_dates)
    # Enforce the FEATURES column order. Columns listed in
    # OPTIONAL_RELEASE_FEATURES have no fake values and come out as NaN.
    .reindex(columns=FEATURES)
)

repo

Unnamed: 0,date,js,html,css,java,python,cpp,rb,php,go,...,r,scala,hs,vb,m,jl,fs,ex,sql,h
0,2023-01-01 00:00:00,50,54,93,44,37,95,42,47,43,...,76,60,57,64,14,63,26,24,45,28
1,2023-01-02 00:00:00,19,68,24,5,44,89,96,2,97,...,48,39,30,43,61,27,64,20,9,59
2,2023-01-03 00:00:00,4,92,96,71,24,55,79,72,10,...,78,17,28,49,64,62,69,51,59,82
3,2023-01-04 00:00:00,15,58,34,19,74,61,63,56,70,...,55,12,31,20,94,87,34,97,33,17
4,2023-01-05 00:00:00,76,76,9,3,59,72,38,83,78,...,24,82,34,24,47,38,82,49,24,56
5,2023-01-06 00:00:00,54,7,30,91,48,85,94,55,23,...,61,63,75,54,86,53,77,96,15,79
6,2023-01-07 00:00:00,78,89,46,3,11,93,95,85,6,...,72,27,43,10,75,20,53,82,17,40
7,2023-01-08 00:00:00,91,46,51,61,78,85,12,53,11,...,90,27,88,35,46,63,36,62,40,68
8,2023-01-09 00:00:00,15,73,10,82,17,0,38,34,53,...,62,35,70,68,0,38,9,55,92,27


In [3]:
@dataclass
class RepositoryStatistics:
    """
    Repository-level statistics, meant to be stored as one row of a DataFrame.

    The field names double as the column names of that DataFrame (they match
    REPOSITORY_FEATURES), which is why the "countributor_count" spelling is
    kept unchanged here.
    """
    name: str
    file_counts: int
    release_count: int
    size: int
    stars: int
    forks: int
    countributor_count: int
    clone_count: int
    country: str

    def add_to_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds the current dataclass as a new last row of the given dataframe.
        Changes are done in place so that we don't necessarily need to use the returned dataframe.

        The new row is labelled ``len(df)`` whenever that label is free, which is
        always the case for a default 0..n-1 index. If the label is already in
        use (for instance after rows were dropped), the next unused integer is
        taken instead so that an existing row is never overwritten.

        Args:
            df (pd.DataFrame): The dataframe to which the dataclass will be added

        Returns:
            pd.DataFrame: The updated dataframe (the same object as ``df``)
        """
        row_label = len(df)
        # Skip labels that already exist; assigning to one would replace that row.
        while row_label in df.index:
            row_label += 1
        # The Series is indexed by field name, so values are matched to columns by name.
        df.loc[row_label] = pd.Series(vars(self))
        return df

def create_empty_repository_statistics_dataframe() -> pd.DataFrame:
    """
    Builds a DataFrame without rows whose columns are the names listed in REPOSITORY_FEATURES.

    Returns:
        pd.DataFrame: A DataFrame with the REPOSITORY_FEATURES columns and no rows
    """
    empty_statistics = pd.DataFrame(columns=REPOSITORY_FEATURES)
    return empty_statistics

def create_fake_repository_statistics(i: int) -> RepositoryStatistics:
    """
    Builds one RepositoryStatistics instance filled with random demonstration values.

    Args:
        i (int): Number used to build the repository name ("repo_<i>")

    Returns:
        RepositoryStatistics: An instance named "repo_<i>" whose numeric fields
        and country are drawn from NumPy's global random state
    """
    # Values are drawn in field order; randint upper bounds are exclusive.
    fake_values = {
        "name": f"repo_{i}",
        "file_counts": np.random.randint(1, 1000),
        "release_count": np.random.randint(1, 50),
        "size": np.random.randint(100, 10000),
        "stars": np.random.randint(0, 5000),
        "forks": np.random.randint(0, 3000),
        "countributor_count": np.random.randint(1, 100),
        "clone_count": np.random.randint(1, 10000),
        "country": np.random.choice(["USA", "Canada", "UK", "Germany", "France"]),
    }
    return RepositoryStatistics(**fake_values)

# Start from a frame that has the repository columns but no rows.
general = create_empty_repository_statistics_dataframe()

# FAKE DATA GENERATION
# Append 15 fake repositories for demonstration purposes.
for i in range(15):
    rep_stat = create_fake_repository_statistics(i)
    # add_to_dataframe appends in place and hands back the very same frame.
    general = rep_stat.add_to_dataframe(general)

general

Unnamed: 0,name,file_counts,release_count,size,stars,forks,countributor_count,clone_count,country
0,repo_0,253,31,427,737,1692,91,1126,USA
1,repo_1,233,45,1121,1740,2575,65,8513,Canada
2,repo_2,368,47,1650,3321,630,81,3917,USA
3,repo_3,380,1,3046,4513,449,44,9742,USA
4,repo_4,247,8,9538,2968,1963,21,1870,Germany
5,repo_5,271,35,6636,449,695,32,7852,Canada
6,repo_6,150,47,8369,3690,2513,55,4762,Canada
7,repo_7,718,10,8755,2783,2562,78,1666,France
8,repo_8,724,20,3458,2908,2832,84,1502,France
9,repo_9,833,43,6201,3042,1205,93,3432,Canada


In [4]:
@dataclass
class TestClass:
    """Scratch dataclass with two defaulted fields, used to inspect an instance's ``__dict__``."""
    attribute1: int = 0
    attribute2: str = "tttt"


test_instance = TestClass()
# Iterating a dict yields its keys, so this lists the instance attribute names.
list(vars(test_instance))

['attribute1', 'attribute2']