In [1]:
import pandas as pd
import numpy as np
from itertools import cycle
def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None):
    """Build a DataFrame of random data from a compact column-type specification.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str or sequence of str
        One type code per column: "c" (category), "i" (integer),
        "f" (float) or "d" (date). For example "cifd" yields four columns.
    col_names : list of str, optional
        Column names, one per entry of `cols`. When None, names of the form
        "column_<position>_<cat|int|float|date>" are generated.
    intervals : list or dict, optional
        Value ranges for the columns.
        * list: one entry per column; a None entry falls back to the default
          for that column's type.
        * dict: overrides of the per-type defaults, keyed by type code.
        * None: the defaults are used for every column.
        An interval is a ``(start, end)`` pair for "i", "f" and "d" columns.
        For "c" columns it is either a list of category values, or a
        ``(family, n)`` tuple that draws `n` distinct values from one of the
        built-in families ("animals", "names", "cities", "colors").
    seed : optional
        Forwarded to ``numpy.random.default_rng`` for reproducible output.

    Returns
    -------
    pandas.DataFrame
        A frame with `size` rows and one column per entry of `cols`.

    Raises
    ------
    AssertionError
        If `cols` contains an unknown type code, if a list `col_names` or
        `intervals` does not match the number of columns, if a dict
        `intervals` has unknown keys, if an interval is malformed, or if an
        unknown category family is requested.
    """
    categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }
    default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")}
    rng = np.random.default_rng(seed)

    # Reject unknown type codes up front: otherwise they are either silently
    # dropped from the result (list-form intervals) or surface as a bare KeyError.
    valid_codes = tuple(default_intervals)
    unknown_codes = [code for code in cols if code not in valid_codes]
    assert not unknown_codes, f"cols contains unsupported column type codes {unknown_codes}; valid codes are {list(valid_codes)}"

    # Default category columns rotate through the built-in families, starting
    # with the default one, so consecutive "c" columns get different content.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if isinstance(col_names,list):
        assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
    elif col_names is None:
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    if isinstance(intervals,list):
        assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
    else:
        if isinstance(intervals,dict):
            assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, f"The intervals parameter has invalid keys"
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]

    # Collect the columns first and build the frame once at the end instead of
    # inserting into a DataFrame column by column.
    columns = {}
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        # Type check comes first so a scalar interval fails with this message
        # rather than with a TypeError from len().
        assert isinstance(interval, list) or (isinstance(interval, tuple) and len(interval) == 2), f"This interval {interval} is neither a tuple of two elements nor a list of strings."
        if col == "c":
            if isinstance(interval, list):
                # Explicit list of category values supplied by the caller.
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default behaviour: take the next family in the rotation.
                    cat_family = next(cat_family)
                assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                # Pick `length` distinct values of the family to act as the categories.
                categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
            columns[col_name] = rng.choice(categories, size, shuffle = True)
        else:
            start, end = interval
            if col == "i":
                # Generator.integers excludes `end` by default.
                columns[col_name] = rng.integers(start, end, size)
            elif col == "f":
                columns[col_name] = rng.uniform(start, end, size)
            elif col == "d":
                # Sample from the dates that pd.date_range generates for this interval.
                columns[col_name] = rng.choice(pd.date_range(start, end), size)
    return pd.DataFrame(columns)

In [2]:
# Ten columns with default names and ranges; each letter of `cols` is one
# column type: c = category, i = integer, f = float, d = date.
fake_data_frame = generate_fake_dataframe(
    size=1000,
    cols="cififficcd",
)
fake_data_frame

Unnamed: 0,column_0_cat,column_1_int,column_2_float,column_3_int,column_4_float,column_5_float,column_6_int,column_7_cat,column_8_cat,column_9_date
0,Jennifer,5,33.384483,5,59.292018,37.371772,6,shrimp,Denver,2020-09-16
1,Ahmed,0,29.050320,8,99.424768,8.988284,5,horse,Palermo,2020-08-13
2,Jennifer,6,91.619556,8,2.515888,99.724090,4,cow,Denver,2020-08-13
3,Mary,0,14.336133,7,53.270380,21.002666,1,cow,Madrid,2020-01-21
4,Richard,8,57.320302,1,77.962934,60.294909,0,shrimp,Manila,2020-02-16
...,...,...,...,...,...,...,...,...,...,...
995,Patricia,0,33.429562,4,28.524364,71.662653,9,shrimp,Manila,2020-08-13
996,Jennifer,2,21.349447,1,74.714160,95.763668,4,pig,Manila,2020-09-25
997,Jennifer,7,39.246617,6,70.422090,33.083187,1,cow,Palermo,2020-11-17
998,Patricia,2,29.422925,0,77.960659,86.578007,0,pig,Marseille,2020-11-01


In [3]:
# Seeded example with custom column names; the dict form of `intervals`
# overrides the defaults for the float and date column types only.
df1 = generate_fake_dataframe(
    size=100,
    cols="cccfd",
    col_names=["name", "pet", "city", "height", "birthday"],
    intervals={
        "f": (1.72, 1.95),
        "d": ("1996-01-01", "1996-12-31"),
    },
    seed=42,
)

In [4]:
# List form of `intervals`: one entry per column, in column order.
# A None entry falls back to the default range for that column's type.
df2 = generate_fake_dataframe(
    size=30,
    cols="cicffcd",
    col_names=["user", "age", "residence", "weight", "height", "pet", "registered"],
    intervals=[
        ("names", 15),
        (18, 25),
        ("cities", 15),
        (73.2, 95.0),
        (1.65, 1.95),
        ("animals", 11),
        None,
    ],
    seed=None,
)