In [1]:
from main import RandomDatasetGenerator

# Column specifications for the basic demo: each entry names a column and
# describes how its values are produced.
config = [
    {
        "name": "age",
        "type": "integer",
        "params": {"low": 18, "high": 65},
    },
    {
        "name": "income",
        "type": "float",
        "distribution": "normal",
        "params": {"loc": 50000, "scale": 15000},
    },
    {
        "name": "name",
        "type": "name",
        "choices": ["Alice", "Bob", "Charlie", "Diana", "Edward"],
    },
    {"name": "email", "type": "email"},
    {
        "name": "job",
        "type": "job",
        "choices": ["Engineer", "Doctor", "Teacher", "Artist"],
    },
]

# This single seeded generator is reused by the later cells.
# NOTE(review): its random state presumably advances with every call, so the
# recorded outputs would only be reproduced by running the notebook top to
# bottom from a fresh kernel — confirm.
generator = RandomDatasetGenerator(seed=42)
df = generator.generate_dataset(n_rows=10, columns_config=config)
df


Unnamed: 0,age,income,name,email,job
0,56,19835.556654,Diana,johnsonjoshua@example.org,Teacher
1,46,42607.948626,Charlie,jillrhodes@example.net,Doctor
2,32,55888.696256,Diana,garzaanthony@example.org,Artist
3,60,36062.229996,Diana,jesseguzman@example.net,Engineer
4,25,51197.477183,Alice,jennifermiles@example.com,Artist
5,38,47607.252475,Charlie,shaneramirez@example.org,Doctor
6,56,50333.327399,Edward,helenpeterson@example.org,Doctor
7,36,43583.106296,Charlie,susanrogers@example.org,Doctor
8,40,42022.738847,Edward,wdavis@example.net,Engineer
9,28,48237.867473,Alice,lindsay78@example.org,Doctor


In [2]:
# Sanity check: report the (rows, columns) shape of `df`.
print(f"\nDataset shape: {df.shape}")


Dataset shape: (10, 5)


In [None]:
# Richer schema: prefixed IDs, categorical picks, a weighted boolean flag,
# timestamps within a date range, and a city drawn from a fixed list.
advanced_config = [
    {
        "name": "user_id",
        "type": "custom",
        "params": {"prefix": "USER", "start": 1000},
    },
    {
        "name": "department",
        "type": "category",
        "choices": ["HR", "Engineering", "Marketing", "Sales"],
    },
    {
        "name": "salary",
        "type": "float",
        "distribution": "normal",
        "params": {"loc": 75000, "scale": 20000},
    },
    {
        "name": "is_active",
        "type": "boolean",
        "params": {"p_true": 0.8},
    },
    {
        "name": "join_date",
        "type": "datetime",
        "params": {"start": "2020-01-01", "end": "2024-12-31"},
    },
    {
        "name": "city",
        "type": "city",
        "choices": ["New York", "San Francisco", "Chicago", "Austin", "Seattle"],
    },
]

advanced_df = generator.generate_dataset(
    n_rows=15,
    columns_config=advanced_config,
)
advanced_df

Unnamed: 0,user_id,department,salary,is_active,join_date,city
0,USER-1000,HR,86322.256536,True,2024-08-18 10:12:34.919888,Seattle
1,USER-1001,Engineering,60910.930992,False,2023-04-03 20:47:58.436150,San Francisco
2,USER-1002,Sales,47441.213924,True,2024-07-28 17:11:34.155710,Chicago
3,USER-1003,Sales,67937.66696,True,2024-04-01 04:05:26.279017,New York
4,USER-1004,Marketing,65770.685637,True,2022-03-31 16:43:34.836357,San Francisco
5,USER-1005,Sales,76333.14557,True,2020-06-23 05:15:10.602273,San Francisco
6,USER-1006,Marketing,71474.286761,True,2021-11-08 02:44:20.703613,Austin
7,USER-1007,Sales,99017.855439,True,2023-05-06 07:17:56.604228,Seattle
8,USER-1008,HR,88967.978827,True,2023-04-30 23:22:52.883059,Chicago
9,USER-1009,Sales,71567.423295,True,2022-12-15 17:02:03.294612,New York


In [4]:
# Sanity check: report the (rows, columns) shape of `advanced_df`.
print(f"\nAdvanced dataset shape: {advanced_df.shape}")


Advanced dataset shape: (15, 6)


# 📈 Distribution Summary

## Available Distributions and Their Use Cases:

- **Normal**: Heights, weights, IQ scores, measurement errors  
- **Uniform**: Random selections, dice rolls, uniform sampling  
- **Exponential**: Wait times, time between events, failure times  
- **Poisson**: Count data, events per time period  
- **Lognormal**: Income, file sizes, stock prices (skewed positive)  
- **Binomial**: Number of successes in fixed trials  
- **Bernoulli**: Binary outcomes (success/failure)  
- **Gamma**: Processing times, rainfall, positive continuous data  
- **Beta**: Proportions, rates, probabilities (0–1 range)  
- **Pareto**: Wealth distribution, 80–20 phenomena  
- **Geometric**: Number of trials until first success  
- **Weibull**: Survival analysis, reliability engineering  
- **Chi-square**: Statistical testing, goodness of fit  
- **Rayleigh**: Wind speed, wave heights  
- **Zipf**: Word frequencies, popularity rankings

## Normal Distribution Examples

In [5]:
# Three normally distributed columns; `loc` is the mean and `scale` the
# standard deviation.
normal_config = [
    # mean=170, std=10
    {
        "name": "height_cm",
        "type": "float",
        "distribution": "normal",
        "params": {"loc": 170, "scale": 10},
    },
    # mean=70, std=15
    {
        "name": "weight_kg",
        "type": "float",
        "distribution": "normal",
        "params": {"loc": 70, "scale": 15},
    },
    # mean=100, std=15 (converted to int)
    {
        "name": "iq_score",
        "type": "integer",
        "distribution": "normal",
        "params": {"loc": 100, "scale": 15},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n1. Normal Distribution (Gaussian):",
    "   - height_cm: mean=170, std=10",
    "   - weight_kg: mean=70, std=15",
    "   - iq_score: mean=100, std=15 (as integers)",
    sep="\n",
)
normal_df = generator.generate_dataset(n_rows=8, columns_config=normal_config)
normal_df


1. Normal Distribution (Gaussian):
   - height_cm: mean=170, std=10
   - weight_kg: mean=70, std=15
   - iq_score: mean=100, std=15 (as integers)


Unnamed: 0,height_cm,weight_kg,iq_score
0,186.417711,83.389442,87
1,155.814035,68.421143,96
2,171.90337,55.669803,106
3,171.357538,63.77853,79
4,176.080897,49.018887,84
5,177.049813,64.838792,101
6,173.609234,81.261788,81
7,155.330321,65.085622,129


## Uniform Distribution Examples

In [6]:
# Three uniformly distributed columns bounded by `low` and `high`.
uniform_config = [
    # uniform between 0-100
    {
        "name": "random_score",
        "type": "float",
        "distribution": "uniform",
        "params": {"low": 0, "high": 100},
    },
    # uniform 1-6 (dice)
    # NOTE(review): confirm that `high` is inclusive for integer columns; with a
    # numpy-style exclusive upper bound a 6 could never be rolled.
    {
        "name": "dice_roll",
        "type": "integer",
        "distribution": "uniform",
        "params": {"low": 1, "high": 6},
    },
    # uniform 0-1
    {
        "name": "percentage",
        "type": "float",
        "distribution": "uniform",
        "params": {"low": 0.0, "high": 1.0},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n2. Uniform Distribution:",
    "   - random_score: uniform between 0-100",
    "   - dice_roll: uniform between 1-6",
    "   - percentage: uniform between 0.0-1.0",
    sep="\n",
)
uniform_df = generator.generate_dataset(n_rows=8, columns_config=uniform_config)
uniform_df


2. Uniform Distribution:
   - random_score: uniform between 0-100
   - dice_roll: uniform between 1-6
   - percentage: uniform between 0.0-1.0


Unnamed: 0,random_score,dice_roll,percentage
0,26.941233,2,0.033051
1,24.412552,4,0.345071
2,16.829104,4,0.634351
3,21.876422,1,0.680705
4,55.8102,5,0.530935
5,40.383617,2,0.447783
6,6.489225,5,0.552893
7,25.391541,3,0.592697


## Exponential Distribution Examples

In [7]:
# Three exponentially distributed columns; `scale` is the intended average.
exponential_config = [
    # avg wait time = 5 minutes
    {
        "name": "wait_time_minutes",
        "type": "float",
        "distribution": "exponential",
        "params": {"scale": 5.0},
    },
    # avg response = 200ms
    {
        "name": "server_response_ms",
        "type": "float",
        "distribution": "exponential",
        "params": {"scale": 200},
    },
    # avg time to failure = 1000 hours
    {
        "name": "failure_hours",
        "type": "integer",
        "distribution": "exponential",
        "params": {"scale": 1000},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n3. Exponential Distribution (for wait times, failures):",
    "   - wait_time_minutes: scale=5.0 (avg=5 min)",
    "   - server_response_ms: scale=200 (avg=200ms)",
    "   - failure_hours: scale=1000 (avg=1000 hours)",
    sep="\n",
)
exp_df = generator.generate_dataset(n_rows=8, columns_config=exponential_config)
exp_df


3. Exponential Distribution (for wait times, failures):
   - wait_time_minutes: scale=5.0 (avg=5 min)
   - server_response_ms: scale=200 (avg=200ms)
   - failure_hours: scale=1000 (avg=1000 hours)


Unnamed: 0,wait_time_minutes,server_response_ms,failure_hours
0,0.421548,320.239516,1658
1,2.307436,32.672691,428
2,1.386415,141.936132,101
3,8.126306,238.0224,2822
4,3.177228,390.891643,506
5,20.498738,78.89284,729
6,2.544341,49.754088,1818
7,8.475847,248.369228,1126


## Poisson Distribution Examples

In [8]:
# Three Poisson count columns; `lam` is the expected count per interval.
poisson_config = [
    # avg 12 emails per day
    {
        "name": "emails_per_day",
        "type": "integer",
        "distribution": "poisson",
        "params": {"lam": 12},
    },
    # avg 8 customers per hour
    {
        "name": "customers_per_hour",
        "type": "integer",
        "distribution": "poisson",
        "params": {"lam": 8},
    },
    # avg 2 defects per batch
    {
        "name": "defects_per_batch",
        "type": "integer",
        "distribution": "poisson",
        "params": {"lam": 2},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n4. Poisson Distribution (for count data):",
    "   - emails_per_day: λ=12 (avg 12 emails/day)",
    "   - customers_per_hour: λ=8 (avg 8 customers/hour)",
    "   - defects_per_batch: λ=2 (avg 2 defects/batch)",
    sep="\n",
)
poisson_df = generator.generate_dataset(n_rows=8, columns_config=poisson_config)
poisson_df


4. Poisson Distribution (for count data):
   - emails_per_day: λ=12 (avg 12 emails/day)
   - customers_per_hour: λ=8 (avg 8 customers/hour)
   - defects_per_batch: λ=2 (avg 2 defects/batch)


Unnamed: 0,emails_per_day,customers_per_hour,defects_per_batch
0,15,10,1
1,12,9,3
2,9,13,2
3,21,11,2
4,17,7,2
5,9,7,3
6,11,7,2
7,15,4,1


## Log-Normal Distribution Examples  

In [9]:
# Three log-normal columns for right-skewed, strictly positive quantities.
# NOTE(review): `mean` and `sigma` look like parameters of the underlying
# normal in log space (typical values near exp(mean)), not the mean and spread
# of the generated values themselves — confirm against the generator.
lognormal_config = [
    # log-normal income
    {
        "name": "income_usd",
        "type": "float",
        "distribution": "lognormal",
        "params": {"mean": 10.5, "sigma": 0.8},
    },
    # file sizes
    {
        "name": "file_size_mb",
        "type": "float",
        "distribution": "lognormal",
        "params": {"mean": 2.0, "sigma": 1.0},
    },
    # stock prices
    {
        "name": "stock_price",
        "type": "float",
        "distribution": "lognormal",
        "params": {"mean": 4.0, "sigma": 0.3},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n5. Log-Normal Distribution (for skewed positive data):",
    "   - income_usd: mean=10.5, σ=0.8 (income distribution)",
    "   - file_size_mb: mean=2.0, σ=1.0 (file sizes)",
    "   - stock_price: mean=4.0, σ=0.3 (stock prices)",
    sep="\n",
)
lognormal_df = generator.generate_dataset(n_rows=8, columns_config=lognormal_config)
lognormal_df


5. Log-Normal Distribution (for skewed positive data):
   - income_usd: mean=10.5, σ=0.8 (income distribution)
   - file_size_mb: mean=2.0, σ=1.0 (file sizes)
   - stock_price: mean=4.0, σ=0.3 (stock prices)


Unnamed: 0,income_usd,file_size_mb,stock_price
0,32123.10506,5.24288,45.701686
1,12754.871554,8.880479,44.522679
2,7387.320476,13.169416,50.449773
3,18287.277738,62.467177,50.189384
4,71111.496769,9.270607,45.549845
5,150999.65962,38.685375,46.303804
6,21121.937384,4.650964,75.87133
7,10518.985604,8.057198,102.918112


## Binomial Distribution Examples

In [10]:
# Three binomial columns: number of successes in `n` trials, each succeeding
# with probability `p`.
binomial_config = [
    # 20 trials, 30% success rate
    {
        "name": "successful_trials",
        "type": "integer",
        "distribution": "binomial",
        "params": {"n": 20, "p": 0.3},
    },
    # 100 people, 65% say yes
    {
        "name": "survey_yes_responses",
        "type": "integer",
        "distribution": "binomial",
        "params": {"n": 100, "p": 0.65},
    },
    # 50 items, 95% pass rate
    {
        "name": "quality_pass",
        "type": "integer",
        "distribution": "binomial",
        "params": {"n": 50, "p": 0.95},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n6. Binomial Distribution (for success/failure counts):",
    "   - successful_trials: n=20, p=0.3 (20 trials, 30% success)",
    "   - survey_yes_responses: n=100, p=0.65 (100 people, 65% yes)",
    "   - quality_pass: n=50, p=0.95 (50 items, 95% pass)",
    sep="\n",
)
binomial_df = generator.generate_dataset(n_rows=8, columns_config=binomial_config)
binomial_df


6. Binomial Distribution (for success/failure counts):
   - successful_trials: n=20, p=0.3 (20 trials, 30% success)
   - survey_yes_responses: n=100, p=0.65 (100 people, 65% yes)
   - quality_pass: n=50, p=0.95 (50 items, 95% pass)


Unnamed: 0,successful_trials,survey_yes_responses,quality_pass
0,3,57,46
1,9,66,47
2,9,61,49
3,5,58,47
4,2,64,47
5,9,62,49
6,6,61,48
7,6,65,48


## Bernoulli Distribution Examples (special case of binomial)

In [11]:
# Three Bernoulli (0/1) columns; `p` is the probability of a 1.
bernoulli_config = [
    # fair coin (50-50)
    {
        "name": "coin_flip",
        "type": "integer",
        "distribution": "bernoulli",
        "params": {"p": 0.5},
    },
    # 25% email open rate
    {
        "name": "email_opened",
        "type": "integer",
        "distribution": "bernoulli",
        "params": {"p": 0.25},
    },
    # 3% ad click rate
    {
        "name": "ad_clicked",
        "type": "integer",
        "distribution": "bernoulli",
        "params": {"p": 0.03},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n7. Bernoulli Distribution (for binary outcomes):",
    "   - coin_flip: p=0.5 (fair coin)",
    "   - email_opened: p=0.25 (25% open rate)",
    "   - ad_clicked: p=0.03 (3% click rate)",
    sep="\n",
)
bernoulli_df = generator.generate_dataset(n_rows=8, columns_config=bernoulli_config)
bernoulli_df


7. Bernoulli Distribution (for binary outcomes):
   - coin_flip: p=0.5 (fair coin)
   - email_opened: p=0.25 (25% open rate)
   - ad_clicked: p=0.03 (3% click rate)


Unnamed: 0,coin_flip,email_opened,ad_clicked
0,0,1,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,1,0,1
6,1,0,1
7,0,0,0


## Gamma Distribution Examples

In [12]:
# Three gamma columns for positive continuous quantities, each configured by a
# `shape` and a `scale`.
gamma_config = [
    # processing times
    {
        "name": "processing_time",
        "type": "float",
        "distribution": "gamma",
        "params": {"shape": 2.0, "scale": 3.0},
    },
    # rainfall amounts
    {
        "name": "rainfall_mm",
        "type": "float",
        "distribution": "gamma",
        "params": {"shape": 1.5, "scale": 10.0},
    },
    # battery life
    {
        "name": "battery_life_hours",
        "type": "float",
        "distribution": "gamma",
        "params": {"shape": 5.0, "scale": 20.0},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n8. Gamma Distribution (for positive continuous data):",
    "   - processing_time: shape=2.0, scale=3.0",
    "   - rainfall_mm: shape=1.5, scale=10.0",
    "   - battery_life_hours: shape=5.0, scale=20.0",
    sep="\n",
)
gamma_df = generator.generate_dataset(n_rows=8, columns_config=gamma_config)
gamma_df


8. Gamma Distribution (for positive continuous data):
   - processing_time: shape=2.0, scale=3.0
   - rainfall_mm: shape=1.5, scale=10.0
   - battery_life_hours: shape=5.0, scale=20.0


Unnamed: 0,processing_time,rainfall_mm,battery_life_hours
0,3.307433,8.563474,124.199695
1,6.73633,17.351998,118.460634
2,5.357026,32.660287,71.901918
3,5.327307,4.096191,44.444188
4,3.286789,3.756864,20.597928
5,1.371095,23.157478,151.921886
6,1.03537,6.49869,33.028954
7,5.312285,1.832662,121.683606


## Beta Distribution Examples

In [13]:
# Three beta columns for proportions in the 0-1 range; `a` and `b` are the
# two shape parameters (printed below as α and β).
beta_config = [
    # completion rates (0-1)
    {
        "name": "completion_rate",
        "type": "float",
        "distribution": "beta",
        "params": {"a": 2.0, "b": 3.0},
    },
    # satisfaction (skewed high)
    {
        "name": "satisfaction_score",
        "type": "float",
        "distribution": "beta",
        "params": {"a": 5.0, "b": 2.0},
    },
    # low conversion rate
    {
        "name": "conversion_rate",
        "type": "float",
        "distribution": "beta",
        "params": {"a": 1.0, "b": 9.0},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n9. Beta Distribution (for proportions between 0-1):",
    "   - completion_rate: α=2.0, β=3.0",
    "   - satisfaction_score: α=5.0, β=2.0 (skewed high)",
    "   - conversion_rate: α=1.0, β=9.0 (low rates)",
    sep="\n",
)
beta_df = generator.generate_dataset(n_rows=8, columns_config=beta_config)
beta_df


9. Beta Distribution (for proportions between 0-1):
   - completion_rate: α=2.0, β=3.0
   - satisfaction_score: α=5.0, β=2.0 (skewed high)
   - conversion_rate: α=1.0, β=9.0 (low rates)


Unnamed: 0,completion_rate,satisfaction_score,conversion_rate
0,0.315865,0.563127,0.076653
1,0.377951,0.767452,0.148537
2,0.111648,0.861354,0.076858
3,0.251346,0.495156,0.079056
4,0.080499,0.778567,0.074958
5,0.229166,0.771453,0.022209
6,0.542396,0.800072,0.061688
7,0.280937,0.79807,0.229748


## Pareto Distribution Examples (80-20 rule)

In [14]:
# Three Pareto columns for heavy-tailed ("80-20") quantities; `a` is the shape.
# NOTE(review): draws appear to start at 0 (Lomax-style, as numpy's `pareto`
# produces) rather than at a minimum of 1, so the integer columns come out
# mostly 0 — confirm, and shift/rescale if realistic magnitudes are wanted.
pareto_config = [
    # Pareto principle (80-20)
    {
        "name": "wealth_distribution",
        "type": "float",
        "distribution": "pareto",
        "params": {"a": 1.16},
    },
    # city size distribution
    {
        "name": "city_population",
        "type": "integer",
        "distribution": "pareto",
        "params": {"a": 1.5},
    },
    # web traffic distribution
    {
        "name": "website_traffic",
        "type": "integer",
        "distribution": "pareto",
        "params": {"a": 2.0},
    },
]

# Banner describing the columns, emitted one line per argument.
print(
    "\n10. Pareto Distribution (for 80-20 type phenomena):",
    "   - wealth_distribution: a=1.16 (80-20 rule)",
    "   - city_population: a=1.5",
    "   - website_traffic: a=2.0",
    sep="\n",
)
pareto_df = generator.generate_dataset(n_rows=8, columns_config=pareto_config)
pareto_df


10. Pareto Distribution (for 80-20 type phenomena):
   - wealth_distribution: a=1.16 (80-20 rule)
   - city_population: a=1.5
   - website_traffic: a=2.0


Unnamed: 0,wealth_distribution,city_population,website_traffic
0,1.110927,1,0
1,0.433684,0,0
2,0.943145,2,0
3,0.701287,0,0
4,1.133329,10,0
5,0.553948,0,0
6,1.804516,0,2
7,0.186668,2,0
