In [1]:
# Tutorial on: https://blog.dennisokeeffe.com/blog/2021-08-11-generating-fake-csv-data-with-python

from faker import Faker

# Faker instance used by the following cells to generate fake values.
fake = Faker()

In [2]:
def capitalize(str):
    """Return ``str`` with its first character upper-cased and the rest lower-cased.

    The parameter name shadows the built-in ``str`` type inside this function;
    it is left unchanged so existing callers keep working.
    """
    capitalized = str.capitalize()
    return capitalized
# Build a fake movie title: capitalise each generated word and join them with spaces.
words = fake.words()
capitalized_words = [capitalize(word) for word in words]
movie_name = " ".join(capitalized_words)
print(movie_name)

Around To Change


In [3]:
from datetime import datetime

# Format the datetime returned by fake.date_time_this_decade() as "<Month name> <day>, <year>".
date = fake.date_time_this_decade().strftime("%B %d, %Y")
print(date)  # e.g. April 30, 2020

October 16, 2020


In [4]:
# creating a provider for genre
from faker.providers import BaseProvider
import random

# create new provider class
class GenereProvider(BaseProvider):
    """Faker provider that adds a ``movie_genre()`` generator.

    The class name keeps its original spelling ("Genere") because other code
    in this notebook registers the provider under that name.
    """

    def movie_genre(self):
        """Return one movie genre picked at random from a fixed list of eight."""
        # Draw through the provider's own helper instead of the global `random`
        # module, so the pick uses the Faker generator's random source and is
        # reproducible when Faker is seeded.
        return self.random_element(['Documentary', 'Thriller', 'Mystery', 'Horror', 'Action', 'Comedy', 'Drama', 'Romance'])

# Register the provider on the Faker instance so its methods can be called on `fake`.
fake.add_provider(GenereProvider)

# The provider's method is now available directly on `fake`.
movie_genre = fake.movie_genre()
print(movie_genre) # e.g. Horror (the value is random)

Mystery


In [6]:
# creating a provider for language
from faker.providers import BaseProvider
import random

# create new provider class
class LanguageProvider(BaseProvider):
    """Faker provider that adds a ``language()`` generator."""

    def language(self):
        """Return one language name picked at random from a fixed list of six."""
        # Draw through the provider's own helper instead of the global `random`
        # module, so the pick uses the Faker generator's random source and is
        # reproducible when Faker is seeded.
        return self.random_element(['English', 'Chinese', 'Italian', 'Spanish', 'Hindi', 'Japanese'])

# Register the provider on the Faker instance so its methods can be called on `fake`.
fake.add_provider(LanguageProvider)

# The provider's method is now available directly on `fake`.
language = fake.language()
print(language) # e.g. Spanish (the value is random)

Japanese


In [7]:
# Random movie length: an integer from 50 up to and including 149.
movie_len = random.randint(50, 149)
print(movie_len)  # e.g. 143

109


In [8]:
# Random movie rating drawn between 1.0 and 5.0, rounded to one decimal place.
random_rating = round(random.uniform(1.0, 5.0), ndigits=1)
print(random_rating)  # e.g. 2.2

3.4


In [47]:
from faker import Faker
from faker.providers import BaseProvider
import random
import csv
from datetime import date
from dateutil.relativedelta import relativedelta

# Faker instance used by get_subscription_start_timestamp() below.
fake = Faker()

# Create subscriptions dataset

def get_subscription_id():
    """Return a random integer subscription id from 1 to 4999 inclusive.

    Ids are drawn independently, so repeated calls can return the same value.
    """
    return random.randint(1, 4999)

def get_user_id():
    """Return a random integer user id from 1 to 4999 inclusive.

    Ids are drawn independently, so repeated calls can return the same value.
    """
    return random.randint(1, 4999)

def get_subscription_start_timestamp():
    """Return a random subscription start datetime generated by Faker."""
    # start_date='-5y': no start is generated earlier than five years ago.
    starts_at = fake.date_time_between(start_date='-5y')
    return starts_at

def get_subscription_end_timestamp(start_timestamp=None):
    """Return an end timestamp a whole number of months after a subscription start.

    Args:
        start_timestamp: datetime the subscription starts at. When omitted, a
            fresh random start is drawn with get_subscription_start_timestamp(),
            which matches the original zero-argument behaviour.

    Returns:
        ``start_timestamp`` shifted forward by 1 to 59 calendar months.
    """
    if start_timestamp is None:
        start_timestamp = get_subscription_start_timestamp()
    # randrange(1, 60) yields 1..59, so the end is always after the start.
    return start_timestamp + relativedelta(months=random.randrange(1, 60))

def get_subscription_plan_id():
    """Return a random plan id; randrange(1, 3) yields either 1 or 2."""
    return random.randrange(1, 3)

def generate_subscription():
    """Build one CSV row: [subscription_id, starts_at, ends_at, plan_id]."""
    subscription_id = get_subscription_id()
    # Draw the start once and derive the end from it. Previously the end was
    # computed from a second, unrelated random start, so a row's end timestamp
    # could fall before its own start timestamp.
    starts_at = get_subscription_start_timestamp()
    ends_at = get_subscription_end_timestamp(starts_at)
    return [subscription_id, starts_at, ends_at, get_subscription_plan_id()]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write (e.g. Windows) get a blank line
# after every row.
with open('subscriptions_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row; column order matches the list returned by generate_subscription().
    writer.writerow(['subscription_id', 'subscription_starts_at', 'subscription_ends_at', 'subscription_plan_id'])
    # range(1, 10) has nine values, so nine data rows follow the header.
    for n in range(1, 10):
        writer.writerow(generate_subscription())

In [None]:
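# Create users dataset
# FIXME: this cell does not generate user data yet -- it duplicates the subscriptions cell above
# (same helper names, same 'subscriptions_data.csv' output file) and would overwrite that file when run.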

def get_subscription_id():
    """Return a random integer subscription id from 1 to 4999 inclusive.

    Ids are drawn independently, so repeated calls can return the same value.
    """
    return random.randint(1, 4999)

def get_user_id():
    """Return a random integer user id from 1 to 4999 inclusive.

    Ids are drawn independently, so repeated calls can return the same value.
    """
    return random.randint(1, 4999)

def get_subscription_start_timestamp():
    """Return a random subscription start datetime generated by Faker."""
    # start_date='-5y': no start is generated earlier than five years ago.
    starts_at = fake.date_time_between(start_date='-5y')
    return starts_at

def get_subscription_end_timestamp(start_timestamp=None):
    """Return an end timestamp a whole number of months after a subscription start.

    Args:
        start_timestamp: datetime the subscription starts at. When omitted, a
            fresh random start is drawn with get_subscription_start_timestamp(),
            which matches the original zero-argument behaviour.

    Returns:
        ``start_timestamp`` shifted forward by 1 to 59 calendar months.
    """
    if start_timestamp is None:
        start_timestamp = get_subscription_start_timestamp()
    # randrange(1, 60) yields 1..59, so the end is always after the start.
    return start_timestamp + relativedelta(months=random.randrange(1, 60))

def get_subscription_plan_id():
    """Return a random plan id; randrange(1, 3) yields either 1 or 2."""
    return random.randrange(1, 3)

def generate_subscription():
    """Build one CSV row: [subscription_id, starts_at, ends_at, plan_id]."""
    subscription_id = get_subscription_id()
    # Draw the start once and derive the end from it. Previously the end was
    # computed from a second, unrelated random start, so a row's end timestamp
    # could fall before its own start timestamp.
    starts_at = get_subscription_start_timestamp()
    ends_at = get_subscription_end_timestamp(starts_at)
    return [subscription_id, starts_at, ends_at, get_subscription_plan_id()]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write (e.g. Windows) get a blank line
# after every row.
with open('subscriptions_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row; column order matches the list returned by generate_subscription().
    writer.writerow(['subscription_id', 'subscription_starts_at', 'subscription_ends_at', 'subscription_plan_id'])
    # range(1, 10) has nine values, so nine data rows follow the header.
    for n in range(1, 10):
        writer.writerow(generate_subscription())

In [60]:
import pandas as pd
import numpy as np
from faker import Faker
from faker.providers import BaseProvider
import random
import csv
from itertools import cycle
from datetime import date
from dateutil.relativedelta import relativedelta

# Faker instance; not referenced by the code visible in this cell.
fake = Faker()

def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None):
    """Build a DataFrame of random data with one column per type code in ``cols``.

    Args:
        size: number of rows to generate.
        cols: sequence of one-character type codes (typically a string such as
            ``"cccfd"``): ``"i"`` integers, ``"f"`` floats, ``"c"`` categorical
            values, ``"d"`` dates.
        col_names: optional list of column names, one per entry in ``cols``.
            When None, names of the form ``column_<position>_<int|float|cat|date>``
            are generated.
        intervals: either a list with one entry per column (an entry may be None
            to fall back to the default for that type code), or a dict keyed by
            type code that overrides the default for every column of that type.
            For ``"i"``, ``"f"`` and ``"d"`` an entry is a ``(start, end)`` pair.
            For ``"c"`` it is either ``(family, n_categories)``, where ``family``
            is a key of the built-in category table, or an explicit list of
            category values.
        seed: forwarded to ``numpy.random.default_rng``; pass a value to get
            reproducible output.

    Returns:
        pandas.DataFrame with ``size`` rows and one column per entry in ``cols``.

    Raises:
        AssertionError: on unknown type codes, list arguments whose length does
            not match ``cols``, invalid ``intervals`` keys or shapes, or an
            unknown category family.
    """

    categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }
    default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")}

    # Reject unknown type codes up front. Without this check an unknown code is
    # silently skipped when `intervals` is a list (the column is dropped from
    # the result) and surfaces as a bare KeyError otherwise.
    unknown_codes = [col for col in cols if col not in default_intervals]
    assert not unknown_codes, f"Unknown column type code(s) {unknown_codes} in cols; valid codes are {list(default_intervals)}"

    rng = np.random.default_rng(seed)

    # Categorical columns that use the default interval take their values from
    # successive families: the default family first, then the remaining ones,
    # repeating once all have been used.
    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])

    if isinstance(col_names,list):
        assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
    elif col_names is None:
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    if isinstance(intervals,list):
        assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
    else:
        if isinstance(intervals,dict):
            assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, "The intervals parameter has invalid keys"
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]
    df = pd.DataFrame()
    # Columns are generated strictly in order; every draw advances the shared
    # generator, so the order of these calls determines the seeded output.
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        assert (len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list), f"This interval {interval} is neither a tuple of two elements nor a list of strings."
        if col in ("i","f","d"):
            start, end = interval
        if col == "i":
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = rng.uniform(start, end, size)
        elif col == "c":
            if isinstance(interval, list):
                # An explicit list of category values was supplied.
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    # Default interval: take the next family from the rotation.
                    cat_family = next(cat_family)
                assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                # Pick `length` distinct values from the family, then sample rows from them.
                categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
            df[col_name] = rng.choice(categories, size, shuffle = True)
        elif col == "d":
            # Sample from the dates produced by pd.date_range(start, end).
            df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df


# Example: ten rows with three categorical columns, one float column and one
# date column; the dict overrides the default ranges for the float and date codes.
df1 = generate_fake_dataframe(
    size=10,
    cols="cccfd",
    col_names=["name", "pet", "city", "height", "birthday"],
    intervals={"f": (1.72, 1.95), "d": ("1996-01-01", "1996-12-31")},
    seed=42,
)

# Show the first rows as the cell output.
df1.head()

Unnamed: 0,name,pet,city,height,birthday
0,Ahmed,crab,Oslo,1.877101,1996-02-26
1,Elizabeth,crab,Marseille,1.891295,1996-07-25
2,Juan Luis,crab,Marseille,1.942527,1996-09-02
3,Mary,chicken,Marseille,1.79494,1996-12-10
4,Mary,pig,Milwaukee,1.805206,1996-06-08
