In [20]:
import faker
import pandas as pd
import faker_commerce
import random
from datetime import date, timedelta
fake = faker.Faker()
fake.add_provider(faker_commerce.Provider)

pd.set_option('display.max_colwidth', None)

## Create fake companies

In [None]:
## Utilities

In [4]:
def get_unique_fakes(provider, num_records):
    # To ensure we get enough records we will initially generate 2X the number needed 
    records = [provider() for x in range(num_records * 2)]
    unique_records = list(set(records))
    if len(unique_records) < num_records:
        raise Exception('Not enough unique records. Try generating fewer records.')
    return unique_records[0:num_records]


## Create fake companies

In [5]:
num_companies = 10_000
company_name_provider = lambda : fake.company() + ' ' + fake.company_suffix()  # Adding suffix give more uniqe names
company_names = get_unique_fakes(company_name_provider, num_companies)
company_slogans = get_unique_fakes(fake.catch_phrase, num_companies)
company_purposes = get_unique_fakes(fake.bs, num_companies)

companies = zip(company_names, company_slogans, company_purposes)
company_records = [{'name': company[0], 'slogan': company[1], 'purpose': company[2]} for company in companies]
companies_df = pd.DataFrame(company_records)
companies_df.index.rename('id', inplace=True)
companies_df.head()
companies_df.to_csv('../seeds/sources/fake_companies.csv')


Unnamed: 0_level_0,name,slogan,purpose
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Moreno, Ramirez and Williams Ltd",Enterprise-wide intermediate framework,productize viral experiences
1,"Bush, Tanner and Alexander Ltd",User-friendly user-facing customer loyalty,innovate mission-critical niches
2,Armstrong LLC LLC,Up-sized secondary hardware,orchestrate ubiquitous relationships
3,"Henderson, Flowers and Best LLC",Networked asymmetric moratorium,disintermediate B2B users
4,"Leonard, Hamilton and Harper PLC",Enterprise-wide incremental matrices,scale real-time supply-chains


## Create fake people

In [6]:
fake.zipcode_in_state()

'37019'

In [7]:
companies_df

Unnamed: 0_level_0,name,slogan,purpose
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Moreno, Ramirez and Williams Ltd",Enterprise-wide intermediate framework,productize viral experiences
1,"Bush, Tanner and Alexander Ltd",User-friendly user-facing customer loyalty,innovate mission-critical niches
2,Armstrong LLC LLC,Up-sized secondary hardware,orchestrate ubiquitous relationships
3,"Henderson, Flowers and Best LLC",Networked asymmetric moratorium,disintermediate B2B users
4,"Leonard, Hamilton and Harper PLC",Enterprise-wide incremental matrices,scale real-time supply-chains
...,...,...,...
9995,Marshall LLC Group,Triple-buffered client-driven synergy,reinvent web-enabled info-mediaries
9996,"Lee, Powell and Jackson and Sons",Virtual needs-based installation,grow efficient solutions
9997,Hendrix-Kaufman PLC,Reduced directional customer loyalty,revolutionize distributed ROI
9998,Mcdonald-Cummings and Sons,Customer-focused even-keeled info-mediaries,optimize extensible convergence


## Create dates 

In [8]:
start_date = date(2020, 1, 1)
end_date = date(2029, 12, 31)
num_added_days = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=day) for day in range(num_added_days)]
dates_df = pd.DataFrame({'date': dates})
dates_df.head()
dates_df.to_csv('../seeds/sources/fake_dates.csv', index=False)

Unnamed: 0,date
0,2020-01-01
1,2020-01-02
2,2020-01-03
3,2020-01-04
4,2020-01-05


## Create number range

In [10]:
numbers = []
for number in range(1, 101):
    numbers.extend([number] * number)
numbers_df = pd.DataFrame({'number': numbers})
numbers_df.head(10)
numbers_df.to_csv('../seeds/sources/fake_numbers.csv', index=False)

Unnamed: 0,number
0,1
1,2
2,2
3,3
4,3
5,3
6,4
7,4
8,4
9,4


## Create fake products

In [11]:
fake.ecommerce_name()

'Mouse'

In [12]:
fake.ecommerce_category()

'Automotive'

In [13]:
fake.ecommerce_price()

87798985

In [14]:
random.randrange(3, 2500) + random.randrange(0, 99) / 100

198.46

In [15]:
def generate_product():
    return {
        'category': fake.ecommerce_category(),
        'name': fake.ecommerce_name(),
        'price': random.randrange(3, 2500) + random.randrange(0, 99) / 100
    }
            
products = [generate_product() for i in range(10_000)]
products_df = pd.DataFrame(products)
products_df.index.rename('id', inplace=True)
products_df
products_df.to_csv('../seeds/sources/fake_products.csv', index=True)


Unnamed: 0_level_0,category,name,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Clothing,Handcrafted Rubber Ball,902.07
1,Tools,Soap,73.97
2,Books,Awesome Granite Fish,955.65
3,Computers,Unbranded Cotton Cheese,1953.43
4,Automotive,Fresh Hat,1624.35
...,...,...,...
9995,Grocery,Wooden Chicken,1272.48
9996,Electronics,Chicken,1511.51
9997,Home,Generic Bike,1144.81
9998,Electronics,Hat,579.23


## Create fake People info

In [24]:
def generate_people_info():
    country_code = random.choice(['+1', '+44', '+91', '+81', '+86'])  # Add more country codes as needed
    phone_number = fake.numerify('##########')  # Assuming 10-digit phone numbers

    return {
        'phone_number': f'{country_code} - {phone_number}',
        'address': {
            'street_address': fake.street_address(),
            'city': fake.city(),
            'state': fake.state(),
            'zipcode': fake.zipcode(),
        },
        'birthdate': fake.date_of_birth(minimum_age=18, maximum_age=80).strftime('%Y-%m-%d'),
        'blood_type': random.choice(['A+', 'B+', 'AB+', 'O+', 'A-', 'B-', 'AB-', 'O-']),  # Less common column
        'favorite_color': fake.color_name(),  # Less common column
        'credit_score': random.randint(300, 850),  # Less common column
    }

p_info = [generate_people_info() for _ in range(10_000)]
p_info_df = pd.DataFrame(p_info)
p_info_df.index.rename('id', inplace=True)

# Display the DataFrame
p_info_df.head()
p_info_df.to_csv('../seeds/sources/fake_personal_info.csv')
