# Create Synthesized Datasets

In [None]:
#Faker library generates fake data for you
!pip install faker



Importing Libraries

In [None]:
import pandas as pd #for data manipulation and analysis
import uuid #Generates unique ids
import random #Generates random numbers
from faker import Faker

Creating variables

In [None]:
# Number of rows (users) to create
num_users = 100

# Seed Python's global random generator so that every column built with
# random.choices is identical on each Restart & Run All.
# This only covers the `random` module: uuid4 ids are not affected, and Faker
# exposes its own seeding API (Faker.seed) for reproducible names.
SEED = 42
random.seed(SEED)

Adding feature list

In [None]:
# The five columns every synthetic user record will carry
features = ["id", "gender", "subscriber", "name", "rating"]

# Start from an empty frame that already has those columns; each later
# section fills in one of them.
df = pd.DataFrame(columns=features)
df

Unnamed: 0,id,gender,subscriber,name,rating


## Generate unique identifiers

Python's `uuid` module is a great way to generate unique IDs for each user because the chance of it producing a duplicate ID is astronomically low.


In [None]:
# Populate the 'id' column with one random identifier per user.
# .node keeps only the last 48 bits of each random UUID, giving a plain integer id.
df['id'] = [uuid.uuid4().node for _ in range(num_users)]
df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,,,,
1,266208876814763,,,,
2,94781789620679,,,,
3,147188407183592,,,,
4,192158026215852,,,,
...,...,...,...,...,...
95,146967315342308,,,,
96,261205810492539,,,,
97,84028108729798,,,,
98,181391708896621,,,,


Checking if all IDs are unique

In [None]:
# True when the number of distinct ids equals the number of users requested
print(num_users == df['id'].nunique())

True


## Generating Gender column

In [None]:
genders = ["male", "female", "na"]

# Draw one gender per user: "male" and "female" are each weighted 45,
# against 10 for "na", so "na" is picked far less often.
df['gender'] = random.choices(
    genders,
    weights=(45, 45, 10),
    k=num_users,
)
df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,male,,,
1,266208876814763,female,,,
2,94781789620679,female,,,
3,147188407183592,female,,,
4,192158026215852,male,,,
...,...,...,...,...,...
95,146967315342308,male,,,
96,261205810492539,male,,,
97,84028108729798,male,,,
98,181391708896621,female,,,


In [None]:
# How many users were assigned each gender
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
male,44
female,43
na,13


## Generating Subscriber values

For this attribute, each value is chosen at random as either True or False.

In [None]:
choice = [True, False]

# No weights are given, so both values are equally likely for every user
df['subscriber'] = random.choices(
    choice,
    k=num_users,
)
df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,male,False,,
1,266208876814763,female,True,,
2,94781789620679,female,True,,
3,147188407183592,female,False,,
4,192158026215852,male,False,,
...,...,...,...,...,...
95,146967315342308,male,True,,
96,261205810492539,male,True,,
97,84028108729798,male,True,,
98,181391708896621,female,True,,


In [None]:
# Split of subscribers vs. non-subscribers
df['subscriber'].value_counts()

Unnamed: 0_level_0,count
subscriber,Unnamed: 1_level_1
False,50
True,50


## Generating name values

In [None]:
# Faker instance set up with the French locale
faker = Faker(locale="FR_FR")


def name_gen(gender):
    """Return a fake full name that matches ``gender``.

    "male" and "female" use Faker's gender-specific name generators; any
    other value (such as "na") falls back to the generic name generator.
    """
    if gender == "male":
        return faker.name_male()
    if gender == 'female':
        return faker.name_female()
    return faker.name()

In [None]:
# Build one name per user, in row order, from that user's gender
df['name'] = list(map(name_gen, df['gender']))
df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,male,False,Hugues Boyer,
1,266208876814763,female,True,Adélaïde du Ruiz,
2,94781789620679,female,True,Jacqueline Raymond,
3,147188407183592,female,False,Christiane Albert,
4,192158026215852,male,False,Émile Noël,
...,...,...,...,...,...
95,146967315342308,male,True,Jules Michaud,
96,261205810492539,male,True,Georges Laporte,
97,84028108729798,male,True,Emmanuel-Pierre Vallet,
98,181391708896621,female,True,Juliette Guillon,


In [None]:
# Count occurrences of each generated name (a count above 1 means a repeated name)
df['name'].value_counts()

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Hugues Boyer,1
Adélaïde du Ruiz,1
Jacqueline Raymond,1
Christiane Albert,1
Émile Noël,1
...,...
Jules Michaud,1
Georges Laporte,1
Emmanuel-Pierre Vallet,1
Juliette Guillon,1


## Generate Rating values
We can choose to skew the distribution of ratings from 1 to 5 towards the extremes, to reflect the tendency of users to be more absolute with their ratings.

In [None]:
# Available ratings: the whole numbers 1 through 5
ratings = list(range(1, 6))

In [None]:
# Weighted ratings with a skew towards the ends: 1 and 5 are each twice as
# likely as any of the middle ratings. (The previous weights, 5/10/10/10/5,
# did the opposite and favoured the middle of the scale.)
df['rating'] = random.choices(ratings, weights=(10, 5, 5, 5, 10), k=num_users)
df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,male,False,Hugues Boyer,3
1,266208876814763,female,True,Adélaïde du Ruiz,4
2,94781789620679,female,True,Jacqueline Raymond,2
3,147188407183592,female,False,Christiane Albert,2
4,192158026215852,male,False,Émile Noël,4
...,...,...,...,...,...
95,146967315342308,male,True,Jules Michaud,1
96,261205810492539,male,True,Georges Laporte,5
97,84028108729798,male,True,Emmanuel-Pierre Vallet,3
98,181391708896621,female,True,Juliette Guillon,1


In [None]:
# How many users gave each rating
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
2,28
4,24
3,19
5,15
1,14


# Convert dataframe to dataset
Export your generated synthetic data into a file format that can be easily shared or used for further analysis in other tools or programs.

In [None]:
# Write the finished frame to disk; the row index is saved as the first column
df.to_csv('dataset1.csv', index=True)

In [None]:
# Load the exported file back, using its first column as the row index,
# to confirm the data survived the round trip
csv_df = pd.read_csv(
    'dataset1.csv',
    index_col=0,
)
csv_df

Unnamed: 0,id,gender,subscriber,name,rating
0,266855871871661,male,False,Hugues Boyer,3
1,266208876814763,female,True,Adélaïde du Ruiz,4
2,94781789620679,female,True,Jacqueline Raymond,2
3,147188407183592,female,False,Christiane Albert,2
4,192158026215852,male,False,Émile Noël,4
...,...,...,...,...,...
95,146967315342308,male,True,Jules Michaud,1
96,261205810492539,male,True,Georges Laporte,5
97,84028108729798,male,True,Emmanuel-Pierre Vallet,3
98,181391708896621,female,True,Juliette Guillon,1
