# Install Packages

In [15]:
%pip install faker google-generativeai -q
# faker generates the synthetic personal details (names, emails, addresses, ...);
# google-generativeai is the client library used below to call the Gemini model.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.

Note: you may need to restart the kernel to use updated packages.


# Fake data creation

In [16]:
from faker import Faker
from  tqdm.auto import tqdm
import pandas as pd
import random
import threading
# Single Faker instance shared by all of the data-generation cells below.
fake = Faker()

In [17]:
def generate_fake_social_media_urls(num_urls=1):
    """Build fake social-media profile URLs and return the first one.

    ``num_urls`` URLs are generated, each joining a randomly chosen platform
    prefix with a Faker user name, but only the first URL is returned (as a
    single string, not a list). Indexing the empty result raises
    ``IndexError`` when ``num_urls`` is less than 1.
    """
    # Platform name -> URL prefix that the user name is appended to.
    url_prefix_by_platform = {
        'LinkedIn': 'linkedin.com/in/',
        'YouTube': 'youtube.com/c/',
        'Instagram': 'instagram.com/',
        'GitHub': 'github.com/',
        'Facebook': 'facebook.com/',
        'Twitter': 'twitter.com/'
    }
    prefixes = list(url_prefix_by_platform.values())

    urls = []
    for _ in range(num_urls):
        # The user name is drawn before the platform on every iteration.
        handle = fake.user_name()
        prefix = random.choice(prefixes)
        urls.append('https://' + prefix + handle)

    return urls[0]

In [4]:
NUM = 5000
_SEED = 93

# Relative odds of a field receiving 0, 1, 2, 3 or 4 values: most fields get
# exactly one value, a few get none or several.
_VALUE_COUNTS = [0, 1, 2, 3, 4]
_VALUE_COUNT_WEIGHTS = [0.05, 0.7, 0.2, 0.025, 0.025]


def _sample_value_count():
    """Draw how many values a single PII field should contain."""
    return random.choices(_VALUE_COUNTS, weights=_VALUE_COUNT_WEIGHTS, k=1)[0]


def _fake_name():
    """Return a full name (most often), or only a first or last name."""
    # random.choices treats weights as relative, so they need not sum to 1.
    make_name = random.choices(
        [fake.name, fake.first_name, fake.last_name],
        weights=[0.6, 0.15, 0.15],
        k=1,
    )[0]
    return make_name()


def _fake_id_num():
    """Return an identifier in one of five equally weighted formats."""
    make_id = random.choices(
        [fake.ssn, fake.passport_number, fake.bban, fake.iban, fake.license_plate],
        weights=[0.20, 0.20, 0.20, 0.20, 0.20],
        k=1,
    )[0]
    return make_id()


# One value generator per PII field, listed in the order the fields are filled.
_FIELD_GENERATORS = {
    "NAME_STUDENT": _fake_name,
    "EMAIL": fake.ascii_free_email,
    "USERNAME": fake.user_name,
    "ID_NUM": _fake_id_num,
    "PHONE_NUM": fake.phone_number,
    "URL_PERSONAL": lambda: generate_fake_social_media_urls(1),
    "STREET_ADDRESS": fake.address,
}

# Seed both random sources. Faker.seed() seeds Faker's own generator, while the
# value counts, provider picks and platform picks come from the stdlib `random`
# module; without seeding it too, the dataset differs on every run.
Faker.seed(_SEED)
random.seed(_SEED)

fake_identities = []
for _ in tqdm(range(NUM)):
    identity = {}
    for field, make_value in _FIELD_GENERATORS.items():
        # The count is drawn first, then that many values are generated.
        identity[field] = [make_value() for _ in range(_sample_value_count())]
    fake_identities.append(identity)

100%|██████████| 5000/5000 [00:03<00:00, 1291.85it/s]


In [5]:
# Inspect the first generated identity to check the field structure.
print(fake_identities[0])

{'NAME_STUDENT': ['Kathleen Knight'], 'EMAIL': ['nicole15@gmail.com', 'peterfrench@hotmail.com'], 'USERNAME': ['harveyrachel'], 'ID_NUM': ['561T1'], 'PHONE_NUM': ['001-233-472-9494x53711'], 'URL_PERSONAL': [], 'STREET_ADDRESS': ['14039 Anna Drives\nScottland, NM 52053']}


# Model Configuration

In [6]:
# Every harm category listed here is given the same "BLOCK_NONE" threshold.
# NOTE(review): HARM_CATEGORY_DANGEROUS and HARM_CATEGORY_DANGEROUS_CONTENT
# look like two names for the same category — confirm both are needed.
_HARM_CATEGORIES = (
    "HARM_CATEGORY_DANGEROUS",
    "HARM_CATEGORY_HARASSMENT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
)

safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in _HARM_CATEGORIES
]

In [7]:
import google.generativeai as genai

# The API key lives in a local file so it never appears in the notebook itself.
with open('google.txt') as f:
    # strip() removes the trailing newline most editors append; left in place
    # it would become part of the key sent with every request.
    api_key = f.read().strip()
if not api_key:
    # Fail here rather than on every one of the later generation requests.
    raise ValueError("google.txt is empty; expected it to contain the API key")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-pro")

# Dataset Generation

In [8]:
def generate_paragraph(data):
    """Ask the model for an essay that embeds the given fake PII values.

    Args:
        data: Mapping with the keys NAME_STUDENT, EMAIL, USERNAME, ID_NUM,
            PHONE_NUM, URL_PERSONAL and STREET_ADDRESS. Each value is
            interpolated into the prompt as-is (the prompt describes them
            as lists of values).

    Returns:
        The generated essay text, or None when generating the content or
        reading the response text raises an exception.
    """
    # EMAIL must be listed alongside the other fields: the stored dataset
    # labels each essay with its emails, so the model has to be given them.
    prompt=f'''You are an essay writer. You will be given some personal information about the student writing it, like name email etc as a dictionary. You have to write an essay that includes all the given information somewhere in the essay do not miss out any.
Given information has the following keys:
NAME_STUDENT -{data['NAME_STUDENT']}
EMAIL - {data['EMAIL']}
USERNAME - {data['USERNAME']}
ID_NUM - {data['ID_NUM']}
PHONE_NUM - {data['PHONE_NUM']}
URL_PERSONAL - {data['URL_PERSONAL']}
STREET_ADDRESS - {data['STREET_ADDRESS']}

Description of keys are : 
NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

Each of these will have a list of values associated with them. Ensure that you include each and every one of them in the essay. Do not miss out any.
'''
    try:
        response = model.generate_content(prompt, safety_settings=safety_settings)
        return response.text
    except Exception:
        # Best effort: a failed request yields None so the caller can drop the
        # row. Catching Exception (not a bare except) keeps KeyboardInterrupt
        # able to stop a long-running generation loop.
        return None

In [9]:
# Try the prompt on a single identity before launching the full run.
generate_paragraph(fake_identities[0])

"**The Importance of Education in Shaping One's Future**\n\nEducation serves as a cornerstone in shaping our lives, providing us with the tools and knowledge necessary to navigate the complexities of the world. As Kathleen Knight, a dedicated student with the username 'harveyrachel,' I have witnessed firsthand the transformative power of education.\n\nGrowing up at 14039 Anna Drives in the quaint town of Scotland, New Mexico, I have long recognized the privilege of accessing quality education. From a young age, I eagerly attended classes, immersing myself in a world of ideas and knowledge. Through the guidance of dedicated educators, I discovered a passion for learning that has fueled my academic journey.\n\nWith a student ID number of 561T1, I have diligently pursued my studies, recognizing the immense value of education in empowering individuals. As I embarked on my academic endeavors, I found solace in the words of renowned philosophers and scholars, who instilled in me a deep appre

In [10]:
# Output table: the generated essay followed by one column per PII field.
_DATASET_COLUMNS = [
    "Essay",
    'EMAIL',
    'USERNAME',
    'ID_NUM',
    'PHONE_NUM',
    'URL_PERSONAL',
    'STREET_ADDRESS',
    'NAME_STUDENT',
]
data_set = pd.DataFrame(columns=_DATASET_COLUMNS)

In [11]:
# PII columns stored next to each essay, in the dataset's column order.
_PII_FIELDS = ['EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS', 'NAME_STUDENT']

# Rows are collected in a plain list and the DataFrame is built once at the
# end, so re-running this cell starts from scratch instead of appending
# duplicates. If the loop is interrupted, the rows gathered so far remain
# available in `essay_records`.
essay_records = []
for i in tqdm(range(NUM)):
    try:
        identity = fake_identities[i]
        record = {"Essay": generate_paragraph(identity)}
        record.update((field, identity[field]) for field in _PII_FIELDS)
    except Exception:
        # Best effort: skip an identity that cannot be processed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        continue
    essay_records.append(record)

data_set = pd.DataFrame(essay_records, columns=["Essay", *_PII_FIELDS])

100%|██████████| 5000/5000 [9:45:17<00:00,  7.02s/it]   


In [12]:
# generate_paragraph() returns None when a request fails, so those rows have
# no essay; drop them. Rebinding the name instead of using inplace=True gives
# the same result without mutating the frame in place.
data_set = data_set.dropna()

In [13]:
# Preview the first rows; a gap in the index marks a row that was dropped
# above because its essay was missing.
data_set.head()

Unnamed: 0,Essay,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,NAME_STUDENT
0,"Amidst the tapestry of life's endeavors, the p...","[nicole15@gmail.com, peterfrench@hotmail.com]",[harveyrachel],[561T1],[001-233-472-9494x53711],[],"[14039 Anna Drives\nScottland, NM 52053]",[Kathleen Knight]
1,"In the realm of academia, meticulous research ...",[mcintyrejonathan@yahoo.com],[hickskevin],[ANQB09700862848243],[7506215370],[https://github.com/rogerrogers],"[04068 Ethan Terrace Suite 705\nBenjaminshire,...","[Melissa Dunn, Holland]"
2,"In the tapestry of academic endeavors, where k...","[steindavid@hotmail.com, eryan@yahoo.com]","[bmcmahon, jamesoliver]","[8-X2370, 624673676]",[776-306-3905],[https://instagram.com/garciajames],[],[Kevin]
3,"In the tapestry of academic endeavors, Mark Ju...",[],[guycurtis],[GB46HJVQ38621522031234],[+1-263-742-3945x474],[https://twitter.com/alison25],[838 Lindsay Summit Apt. 231\nPort Deborahberg...,"[Mark, Julia]"
5,"In the annals of learning and personal growth,...",[lisamitchell@yahoo.com],[brooksnicole],[SRPE03795714171310],[001-775-667-9644x45995],[https://github.com/johnsonmichael],"[0128 Robert Fords Suite 581\nLake Patrick, IL...",[Henry Mason]


In [14]:
# Persist the generated dataset next to the notebook.
# NOTE(review): the index is written as an unnamed first column and the
# list-valued PII columns are presumably stored as their string form, so
# readers would need to parse them back — confirm against downstream code.
data_set.to_csv("pii_gemini93.csv")