<a href="https://colab.research.google.com/github/ivmarchuk/ssn-generation/blob/main/ssn_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker

In [None]:
from faker import Faker
from datetime import datetime
import pandas as pd
import datetime
import random
from random import randint
import sys
from faker.providers.ssn.pl_PL import Provider as SsnProvider

In [None]:
# NOTE: measure_time is defined in a later notebook cell; run that cell first.
@measure_time
def generate_ssn(num):
    """Generate random PESEL numbers with the Faker library.

    Args:
        num: how many PESEL values to generate.

    Returns:
        A pandas Series holding the generated PESEL values.
    """
    faker_instance = Faker()
    faker_instance.add_provider(SsnProvider)

    # Per the Faker docs, values drawn through `.unique` are guaranteed to be
    # unique for this specific Faker instance.
    generated = []
    for _ in range(num):
        generated.append(faker_instance.unique.ssn())

    return pd.Series(generated)

In [None]:
def valid_month_value(birth_date):
    """Return the two-digit month field (3rd and 4th digits) of a PESEL number.

    The PESEL month field encodes the century of birth by adding an offset to
    the calendar month:

        1800-1899 -> month + 80
        1900-1999 -> month (unchanged)
        2000-2099 -> month + 20
        2100-2199 -> month + 40
        2200-2299 -> month + 60

    Reference:
    https://www.oecd.org/tax/automatic-exchange/crs-implementation-and-assistance/tax-identification-numbers/Poland-TIN.pdf

    Args:
        birth_date: an object exposing ``year`` and ``month`` attributes
            (``datetime.date`` or ``datetime.datetime``).

    Returns:
        A zero-padded two-character string, e.g. ``'01'`` for January 1990
        or ``'21'`` for January 2005.

    Raises:
        ValueError: if the year is outside the supported 1800-2299 range.
    """
    # Offset added to the calendar month, keyed by the first two digits of the year.
    century_offsets = {18: 80, 19: 0, 20: 20, 21: 40, 22: 60}

    century = birth_date.year // 100
    if century not in century_offsets:
        raise ValueError(
            f'Birth year {birth_date.year} is outside the supported range 1800-2299'
        )

    # Add the offset to the integer month (not to its string form) and keep two digits.
    return f'{birth_date.month + century_offsets[century]:02d}'




def get_check_digit(pesel):
    """Compute the PESEL check digit from the first ten digits of `pesel`.

    Formula: 10 - last_digit_of(c1*1 + c2*3 + c3*7 + c4*9 + c5*1 + c6*3
    + c7*7 + c8*9 + c9*1 + c10*3), where a result of 10 becomes 0.
    Only positions 0-9 of `pesel` are read; anything after them is ignored.
    For more: https://www.oecd.org/tax/automatic-exchange/crs-implementation-and-assistance/tax-identification-numbers/Poland-TIN.pdf

    Args:
        pesel: indexable sequence whose first ten items convert with int().

    Returns:
        The check digit as an int in the range 0-9.
    """
    position_weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)

    weighted_total = 0
    for position, weight in enumerate(position_weights):
        weighted_total += int(pesel[position]) * weight

    # The outer modulo folds the "10 - 0 == 10" case down to 0.
    return (10 - weighted_total % 10) % 10


In [None]:
# NOTE: measure_time is defined in a later notebook cell; run that cell first.
@measure_time
def generate_unique_ssn(num, sex, start_date, end_date):
    """Generate `num` unique PESEL numbers for one sex and a birth-date range.

    Each PESEL is built as YYMMDD + three random digits + a sex digit + a
    check digit computed by get_check_digit().

    Args:
        num: number of unique PESEL values to generate.
        sex: 'M' (odd sex digit) or 'F' (even sex digit).
        start_date: first allowed birth date as a 'YYYY-MM-DD' string.
        end_date: last allowed birth date as a 'YYYY-MM-DD' string.

    Returns:
        A pandas Series of unique PESEL strings, or the string
        'Wrong sex char provided' when `sex` is neither 'M' nor 'F'
        (kept for backward compatibility with existing callers).

    Raises:
        ValueError: if a date string does not match '%Y-%m-%d', or if `num`
            is larger than the number of distinct PESELs the date range can
            produce (the loop could otherwise never terminate).
    """
    # G - sex digit: odd for male, even for female. Validated once, up front.
    if sex == 'M':
        sex_digits = (1, 3, 5, 7, 9)
    elif sex == 'F':
        sex_digits = (0, 2, 4, 6, 8)
    else:
        return 'Wrong sex char provided'

    start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')

    # The check digit is a function of the other ten digits, so the number of
    # distinct PESELs is bounded by days * 1000 (XXX) * 5 (sex digits).
    days_in_range = (end_date - start_date).days + 1
    max_unique = max(days_in_range, 0) * 1000 * len(sex_digits)
    if num > max_unique:
        raise ValueError(
            f'Cannot generate {num} unique values: the date range allows at most {max_unique}'
        )

    fake = Faker()
    pesels2 = []
    seen = set()  # O(1) membership test; the list only preserves generation order

    while len(pesels2) < num:
        birth_date = fake.date_between(start_date=start_date, end_date=end_date)

        # YY, MM (with century offset) and DD parts
        year = str(birth_date)[2:4]
        month = valid_month_value(birth_date)
        day = birth_date.strftime('%d')

        # XXX - three random digits
        xxx = f'{random.randrange(1000):03d}'

        # G - random digit of the requested parity
        s = str(random.choice(sex_digits))

        # First ten digits, then append the check digit
        pesel = year + month + day + xxx + s
        pesel += str(get_check_digit(pesel))

        # Skip values that were already generated
        if pesel in seen:
            continue

        seen.add(pesel)
        pesels2.append(pesel)

    return pd.Series(pesels2)


In [None]:
# ----------------------------------------------------------------------------------------------------------------------------------

In [None]:
# timer as decorator 
from functools import wraps
import time

def measure_time(func):
    """Decorator that prints the wall-clock duration of every call to `func`.

    The wrapped function's return value is passed through unchanged; the
    timing line is printed after the call returns.
    """
    def timed_call(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started_at
        print(f'Function {func.__name__}{args} {kwargs} Executed in {elapsed:.4f} seconds')
        return outcome

    # Copy func's name, docstring, etc. onto the wrapper.
    return wraps(func)(timed_call)

In [None]:
# Benchmark: generate 1,000 PESEL numbers with the Faker-backed generate_ssn.
pesel_series1 = generate_ssn(num=1_000)
pesel_series1

Executed in 0.0757 seconds

Length: 1000, dtype: object

In [None]:
# Benchmark: generate 10,000 PESEL numbers with the Faker-backed generate_ssn.
pesel_series1 = generate_ssn(num=10_000)
pesel_series1

Executed in 0.5105 seconds

Length: 10000, dtype: object

In [None]:
# Benchmark: generate 100,000 PESEL numbers with the Faker-backed generate_ssn.
pesel_series1 = generate_ssn(num=100_000)
pesel_series1

Executed in 4.8408 seconds

Length: 100000, dtype: object

In [None]:
# Count how many entries of pesel_series1 are flagged as duplicates
# (only a `False` row means every generated value is unique).
pesel_series1.duplicated().value_counts()

False    100000

dtype: int64

In [None]:
# Benchmark: generate 1,000 unique female PESEL numbers for births
# between 1990-01-01 and 1990-01-19 with generate_unique_ssn.
pesel_series2 = generate_unique_ssn(
    num=1_000, sex='F', start_date='1990-01-01', end_date='1990-01-19'
)
pesel_series2

Executed in 0.1173 seconds

Length: 1000, dtype: object

In [None]:
# Benchmark: generate 10,000 unique female PESEL numbers for births
# between 1990-01-01 and 1990-01-19 with generate_unique_ssn.
pesel_series2 = generate_unique_ssn(
    num=10_000, sex='F', start_date='1990-01-01', end_date='1990-01-19'
)
pesel_series2

Executed in 1.7615 seconds

Length: 10000, dtype: object

In [None]:
# Benchmark: generate 100,000 unique female PESEL numbers for births
# between 1990-01-01 and 1990-01-19 with generate_unique_ssn.
# NOTE(review): with 3 random digits and 5 even sex digits per day, 19 calendar
# days allow at most 19 * 1000 * 5 = 95,000 distinct values, which is fewer than
# the 100,000 requested -- verify the date range (assumes date_between only
# returns dates within [start_date, end_date]).
pesel_series2 = generate_unique_ssn(
    num=100_000, sex='F', start_date='1990-01-01', end_date='1990-01-19'
)
pesel_series2

Executed in 573.1589 seconds

Length: 10000, dtype: object

In [None]:
# Count how many entries of pesel_series2 are flagged as duplicates
# (only a `False` row means every generated value is unique).
pesel_series2.duplicated().value_counts()

False    100000

dtype: int64

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
def validate_ssns(pesel, sex=None, birth_date=None):
    """Check whether `pesel` is a well-formed PESEL number.

    A PESEL passes when all of the following hold:
      * it consists of exactly 11 decimal digits,
      * if `birth_date` is given, digits 1-6 equal YYMMDD of that date
        (the month carries the century offset from valid_month_value()),
      * if `sex` is given, the 10th digit is odd for 'M' or even for 'F',
      * the 11th digit equals the check digit from get_check_digit().

    Every check must pass; a later check never overrides an earlier failure.

    Args:
        pesel: the PESEL as a str or int.
            NOTE(review): an int cannot carry leading zeros, so PESELs whose
            year part starts with 0 presumably need to be passed as str.
        sex: optional 'M' or 'F'; any other non-empty value fails validation.
        birth_date: optional date object supporting str() as 'YYYY-MM-DD'
            and strftime().

    Returns:
        True if all applicable checks pass, otherwise False.
    """
    if isinstance(pesel, int):
        pesel = str(pesel)

    # Check that the PESEL is a string of exactly 11 decimal digits. Bail out
    # early so malformed input never reaches int() or the checksum helper.
    ascii_digits = '0123456789'
    if not (
        isinstance(pesel, str)
        and len(pesel) == 11
        and all(ch in ascii_digits for ch in pesel)
    ):
        return False

    # Check that YYMMDD matches the provided birth_date.
    if birth_date:
        year = str(birth_date)[2:4]
        month = valid_month_value(birth_date)
        day = birth_date.strftime('%d')
        if pesel[0:6] != year + month + day:
            return False

    # Check that the sex digit has the parity required by `sex`.
    if sex:
        sex_digit_is_odd = int(pesel[-2]) % 2 == 1
        if sex == 'M':
            if not sex_digit_is_odd:
                return False
        elif sex == 'F':
            if sex_digit_is_odd:
                return False
        else:
            return False

    # Checksum: the last digit must equal the computed check digit.
    return pesel[-1] == str(get_check_digit(pesel))


In [None]:
# Sample PESEL candidates (as ints) used to exercise validate_ssns;
# several have the wrong number of digits on purpose.
pesels_to_check = [
    900101244291111,
    90012130359772,
    90010273727,
    90010369048,
    90010363424,
    90010283867,
    90010148863,
    900102477122,
    90010110371,
]

In [None]:
# Validate every sample as a female PESEL and collect the verdicts,
# keyed by the original (int) candidate.
checked_pesels = {}
for pesel in pesels_to_check: 
  checked_pesels[pesel] = validate_ssns(pesel, sex = 'F')
# Display the resulting {candidate: is_valid} mapping.
checked_pesels

{90010110371: False,

 90010148863: True,

 90010273727: True,

 90010283867: True,

 90010363424: True,

 90010369048: True,

 900102477122: False,

 90012130359772: False,
 
 900101244291111: False}