# Chunkify

You have a file that needs to be divided into n chunks. While it would be straightforward to split the file into equal-sized pieces by byte count and write each piece to its own file, you cannot write any incomplete lines to the files. This means that none of the n files you create may contain a truncated line. If a split at a certain byte size would result in a truncated line, back off and write only up to the previous complete line; the remainder is saved for the next chunk.


**Validation Tests** <br>
Check for corner cases and constraints in the inputs; list all cases used for testing.

In [None]:
# the file name must be given as a string
assert isinstance(fname, str)
# the number of segments must be an integer ...
assert isinstance(n, int)
# ... and there must be at least one segment
assert n > 0

**Functional Tests** <br>
Check that the function output matches the expected result; list all cases used for testing.

In [None]:
import os, math


def chunkify_test(name, n):
    '''
    Verify if chunkify program is working correctly

    Runs split_by_n(name, n) and then checks that exactly n chunk files
    named <name>_000.txt, <name>_001.txt, ... exist, that each chunk's size
    is within two average line lengths of the average chunk size, and that
    the chunks read in order reproduce the original file line by line.
    :param name: name of original file
    :param n: number of splits
    :return: None
    :raises AssertionError: if any of the checks above fails
    '''
    split_by_n(name, n)  # chunkify function under test

    # read the original file; the context manager closes the handle
    with open(name, 'r') as f_orig:
        val = f_orig.readlines()
    max_lines = len(val)  # total number of lines in original file
    count = 0  # current line to read from original file content
    avg_file_size = os.stat(name).st_size / n  # average chunk size
    # average line length; an empty file has no lines, so avoid dividing by zero
    avg_line_len = math.ceil((avg_file_size * n) / max_lines) if max_lines else 0

    i = 0
    while True:
        chunk_name = name + '_' + format(i, '03d') + '.txt'
        if not os.path.exists(chunk_name):
            # the first missing index must be exactly n, otherwise there are
            # fewer split files than requested
            assert i == n
            break
        # a chunk file at index n or beyond means too many split files
        assert i < n
        chunk_size = os.stat(chunk_name).st_size
        assert avg_file_size - 2*avg_line_len <= chunk_size <= avg_file_size + 2*avg_line_len
        with open(chunk_name, 'r') as f:
            for line in f:
                assert count < max_lines  # check if excess is written in split file
                assert line == val[count]  # compare split file content with original file
                count += 1
        i += 1

    assert count == max_lines  # check if all lines of original file are covered

name = 'pg5200.txt'

# run the chunkify check for each chunk count in turn
for n in (3, 5, 8):
    chunkify_test(name, n)

# Encrypted sentence
We will implement a very simple encryption scheme that closely resembles the one-time-pad. You have probably seen this method used in movies like Unknown. The idea is that you and your counterparty share a book whose words you will use as the raw material for a codebook. In this case, you need Metamorphosis, by Franz Kafka.

Your job is to create a codebook of 2-tuples that map to specific words in the given text, based on the line and the position at which each word appears in the text. The text is very long, so there will be duplicated words. Strip out all of the punctuation and make everything lowercase.

**Validation Tests**

In [None]:
# validation check for encrypt_message function
# both the message and the codebook file name must be strings
assert isinstance(message, str)
assert isinstance(fname, str)


# validation check for decrypt_message function
# the encrypted message must be a list ...
assert isinstance(inlist, list)
# ... whose every encrypted word is a tuple
assert all(isinstance(word, tuple) for word in inlist)
# the codebook file name must be a string
assert isinstance(fname, str)
# an empty encrypted message is not accepted
assert len(inlist) > 0

In [None]:
def encrypt_test(fname, encrypt, message):
    '''
    Check if encrypted message is valid

    Rebuilds the codebook from fname (punctuation stripped, lowercased,
    keyed by (line number, word position), both counted from 0) and checks
    that every tuple in encrypt points at the corresponding word of message.
    :param fname: file name to create codebook
    :param encrypt: encrypted message (list of (line, position) tuples)
    :param message: original message to encrypt
    :return: None
    :raises AssertionError: if a tuple is reused, if encrypt and message
        differ in word count, or if a tuple does not map to its word
    '''
    # local import so this check does not rely on imports made in another cell
    import string

    assert len(encrypt) == len(set(encrypt))  # check for uniqueness of encryption

    m = message.split(" ")
    # every word must be encrypted exactly once; without this a truncated
    # encryption would pass because only len(encrypt) words get compared
    assert len(encrypt) == len(m)

    # create a dictionary with key as tuple of word position and value as the word
    table = str.maketrans(dict.fromkeys(string.punctuation))
    lookup = {}
    with open(fname) as f:
        for line_nos, line in enumerate(f):
            # removing surrounding whitespace and punctuation
            words = line.strip().translate(table).split(" ")
            for idx, word in enumerate(words):
                lookup[(line_nos, idx)] = word.lower()

    # checking if encryption is correct
    for position, word in zip(encrypt, m):
        assert position in lookup  # tuple must refer to a real codebook entry
        assert lookup[position] == word

def decrypt_test(decrypt,message):
    '''
    Check that decryption reproduces the message that was encrypted
    :param decrypt: output of decrypt_message
    :param message: message that was given to encrypt_message
    :return: None
    '''
    # the decrypted output must equal the original input
    round_trip_ok = decrypt == message
    assert round_trip_ok

def check_for_more_occurences(message, fname):
    '''
    Check that encrypting a message which uses a word more often than the
    codebook contains it is rejected with an AssertionError
    :param message: message to encrypt
    :param fname: codebook
    :return: None
    '''
    rejected = False
    try:
        # encryption is expected to fail for this message
        encrypt_message(message, fname)
    except AssertionError:
        # the expected error was raised
        rejected = True
    # fail the test if encryption went through without the error
    assert rejected

**Functional Tests**

In [None]:
fname = 'pg5200.txt'

# round-trip each message: encrypt, verify the encryption, decrypt, verify the decryption
for message in (
    'we are going to the lets',
    'let us not say we met late at the night about the secret',
):
    # encrypting the message
    enc_x = encrypt_message(message,fname)
    # functional test for encryption
    encrypt_test(fname,enc_x,message)
    # decrypting the message; the input must be the encryption produced above
    # (the name previously passed here, x, was never defined)
    enc_y = decrypt_message(enc_x, fname)
    # functional test for decryption
    decrypt_test(enc_y,message)

# message which has more occurrences of a word than the codebook
message = 'secret secret secret secret'
# functional test for this case
check_for_more_occurences(message,fname)

# Multinomial Sampler

Write a function to return samples from the Multinomial distribution using pure Python (i.e., no third-party modules like Numpy, Scipy). 

**Validation Tests**

In [None]:
import math

# validation check for multinomial_sample function
assert isinstance(n, int)  # number of trials must be an integer
assert isinstance(k, int)  # number of samples must be an integer
assert isinstance(p, list)  # probabilities must be given as a list
# check the element types before doing arithmetic on them, so that a bad
# entry fails validation here instead of raising TypeError inside sum()
assert all(isinstance(x, (int, float)) for x in p)
assert all(0 <= x <= 1 for x in p)  # each entry must be a probability
# compare with a tolerance: float probabilities such as [0.1] * 10 do not
# add up to exactly 1 because of rounding
assert math.isclose(sum(p), 1)

**Functional Tests**

In [None]:
import math
import random
def check_multinomial():
    '''
    Functional checks for multinomial_sample: first the shape of the
    returned samples, then how close the observed counts are to the
    expected counts for several probability vectors
    :return: None
    '''
    # shape: k samples are returned and the counts in each sample add up to
    # the number of trials (5 trials / 10 samples, then 50 trials / 100 samples)
    for trials, samples in ((5, 10), (50, 100)):
        drawn = multinomial_sample(trials, [1/3, 1/3, 1/3], k=samples)
        assert len(drawn) == samples
        assert all(sum(one_sample) == trials for one_sample in drawn)

    big_n = 100000  # number of trials used for the frequency checks
    slack = 500  # allowed absolute deviation from the expected count

    # uniform probabilities (1/3 each, then 1/4 each): every count must be
    # close to an equal share of the trials
    for uniform_p in ([1/3, 1/3, 1/3], [1/4, 1/4, 1/4, 1/4]):
        counts = multinomial_sample(big_n, uniform_p, k=1)[0]
        target = big_n / len(uniform_p)
        assert all(abs(observed - target) < slack for observed in counts)

    # probability 1/2 for the first outcome and 1/4 for the other two
    counts = multinomial_sample(big_n, [1/2, 1/4, 1/4], k=1)[0]
    half = big_n / 2
    quarter = big_n / 4
    assert abs(counts[0] - half) < slack
    assert abs(counts[1] - quarter) < slack
    assert abs(counts[2] - quarter) < slack

    # randomly generated probabilities: every count must be within 50% of
    # its expected value
    random_p = generateProbability(10)
    counts = multinomial_sample(big_n, random_p, k=1)[0]
    for position, observed in enumerate(counts):
        target = random_p[position] * big_n
        assert abs(observed - target) < target * 0.5

def generateProbability(n):
    '''
    Create custom list of random probabilities according to n

    Draws n random weights from 0.1, 0.2, ..., 1.0 and divides each by the
    total weight, so the result sums to 1 up to floating point rounding.
    :param n: number of probabilities to generate
    :return: list of n floats (empty list when n is 0)
    '''
    weights = [random.randint(1, 10) / 10 for _ in range(n)]
    # compute the normalising total once instead of once per element
    total = sum(weights)
    return [weight / total for weight in weights]