## SAP Internship 2020 : De-Duplication of Streaming Data

### 1.1   De-Duplication in Unbounded real-time Streaming Data

In [17]:
# Importing libraries for random generation of strings
import random
import string
import time
import sys
import warnings
warnings.filterwarnings('ignore')


# Defining a function to generate random letters
def randomLetterGenerator(): 
    
    # Setting the sleep timer as 0.5 seconds to generate the next input
    sleep_seconds = 0.5
    
    try:
        # Loop forever to generate streaming data
        while True:
            time.sleep(sleep_seconds)

            # Declaring the uppercase alphabets ABCDEFGHIJKLMNOPQRSTUVWXYZ
            letters = string.ascii_uppercase

            # Produce a series of random values (letters)
            yield random.choice(letters)
            
    except KeyboardInterrupt:
        sys.exit(0)
    
# Declaring Set as set()
Set = set()

print("*------------De-duplication in Unbounded real-time streaming data-------------*")

# Iterate for every value generated by the function randomLetterGenerator
for value in randomLetterGenerator():  

    # Logic handling for Duplicate Value
    if value in Set:
        print("Duplicate Value: ", value)
    
    # Logic handling for incoming New Value
    else:
        Set.add(value)
        print("New Value: ", value)


*------------De-duplication in Unbounded real-time streaming data-------------*
New Value:  X
New Value:  W
New Value:  Y
Duplicate Value:  W
New Value:  F
New Value:  Z
Duplicate Value:  X
New Value:  S
New Value:  A
New Value:  L
New Value:  I
Duplicate Value:  F
New Value:  Q
Duplicate Value:  L
Duplicate Value:  S
New Value:  R
Duplicate Value:  X
Duplicate Value:  W
New Value:  P
Duplicate Value:  S
New Value:  H
New Value:  O
Duplicate Value:  I
Duplicate Value:  I
New Value:  V
New Value:  E


SystemExit: 0

In [18]:
# Printing De-duplicated Data
print("\n\n*-----------After De-duplication in Unbounded real-time streaming data---------*")
print("\nDe-duplicated Data: ", Set)



*-----------After De-duplication in Unbounded real-time streaming data---------*

De-duplicated Data:  {'L', 'H', 'Q', 'X', 'F', 'I', 'R', 'P', 'O', 'W', 'E', 'V', 'Z', 'Y', 'A', 'S'}


### 1.2   De-Duplication in Bounded Streaming Data

In [19]:
# Importing libraries for random generation of strings
import random
import string

# Defining a function to generate random words of length 1 (i.e. letters)
def randomString(stringLength = 1):
    
    # Declaring the uppercase alphabets ABCDEFGHIJKLMNOPQRSTUVWXYZ
    letters = string.ascii_uppercase
    
    # Returning random combination of alphabtets to obtain words of length = 1 (i.e. letters)
    return ''.join(random.choice(letters) for i in range(stringLength))


# Defining a function deDuplicate
def deDuplicate(originalData):
    
    #Declaring a dictionary
    dictionary = {}
    
    # If the incoming streaming data is empty
    if not originalData:
        print("Incoming streaming data is empty")
            
    # Looping over the incoming streaming data
    for i in range(len(originalData)):
            
        # If the data element is not present in the keys of the dictionary,
        # add the value as 'Duplicate' and update the data element to dictionary
        if originalData[i] in dictionary.keys():
            dictionary[originalData[i]] = "Duplicate"
        
        # If the data element is present in the keys of the dictionary,
        # add the value as 'Original Data' and mark the data element to dictionary
        else:
            dictionary[originalData[i]] = "Original Data"
            
    return dictionary

# Creating a list to store Original Data
originalData = []

# Genrating 50 random alphabets
for i in range(1,50):
    
    # Calling the function randomString
    k = randomString()
    
    # Appending the alphabets to the list originalData
    originalData.append(k)
    
# Calling the deDuplicate function
de_duplicate_data = deDuplicate(originalData)

print("*-----------------------Before De-duplication---------------------*")

# Printing Original Streaming Data
print("\nOriginal Streaming Data: ", originalData)

# Creating a list to store De-duplicated Data
deDuplicateData = []

print("\n\n*-----------After De-duplication in Bounded streaming data---------*")

# Iterating over dictionary items of de_duplicate_data
for key, value in de_duplicate_data.items():
    if value == "Original Data":
        deDuplicateData.append(key)

# Printing De-duplicated Data
print("\nDe-duplicated Data: ", deDuplicateData)

*-----------------------Before De-duplication---------------------*

Original Streaming Data:  ['U', 'H', 'Z', 'G', 'U', 'U', 'D', 'Y', 'G', 'X', 'X', 'F', 'O', 'S', 'M', 'V', 'P', 'Y', 'H', 'Z', 'B', 'P', 'A', 'N', 'Q', 'L', 'D', 'Z', 'R', 'K', 'M', 'M', 'F', 'Z', 'O', 'K', 'R', 'O', 'S', 'G', 'V', 'I', 'K', 'N', 'J', 'K', 'A', 'Q', 'C']


*-----------After De-duplication in Bounded streaming data---------*

De-duplicated Data:  ['B', 'L', 'I', 'J', 'C']
