In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Names of the ".txt" files in Data/Top/ that will be parsed.
# - Match the full ".txt" suffix (dot included): the parsing cell strips four
#   characters (file[:-4]) to build the "File Name" column, so a name that
#   merely ends in "txt" would lose a real character.
# - os.listdir() returns entries in arbitrary order; sorting makes the row
#   order (and therefore which of two duplicated rows counts as the "first")
#   reproducible between runs and machines.
file_list = sorted(
    name for name in os.listdir("Data/Top/") if name.endswith(".txt")
)

In [3]:
# Fixed column widths of one record, in the order the fields are read by the
# parsing cell; whatever follows the last width is the "Machine Generate #".
split_lengths = [
    10,   # Serial #
    32,   # Litho Code 1
    1,    # Set Code
    3,    # Subject Code
    9,    # Roll   (roll length changed 8 <- 9)
    2,    # Exam #
    3,    # Venue Code
    32,   # Litho Code 2
]

def split_string_with_length(string, split_length):
    """Cut ``string`` into consecutive fixed-width pieces.

    Parameters
    ----------
    string : str
        Text to cut.
    split_length : iterable of int
        Width of each piece, applied left to right.

    Returns
    -------
    list of str
        One piece per width, followed by one extra element holding whatever
        is left of ``string`` (possibly the empty string).  Pieces that fall
        beyond the end of ``string`` are empty strings.
    """
    # Running offsets: boundaries[k] is where the k-th piece starts.
    boundaries = [0]
    for width in split_length:
        boundaries.append(boundaries[-1] + width)

    pieces = [string[begin:stop] for begin, stop in zip(boundaries, boundaries[1:])]
    pieces.append(string[boundaries[-1]:])   # remainder of the string
    return pieces

In [4]:
# Column-oriented accumulator: one list per output column, filled row by row
# while the fixed-width records are parsed, then turned into a DataFrame.
dataframe_structure = {
    "File Name": [],
    "Serial #": [],
    "Litho Code 1": [],
    "Set Code": [],
    "Subject Code": [],
    "Roll": [],
    "Exam #": [],
    "Venue Code": [],
    "Litho Code 2": [],
    "Machine Generate #": []
}


for file in file_list:
    # `with` closes the handle even when a record fails to parse.
    with open("Data/Top/" + file, "r") as current_file:
        for line in current_file:
            # Lines shorter than five characters (newline included) carry no record.
            if len(line) < 5:
                continue

            # Commas are dropped before the record is cut at fixed widths.
            line = line.replace(",", "")

            # Remove only an actual line terminator.  Unconditionally cutting
            # the last character would eat a data character on a final line
            # that does not end with a newline.
            split_line = split_string_with_length(line.rstrip("\n"), split_lengths)

            dataframe_structure["File Name"].append(file[:-4])   # name without ".txt"
            dataframe_structure["Serial #"].append(split_line[0])
            # Blanks inside the litho codes are replaced with "0".
            dataframe_structure["Litho Code 1"].append(split_line[1].replace(" ", "0"))
            dataframe_structure["Set Code"].append(split_line[2])
            dataframe_structure["Subject Code"].append(split_line[3])
            dataframe_structure["Roll"].append(split_line[4])
            dataframe_structure["Exam #"].append(split_line[5])
            dataframe_structure["Venue Code"].append(split_line[6])
            dataframe_structure["Litho Code 2"].append(split_line[7].replace(" ", "0"))
            dataframe_structure["Machine Generate #"].append(split_line[8])


df = pd.DataFrame(dataframe_structure)

In [6]:
# check for valid but duplicate data

In [7]:
# Rows that repeat an earlier (Litho Code 1, Roll, Litho Code 2) combination.
# The first occurrence of each combination is not flagged and stays in df.
repeated_record_mask = df.duplicated(subset=['Litho Code 1', 'Roll', 'Litho Code 2'])
Top_Duplicate_Row = df[repeated_record_mask]

# Keep a record of what is removed, then remove it.
Top_Duplicate_Row.to_csv("Duplicate_Top.csv", sep=';', index=False)
df = df.drop(index=Top_Duplicate_Row.index)

In [8]:
# processing applicants info

In [9]:
# Opened in binary mode; the handle is consumed and closed by the next cell.
applicants_file = open("Data/Applicants.txt", mode="rb")

In [10]:
# Column-oriented accumulator for the applicants list; each input line is
# expected to hold at least four ';'-separated fields in this order.
applicants_structure = {
    "Invoice": [],
    "Roll": [],
    "Name": [],
    "Venue Code": []
}

try:
    for line in applicants_file:
        # The file is read in binary mode, so `line` is a bytes object.
        # Strip only the real line terminator (CRLF or LF) and decode the
        # bytes, instead of slicing the textual repr of the bytes object:
        # that slice assumed every line ends in CRLF (truncating data
        # otherwise) and left escape sequences in the text.
        # NOTE(review): the file's encoding is not stated in this notebook;
        # UTF-8 is assumed and undecodable bytes are kept as \xNN escapes —
        # TODO confirm.
        line = line.rstrip(b"\r\n").decode("utf-8", errors="backslashreplace")

        if not line:
            continue   # blank line: nothing to record

        applicants_data = line.split(';')

        applicants_structure["Invoice"].append(applicants_data[0])
        applicants_structure["Roll"].append(applicants_data[1])
        applicants_structure["Name"].append(applicants_data[2])
        applicants_structure["Venue Code"].append(applicants_data[3])
finally:
    # Release the handle even if a malformed line raised above.
    applicants_file.close()

applicants = pd.DataFrame(applicants_structure)

In [11]:
# separating invalid roll

In [12]:
def not_number(number):
    """Return True when ``number`` is not a non-empty run of ASCII digits.

    Parameters
    ----------
    number : object
        Value to test; it is converted with ``str()`` first.

    Returns
    -------
    bool
        ``False`` only if every character of ``str(number)`` is one of
        ``0``-``9`` and there is at least one character; ``True`` otherwise
        (letters, blanks, signs, decimal points, empty string, ...).
    """
    number = str(number)

    # An empty string has no offending character, yet it is not a number:
    # reporting it as numeric would make a following int("") raise.
    if not number:
        return True

    return any(character not in "0123456789" for character in number)

In [13]:
def binary_search(element, sorted_array):
    """Tell whether ``element`` occurs in ``sorted_array``.

    Parameters
    ----------
    element : comparable
        Value to look for.
    sorted_array : sequence
        Indexable sequence; the halving below is only meaningful when it is
        sorted in ascending order.

    Returns
    -------
    bool
        ``True`` if some probed position holds ``element``, else ``False``.
    """
    low, high = 0, len(sorted_array) - 1

    while low <= high:
        middle = (low + high) // 2
        candidate = sorted_array[middle]

        if candidate == element:
            return True

        if candidate > element:
            high = middle - 1    # continue in the lower half
        else:
            low = middle + 1     # continue in the upper half

    return False

In [14]:
# Applicant rolls as integers in ascending order, ready for binary_search.
applicants_roll = applicants["Roll"].astype(int).sort_values().tolist()

# Roll column of the scanned records, still as text.
roll = df.loc[:, "Roll"]

In [15]:
# Positions (not index labels) of the records whose roll is either not purely
# numeric or not present in the applicants list.  The numeric test comes
# first so that int() is only applied to rolls that passed it.
invalid_roll_index = [
    position
    for position, scanned_roll in enumerate(roll)
    if not_number(scanned_roll)
    or not binary_search(int(scanned_roll), applicants_roll)
]

In [16]:
# invalid_roll_index holds positions, hence iloc; the rows are then removed
# from df through their index labels.
invalid_roll = df.iloc[invalid_roll_index]
df = df.drop(index=invalid_roll.index)

In [17]:
# Save the rejected records as a ';'-separated file without the index column.
invalid_roll.to_csv("Invalid_Roll.csv", sep=';', index=False)

In [18]:
# separate duplicate rolls

In [19]:
roll = df["Roll"]

# Every occurrence of a roll that appears more than once (keep=False flags
# the first occurrence too).
repeated_rolls = roll[roll.duplicated(keep=False)]

# Distinct repeated values, in order of first appearance.
duplicate_roll = repeated_rolls.drop_duplicates()

# Group equal rolls next to each other without growing a Series inside a
# loop (Series.append no longer exists in pandas 2.x and copied the data on
# every call).  factorize() numbers the values in order of first appearance;
# a stable sort of those numbers keeps the original order inside each group,
# which is the same row order the append loop produced.
group_codes, _ = pd.factorize(repeated_rolls)
grouped_positions = np.argsort(group_codes, kind="mergesort")
duplicate_rolls_series = repeated_rolls.iloc[grouped_positions]

# Full records of the repeated rolls, then remove them from df.
duplicate_rolls = df.loc[duplicate_rolls_series.index]
df = df.drop(duplicate_rolls.index)

In [20]:
# Save the records with repeated rolls as a ';'-separated file, index omitted.
duplicate_rolls.to_csv("Duplicate_Roll.csv", sep=';', index=False)

In [21]:
# separate invalid set code

In [22]:
# Set Code column of the records that are still in df.
set_code = df.loc[:, "Set Code"]

In [23]:
# Positions (not index labels) of the records whose set code is not one of
# the four accepted one-character values.
invalid_set_code_index = [
    position
    for position, code in enumerate(set_code)
    if code not in ('1', '2', '3', '4')
]

In [24]:
# invalid_set_code_index holds positions, hence iloc; the rows are then
# removed from df through their index labels.
invalid_set_code = df.iloc[invalid_set_code_index]
df = df.drop(index=invalid_set_code.index)

In [25]:
# Save the records with a rejected set code as a ';'-separated file, index omitted.
invalid_set_code.to_csv("Invalid_Set_Code.csv", sep=';', index=False)

In [26]:
# generate subject code from first digit of roll

In [27]:
# Subject code keyed by the first character of the roll.
subject_code_dict = {'1': 100, '2': 200, '3': 300, '4': 400}


def _subject_code_from_roll(roll_value):
    """Return the subject code for the first character of ``roll_value``.

    Raises KeyError when that character is not a key of subject_code_dict.
    """
    return subject_code_dict[str(roll_value)[0]]


generated_subject_code = [_subject_code_from_roll(value) for value in df["Roll"]]

df["Generated Subject Code"] = generated_subject_code

In [28]:
# finalize top processing

In [29]:
# (output column, source column in df) pairs, in the order the columns are
# written to Main_Top.csv.
main_top_columns = [
    ("Roll", "Roll"),
    ("Serial_Top", "Serial #"),
    ("File_Name_Top", "File Name"),
    ("Set_Code", "Set Code"),
    ("Subject_Code", "Subject Code"),
    ("Generated_Subject_Code", "Generated Subject Code"),
    ("Litho_Top_1", "Litho Code 1"),
    ("Litho_Top_2", "Litho Code 2"),
]

main_top_dict = {
    output_name: df[source_name] for output_name, source_name in main_top_columns
}

Main_Top = pd.DataFrame(main_top_dict)

# Final result: ';'-separated file without the index column.
Main_Top.to_csv("Main_Top.csv", sep=';', index=False)