In [17]:
import data_preprocessor
import pandas as pd
import random

# Abbreviation data loaded from abbreviations.json; pre_processing() hands it to
# data_preprocessor.standard_abbreviations_fix() for every address.
abbreviations = data_preprocessor.load_json("abbreviations.json")

In [2]:
# Path of the ticket export parsed by this notebook.
fname = 'data/khi_tickets_2022.csv'

# Complete corpus as returned by the project loader (kept so the dropped columns stay recoverable).
df_complete = data_preprocessor.load_corpus(fname, pandas = True, header = True)

# Working frame without the columns that play no part in address parsing.
# `columns=` already targets the column axis, so the redundant `axis=1` is not passed.
df = df_complete.drop(columns=['Title', 'Created', 'Close Time', 'Queue'])

In [3]:
# Output schema of the parsed-address frame; parse() fills exactly these keys for every ticket.
columns = [
    'Ticket #',
    'Type',
    'House #',
    'Apartment #',
    'Building #',
    'Building Name',
    'Street',
    'Road',
    'Area & Sub Area',
    'Neighbourhood',
    'City',
]

In [4]:
def pre_processing(address):
    """Standardise one raw address and derive its type and token list.

    The address is lower-cased, its abbreviations are rewritten using the
    module-level ``abbreviations`` data, and extra spaces are removed
    (``remove_extra_spaces`` is called with ``False`` as its second argument).
    No punctuation-removal step is applied here.

    Args:
        address: Raw address value as read from the ticket data.

    Returns:
        tuple: ``(standardized_address, address_type, tokenized_address)`` where
        the last two come from ``data_preprocessor.check_address_type`` and
        ``data_preprocessor.standard_tokenization`` applied to the standardised
        address.
    """
    cleaned = data_preprocessor.lowercase_conversion(address)
    cleaned = data_preprocessor.standard_abbreviations_fix(cleaned, abbreviations)
    cleaned = data_preprocessor.remove_extra_spaces(cleaned, False)

    # Both derived values are computed from the fully standardised string.
    kind = data_preprocessor.check_address_type(cleaned)
    tokens = data_preprocessor.standard_tokenization(cleaned)

    return (cleaned, kind, tokens)

In [18]:
def field_finder(field_name, tokenized_list):
    """Locate the first token that mentions a keyword of the requested field.

    Matching is a plain substring test (``keyword in token``), so a keyword
    also matches when it is embedded in a longer word.

    Args:
        field_name: One of 'street', 'road', 'house', 'apartment', 'floor' or
            'area' (case-insensitive). Any other name has no keywords and
            therefore never matches.
        tokenized_list: Iterable of address tokens (strings).

    Returns:
        The index of the first matching token, or ``None`` when nothing matches.
    """
    keyword_table = {
        'street': ['street', 'lane'],
        'road': ['road', 'highway', 'khayaban', 'avenue', 'boulevard', 'shahrah', 'alley'],
        'house': ['house', 'house no', 'house number', 'house #', 'plot'],
        'apartment': ['flat', 'flat no', 'flat number', 'flat #', 'apartment', 'suite'],
        'floor': ['floor', 'fl', 'level'],
        'area': ['phase', 'scheme', 'sector'],
    }
    wanted = keyword_table.get(field_name.lower(), [])

    matches = (
        position
        for position, token in enumerate(tokenized_list)
        if any(keyword in token for keyword in wanted)
    )
    return next(matches, None)

# house_index = field_finder('house', address)
# apartment_index = field_finder('apartment', address)
# street_index = field_finder('street', address)

# print('House Index:', house_index)
# print('Apartment Index:', apartment_index)
# print('Street Index:', street_index)


In [6]:
def probabilistic_identifiers(reference_tokenized_address, remaining_address, area_threshold=50):
    """Classify leftover tokens by their relative position in the full address.

    Every token of ``remaining_address`` is scored with the 1-based position of
    its first occurrence in ``reference_tokenized_address``, expressed as a
    percentage of the reference length. Tokens scoring above ``area_threshold``
    are area candidates. Among the rest, the highest-scoring token is the
    building-name candidate and all others are building-number candidates.

    Args:
        reference_tokenized_address: Token list of the complete address.
        remaining_address: Tokens still unassigned; each must occur in the
            reference list.
        area_threshold: Percentage above which a token counts as area
            (default 50, the previously hard-coded value).

    Returns:
        tuple of three lists ``(areas_indexes, building_name_indexes,
        building_number_indexes)``; every entry is an index into
        ``remaining_address``.

    Raises:
        ValueError: If a remaining token is not present in the reference list.
    """
    total_tokens = len(reference_tokenized_address)

    index_p_scores = []
    for position, item in enumerate(remaining_address):
        # list.index() returns the first occurrence, so duplicated tokens share one score.
        true_index_in_original = reference_tokenized_address.index(item) + 1
        index_percentage = (true_index_in_original / total_tokens) * 100
        index_p_scores.append((position, index_percentage))

    potential_area = [entry for entry in index_p_scores if entry[1] > area_threshold]
    remaining_fields = [entry for entry in index_p_scores if entry[1] <= area_threshold]

    if remaining_fields:
        # Pick the entry itself rather than popping by its stored index: that index
        # refers to remaining_address, not to a position inside remaining_fields,
        # and the two differ whenever an area candidate precedes a non-area one.
        best_entry = max(remaining_fields, key=lambda entry: entry[1])
        potential_building_name = [best_entry]
        potential_building_number = [entry for entry in remaining_fields if entry is not best_entry]
    else:
        potential_building_name, potential_building_number = [], []

    areas_indexes = [entry[0] for entry in potential_area]
    building_name_indexes = [entry[0] for entry in potential_building_name]
    building_number_indexes = [entry[0] for entry in potential_building_number]

    return areas_indexes, building_name_indexes, building_number_indexes

In [7]:
# Fixed test slice: the first 20 rows, restricted to the ticket number and address columns.
test2 = df[['Ticket#', 'Address']].iloc[:20]

test2.head()


Unnamed: 0,Ticket#,Address
0,105207394681,"House # SC-4, Al Kareem Centre, Flat# 107, 1s..."
1,109247984771,"House # Plot# 28-C, Flat A-2, 1st Floor, Lane..."
2,101042042052,"Apartment/Suite# Flat 204, Building All Block..."
3,101087613080,"Apartment/Suite# BI3, Building All Blocks, Ib..."
4,107178824611,"Apartment/Suite# 26, Building Block B, Marhab..."


In [8]:
''' creating random test dataframes '''

def create_random_sample(df, sample_size):
    """Return ``sample_size`` randomly chosen rows with only 'Ticket#' and 'Address'.

    Rows are drawn without replacement by position using the ``random`` module,
    so calling ``random.seed`` beforehand makes the sample reproducible.
    Selection is positional (``.iloc``) and therefore works for any index, not
    only the default 0..n-1 one.

    Args:
        df: DataFrame containing at least the 'Ticket#' and 'Address' columns.
        sample_size: Number of rows to draw.

    Returns:
        DataFrame with the sampled rows, in the order they were drawn.

    Raises:
        ValueError: If ``sample_size`` is negative or larger than ``len(df)``
            (raised by ``random.sample``).
    """
    selected_columns = ['Ticket#', 'Address']
    # These are row positions, not index labels, so they must be used with .iloc.
    random_positions = random.sample(range(len(df)), sample_size)
    return df[selected_columns].iloc[random_positions]

# Random 50-row test set. NOTE(review): no random.seed() call is visible in this notebook,
# so the sampled rows (and every output derived from them) change on each run.
test3 = create_random_sample(df, 50)

test3.head()

Unnamed: 0,Ticket#,Address
121370,105594438805,"House # G5/4, 6th Gizri Street, Phase 4, Defe..."
129387,108682737336,"House # 59-B, Hatim Ali Alvi Road, Block 4, C..."
9612,109527837383,"House # B/3 Ground Floor, Ziauddin Road, Bloc..."
140058,103522503769,"House # 158 BLOCK B Off, Jauhar Road, Shab..."
134242,102803787566,"House # B-155, Street 7, Block-4, Saadi Town,..."


In [20]:
# Module-level result frame with the output schema; parse() appends its rows to it through `global`.
address_df = data_preprocessor.create_dataframe(columns)

# TODO: rather than creating a new frame on every run, load the previously stored one from a file
# so that earlier results are kept and new ones are added to them.

In [21]:
# Hierarchical and field-level parsing:

def _pop_field(field_name, tokens):
    """Remove the first token matching ``field_name`` and return it stripped ('None' if absent)."""
    position = field_finder(field_name, tokens)
    if position is None:
        return 'None'
    return tokens.pop(position).strip()


def _pop_last(tokens):
    """Remove and return the stripped last token, or 'None' when no token is left."""
    return tokens.pop(-1).strip() if tokens else 'None'


def _parse_single_address(ticketno, address):
    """Build the ``column -> [value]`` mapping describing one ticket's address."""
    _, address_type, tokenized_address = pre_processing(address)
    # Untouched copy: the positional scoring needs each token's place in the full address.
    reference_tokenized_address = list(tokenized_address)

    # The last token is taken as the city and the one before it as the neighbourhood.
    # Addresses with fewer than two tokens yield 'None' instead of raising IndexError.
    city = _pop_last(tokenized_address)
    neighbourhood = _pop_last(tokenized_address)

    # Keyword-driven fields; each match is removed so later lookups cannot claim it again.
    road = _pop_field('road', tokenized_address)
    street = _pop_field('street', tokenized_address)
    apartment = _pop_field('apartment', tokenized_address)

    # A floor token is folded into the apartment field (or becomes it when there is none).
    floor_position = field_finder('floor', tokenized_address)
    if floor_position is not None:
        floor_token = tokenized_address.pop(floor_position)
        if apartment == 'None':
            apartment = floor_token.strip()
        else:
            apartment = ' '.join([apartment, floor_token]).strip()

    house = _pop_field('house', tokenized_address)
    area = _pop_field('area', tokenized_address)

    # Whatever is left is classified by its relative position in the original address.
    area_positions, name_positions, number_positions = probabilistic_identifiers(
        reference_tokenized_address, tokenized_address
    )

    # Resolve every position to its token before anything else is removed: the positions
    # refer to the list exactly as it was passed above, and popping one group first would
    # shift the others. The list is not read again afterwards, so no removal is needed.
    area_tokens = [tokenized_address[position].strip() for position in area_positions]
    name_tokens = [tokenized_address[position].strip() for position in name_positions]
    number_tokens = [tokenized_address[position].strip() for position in number_positions]

    if area_tokens:
        keyword_area = [] if area == 'None' else [area]
        area = ', '.join(keyword_area + area_tokens).strip()

    building_name = 'None'
    building_number = 'None'
    if address_type == 'house':
        # For a house the building candidates are treated as finer-grained area
        # information and are placed in front of the area found so far.
        leading_area_tokens = number_tokens + name_tokens
        if leading_area_tokens:
            current_area = [] if area == 'None' else [area]
            area = ', '.join(leading_area_tokens + current_area).strip()
    else:
        if name_tokens:
            building_name = ', '.join(name_tokens).strip()
        if number_tokens:
            building_number = ', '.join(number_tokens).strip()

    # A house-type address that only produced an apartment value: move it to the house field.
    if address_type == 'house' and house == 'None' and apartment != 'None':
        house, apartment = apartment, 'None'

    return {
        'Ticket #': [ticketno],
        'Type': [address_type],
        'House #': [house],
        'Apartment #': [apartment],
        'Building #': [building_number],
        'Building Name': [building_name],
        'Street': [street],
        'Road': [road],
        'Area & Sub Area': [area],
        'Neighbourhood': [neighbourhood],
        'City': [city],
    }


def parse(dataframe):
    """Parse every address in ``dataframe`` and append the rows to the global ``address_df``.

    For each row the address is pre-processed, split into its hierarchical
    fields (city, neighbourhood, road, street, apartment/floor, house, area,
    building name/number) and turned into a one-row frame through
    ``data_preprocessor.create_dataframe``. Fields without a value hold the
    string 'None'.

    Side effect: the module-level ``address_df`` is rebound to the old frame
    plus the new rows, so calling ``parse`` again on the same input appends
    the same rows a second time.

    Args:
        dataframe: DataFrame with 'Ticket#' and 'Address' columns.

    Returns:
        The updated global ``address_df``.
    """
    global address_df

    list_of_addresses = dataframe['Address'].tolist()
    tickets = dataframe['Ticket#'].tolist()

    # Collect the per-ticket frames and concatenate once; concatenating inside the
    # loop copies the growing result on every iteration.
    parsed_frames = []
    for ticketno, address in zip(tickets, list_of_addresses):
        data = _parse_single_address(ticketno, address)
        parsed_frames.append(data_preprocessor.create_dataframe(columns, data, datacheck=True))

    if parsed_frames:
        address_df = pd.concat([address_df, *parsed_frames], axis=0)

    return address_df



# Alternative input: the fixed 20-row slice.
# parse(test2)

# Parse the random sample; the rows are appended to the global address_df.
parse(test3)


address_df.head()


Unnamed: 0,Ticket #,Type,House #,Apartment #,Building #,Building Name,Street,Road,Area & Sub Area,Neighbourhood,City
0,105594438805,house,house # g5 / 4,,,,6th gizri street,,phase 4,defence,karachi
0,108682737336,house,house # 59 - b,,,,,hatim ali alvi road,block 4,clifton,karachi
0,109527837383,house,house # b / 3 ground floor,,,,,ziauddin road,block 11,federal b area,karachi
0,103522503769,house,house # 158 block b off,,,,,jauhar road,shabbirabad,pechs,karachi
0,102803787566,house,house # b - 155,,,,street 7,,block - 4,saadi town,karachi


In [22]:
# Alternative output file (currently disabled).
# address_df.to_csv('data/data_a3.csv', index=False)

# Write the parsed addresses to CSV without the frame's index column.
address_df.to_csv('data/datarand_a3.csv', index=False)

In [None]:
# df.to_excel('data/data.xlsx', sheet_name = 'Sheet1', index=False)