In [1]:
import data_preprocessor

# Abbreviation table; pre_processing passes it to
# data_preprocessor.standard_abbreviations_fix for every address.
abbreviations = data_preprocessor.load_json("abbreviations.json")

In [2]:
fname = 'data/khi_tickets_2022.csv'

# Full ticket export; kept intact so the trimmed frame below can always be rebuilt from it.
df_complete = data_preprocessor.load_corpus(fname, pandas=True, header=True)

# Working frame without the ticket metadata columns.
# `columns=` already names the axis, so no separate `axis` argument is passed.
df = df_complete.drop(columns=['Title', 'Created', 'Close Time', 'Queue'])

In [3]:
def pre_processing(address):
    """Standardize one raw address and derive its type and token list.

    The address is passed, in order, through ``lowercase_conversion``,
    ``standard_abbreviations_fix`` (using the module-level ``abbreviations``
    table) and ``remove_extra_spaces`` from ``data_preprocessor``. The
    ``remove_punctuation`` step is currently not applied.

    Args:
        address: Raw address text for a single ticket.

    Returns:
        tuple: ``(standardized_address, address_type, tokenized_address)``,
        where the last two come from ``check_address_type`` and
        ``standard_tokenization`` applied to the standardized text.
    """
    cleaned = data_preprocessor.lowercase_conversion(address)
    cleaned = data_preprocessor.standard_abbreviations_fix(cleaned, abbreviations)
    # NOTE(review): the meaning of the second argument (False) is defined by
    # data_preprocessor.remove_extra_spaces and is not visible here.
    cleaned = data_preprocessor.remove_extra_spaces(cleaned, False)

    # The type is determined before tokenizing, both from the same cleaned text.
    return (
        cleaned,
        data_preprocessor.check_address_type(cleaned),
        data_preprocessor.standard_tokenization(cleaned),
    )

In [5]:
# First ten entries of the 'Address' column: a fixed sample that is later
# passed to show_pre_processing.
test = df['Address'][0:10]

In [4]:
''' Generating a random list of address from the dataset '''

import random

def generate_random_numbers(count=10, upper_bound=213874):
    """Return ``count`` random integers drawn from ``0..upper_bound`` inclusive.

    The defaults reproduce the previously hard-coded behaviour (ten draws
    from 0..213874). Pass ``upper_bound=len(frame) - 1`` to draw valid row
    positions for a frame of a different size.

    Args:
        count: How many numbers to draw. Values may repeat.
        upper_bound: Largest value that can be drawn (inclusive, as with
            ``random.randint``).

    Returns:
        list[int]: The drawn numbers, in draw order.
    """
    return [random.randint(0, upper_bound) for _ in range(count)]


In [5]:
''' Generating a random list of address from the dataset '''

random_numbers = generate_random_numbers()

# Look up the address stored under each drawn row label.
random_addresses = [df['Address'][row_label] for row_label in random_numbers]

print(random_addresses)

[' House # F-37 Off, Khayaban-e-Iqbal, Block 8, Clifton, Karachi ', ' House # 82-B ,1st Floor, 21st Street, Khayaban e badar, Phase 6, Defence, Karachi ', ' Apartment/Suite# 702 7th Floor, Building All Block, Jheel ParkView, Siraj Road, Block 2, PECHS, Karachi ', ' House # R-18, Street No 37, Block-1, Gulistan-e-Johar, Karachi ', ' Apartment/Suite# 409 4th fl, Building All Blocks, Sahil Promenade, Khayaban e Sadi, Block 3, Clifton, Karachi ', ' House # 6C 1st Floor Ismail Center, Alamgir Road, Bahadurabad, PECHS, Karachi ', ' Apartment/Suite# 204 2nd Floor, Building C, Farhan Classic, Pro Abdul Ghafoor Rd, Block 12, Gulistan-e-Johar, Karachi ', ' House # 16B/1 Main, National Highway, Phase 2, Defence, Karachi ', ' House # 187/2/B Off, Shahrah e Qaideen, Block 2, PECHS, Karachi ', ' House # R-11/1, Khayaban e Sadi, Phase 7, Defence, Karachi ']


In [None]:
''' Show address preprocessing results '''

def show_pre_processing(list_of_addresses):
    """Print the standardized text and detected type of each address.

    Args:
        list_of_addresses: Iterable of raw address strings; each one is run
            through ``pre_processing``.
    """
    for raw_address in list_of_addresses:
        # Only the first two items of the pre_processing result are shown.
        standardized, kind, *_ = pre_processing(raw_address)
        print('Address: ', standardized)
        print('Type: ', kind)
        print(' ')

# Preview the preprocessing output for the randomly drawn addresses.
show_pre_processing(random_addresses)

In [None]:
''' Show address preprocessing results '''

# Preview the preprocessing output for the fixed slice stored in `test`.
show_pre_processing(test)

In [6]:
''' Appending 2 Pandas Dataframes '''

import pandas as pd

# Column layout of the parsed-address frames; parse() reuses this list.
columns = [
    'Ticket #',
    'Type',
    'House #',
    'Apartment #',
    'Building #',
    'Building Name',
    'Street Number/Name',
    'Area & Sub Area',
    'Neighbourhood',
    'City',
]

# One hand-written record keyed by the same column names, used to try out the append.
data = {
    'Ticket #': ['123456'],
    'Type': ['Appartment'],
    'House #': ['123'],
    'Apartment #': ['A2'],
    'Building #': ['Building A'],
    'Building Name': ['ABC Towers'],
    'Street Number/Name': ['Main Street'],
    'Area & Sub Area': ['Central Area'],
    'Neighbourhood': ['ABC Neighborhood'],
    'City': ['City X'],
}

df1 = data_preprocessor.create_dataframe(columns)
df2 = data_preprocessor.create_dataframe(columns, data, datacheck=True)

# Row-wise stacking (axis=0) is pd.concat's default.
df1 = pd.concat([df1, df2])

print(df1)

  Ticket #        Type House # Apartment #  Building # Building Name  \
0   123456  Appartment     123          A2  Building A    ABC Towers   

  Street Number/Name Area & Sub Area     Neighbourhood    City  
0        Main Street    Central Area  ABC Neighborhood  City X  


In [7]:
def field_finder(field_name, tokenized_list):
    """Return the index of the first token that mentions a keyword of the given field.

    Matching is a plain substring test (``keyword in token``), so a short
    keyword such as ``'fl'`` also matches longer words containing it. Tokens
    are compared as given; only ``field_name`` is lowercased here.

    Args:
        field_name: One of ``'street'``, ``'house'``, ``'apartment'``,
            ``'floor'`` or ``'area'`` (case-insensitive). Any other name has
            no keywords and therefore never matches.
        tokenized_list: Sequence of address tokens (strings).

    Returns:
        int | None: Position of the first matching token, or ``None`` when no
        token matches.
    """
    keywords_by_field = {
        # Both transliterations are listed: the dataset contains
        # "Shahrah e ..." street names, which 'sharah' alone does not match.
        'street': ['street', 'road', 'highway', 'lane', 'avenue', 'boulevard', 'sharah', 'shahrah'],
        'house': ['house', 'house no', 'house number', 'house #', 'plot'],
        'apartment': ['flat', 'flat no', 'flat number', 'flat #', 'apartment', 'suite'],
        'floor': ['floor', 'fl', 'level'],
        'area': ['block', 'phase', 'scheme', 'sector'],
    }
    keywords = keywords_by_field.get(field_name.lower(), [])

    for index, token in enumerate(tokenized_list):
        if any(keyword in token for keyword in keywords):
            return index

    return None

# house_index = field_finder('house', address)
# apartment_index = field_finder('apartment', address)
# street_index = field_finder('street', address)

# print('House Index:', house_index)
# print('Apartment Index:', apartment_index)
# print('Street Index:', street_index)


In [8]:
# First 20 rows, restricted to the two columns parse() reads
# ('Ticket#' and 'Address').
test2 = df[['Ticket#', 'Address']][0:20]

test2.head()


Unnamed: 0,Ticket#,Address
0,105207394681,"House # SC-4, Al Kareem Centre, Flat# 107, 1s..."
1,109247984771,"House # Plot# 28-C, Flat A-2, 1st Floor, Lane..."
2,101042042052,"Apartment/Suite# Flat 204, Building All Block..."
3,101087613080,"Apartment/Suite# BI3, Building All Blocks, Ib..."
4,107178824611,"Apartment/Suite# 26, Building Block B, Marhab..."


In [15]:
# import pandas as pd
import random

def create_random_sample(df, sample_size):
    """Return ``sample_size`` distinct random rows with only 'Ticket#' and 'Address'.

    Rows are chosen by position, so the function works for any index (for
    example a frame that was filtered or is itself a sample), not only for
    the default 0..n-1 index.

    Args:
        df: Frame containing at least the 'Ticket#' and 'Address' columns.
        sample_size: Number of rows to draw without replacement.

    Returns:
        pandas.DataFrame: The sampled rows, in draw order, keeping their
        original index labels.

    Raises:
        ValueError: If ``sample_size`` exceeds the number of rows (raised by
            ``random.sample``).
    """
    selected_columns = ['Ticket#', 'Address']
    # random.sample yields row positions, so they must be applied with
    # .iloc; label-based .loc only agrees with them on a default index.
    random_positions = random.sample(range(len(df)), sample_size)
    return df[selected_columns].iloc[random_positions]

# Random 20-row sample drawn from the working frame.
test3 = create_random_sample(df, 20)

test3.head()

Unnamed: 0,Ticket#,Address
145261,106702315255,"House # 5/1 street 23rd, Khayaban-e-Tanzeem, ..."
141804,105922138584,"House # 45/1 10th Street, Commercial Street, ..."
67817,101924367789,House # 57-C Ground Floor 10th Commercial Str...
75795,100858953274,"House # 69/2/4 , street 4, Khayaban e badar, ..."
16693,103498119436,"House # 36 9th Street, Khayaban-e-Mujahid, Ph..."


In [19]:
# Module-level result frame; parse() declares it global and appends its
# parsed rows to it.
address_df = data_preprocessor.create_dataframe(columns)

In [20]:
# Hierarchical and field-level parsing:

def _take_last(tokens):
    """Pop and return the stripped last token, or 'None' when no tokens are left."""
    return tokens.pop(-1).strip() if tokens else 'None'


def _take_field(field_name, tokens):
    """Pop and return the first token field_finder matches for field_name, or None."""
    position = field_finder(field_name, tokens)
    return None if position is None else tokens.pop(position)


def parse(dataframe):
    """Split every address in ``dataframe`` into fields and append them to ``address_df``.

    For each row the address is run through ``pre_processing`` and its token
    list is consumed in a fixed order: city and neighbourhood from the tail,
    then street, apartment, floor, house and area via ``field_finder``. Each
    step removes the token it claims, so later steps only see what is left.
    A floor token is merged into the 'Apartment #' field. When no area
    keyword is found, the last remaining token is used as the area. Fields
    that cannot be filled hold the string 'None' (this includes
    'Building #' and 'Building Name', which are not extracted yet).

    Addresses with too few tokens no longer raise ``IndexError``; the fields
    that cannot be read from the tail are set to 'None' instead.

    Args:
        dataframe: Frame with 'Ticket#' and 'Address' columns.

    Returns:
        The module-level ``address_df`` after the new rows were appended.

    Side effects:
        Rebinds the global ``address_df``. Calling parse again appends again,
        so re-running it on the same input duplicates rows.
    """
    global address_df

    list_of_addresses = dataframe['Address'].tolist()
    tickets = dataframe['Ticket#'].tolist()

    row_frames = []
    for ticketno, address in zip(tickets, list_of_addresses):
        _, address_type, tokens = pre_processing(address)

        # The tail of the address is read first: city, then neighbourhood.
        city = _take_last(tokens)
        neighbourhood = _take_last(tokens)

        # Order matters from here on: every look-up removes its token.
        street = _take_field('street', tokens)
        apartment = _take_field('apartment', tokens)
        floor = _take_field('floor', tokens)
        house = _take_field('house', tokens)
        area = _take_field('area', tokens)

        apartment_value = 'None' if apartment is None else apartment.strip()
        if floor is not None:
            if apartment_value == 'None':
                apartment_value = floor.strip()
            else:
                # Floor information is appended to the apartment text.
                apartment_value = ' '.join([apartment_value, floor]).strip()

        # Without an area keyword, fall back to the last remaining token.
        area_value = _take_last(tokens) if area is None else area.strip()

        # Key order mirrors `columns`.
        row = {
            'Ticket #': [ticketno],
            'Type': [address_type],
            'House #': ['None' if house is None else house.strip()],
            'Apartment #': [apartment_value],
            'Building #': ['None'],
            'Building Name': ['None'],
            'Street Number/Name': ['None' if street is None else street.strip()],
            'Area & Sub Area': [area_value],
            'Neighbourhood': [neighbourhood],
            'City': [city],
        }
        row_frames.append(data_preprocessor.create_dataframe(columns, row, datacheck=True))

    # Concatenate once instead of growing the frame inside the loop.
    if row_frames:
        address_df = pd.concat([address_df, *row_frames], axis=0)

    return address_df



# parse() appends to the global address_df, so re-running this cell without
# re-creating address_df first adds the same rows again.
parse(test2)

# NOTE: random_addresses is a plain list, while parse() indexes its argument
# by the 'Address' and 'Ticket#' columns, so this call needs a frame instead.
# parse(random_addresses)

# parse(test3)

# print(address_df)

address_df.head()


Unnamed: 0,Ticket #,Type,House #,Apartment #,Building #,Building Name,Street Number/Name,Area & Sub Area,Neighbourhood,City
0,105207394681,apartment,house # sc - 4,flat # 107 1st floor,,,faizan street,block d,north nazimabad,karachi
0,109247984771,apartment,house # plot # 28 - c,flat a - 2 1st floor,,,lane 9,phase 6,defence,karachi
0,101042042052,apartment,,apartment / suite # flat 204,,,8th street,building all blocks,clifton,karachi
0,101087613080,apartment,,apartment / suite # bi3,,,adam road,building all blocks,clifton,karachi
0,107178824611,apartment,,apartment / suite # 26,,,akhtar masood road,building block b,north nazimabad,karachi


In [21]:
# Write the parsed addresses to disk; index=False leaves the row labels out of the file.
address_df.to_csv('data/data3.csv', index=False)


In [None]:
# df.to_excel('data/data.xlsx', sheet_name = 'Sheet1', index=False)