In [1]:
import data_preprocessor
import data_processor
import pandas as pd

# Load the contents of abbreviations.json through the project helper.
# NOTE(review): `abbreviations` is not referenced by any other cell visible in
# this notebook — confirm it is still needed.
abbreviations = data_preprocessor.load_json("abbreviations.json")

In [2]:
# Path of the ticket export that feeds the rest of the notebook.
fname = 'data/khi_tickets_2022.csv'

# Full ticket table as returned by the project loader.
df_complete = data_preprocessor.load_corpus(fname, pandas=True, header=True)

# Working copy without the columns that the address parsing does not use.
# (`columns=` already targets the column axis, so no `axis` argument is needed.)
df = df_complete.drop(columns=['Title', 'Created', 'Close Time', 'Queue'])

In [3]:
# Column order of the parsed-address table built by parse().
columns = [
    'Ticket #',
    'Type',
    'House #',
    'Apartment #',
    'Building #',
    'Building Name',
    'Street',
    'Road',
    'Area & Sub Area',
    'Neighbourhood',
    'City',
]

In [41]:
# Fixed, deterministic slice: ticket id and raw address of the first 20 rows.
test2 = df[['Ticket#', 'Address']].iloc[0:20]

test2.head()


Unnamed: 0,Ticket#,Address
0,105207394681,"House # SC-4, Al Kareem Centre, Flat# 107, 1s..."
1,109247984771,"House # Plot# 28-C, Flat A-2, 1st Floor, Lane..."
2,101042042052,"Apartment/Suite# Flat 204, Building All Block..."
3,101087613080,"Apartment/Suite# BI3, Building All Blocks, Ib..."
4,107178824611,"Apartment/Suite# 26, Building Block B, Marhab..."


In [4]:
# Draw a sample from `df` through the project helper.
# NOTE(review): 50 is presumably the sample size and the list the columns to
# keep — confirm against data_processor.create_random_sample.
# NOTE(review): no seed is passed here; confirm whether the helper is seeded,
# otherwise the preview below changes on every run.
test3 = data_processor.create_random_sample(df, 50, ['Ticket#', 'Address'])

test3.head()

Unnamed: 0,Ticket#,Address
51040,107649302947,"House # D-75, Shahra-e-Attar, Block 4, Clifto..."
124955,100959675094,"House # A-100, Street 1, Block 12, Federal B ..."
92962,105347134691,House # ST-03 DEBS (D Education Bureau School...
53390,105367715493,"House # Flat 16 3rd Floor GOR 1, Bath Island ..."
133275,102073951466,"House # 8/1, Khayaban-e-Tanzeem, Phase 5, Def..."


In [5]:
# Accumulator frame that parse() appends its rows to; built by the project
# helper from the `columns` list only (presumably empty — confirm).
address_df = data_preprocessor.create_dataframe(columns)

# TODO: instead of creating a new frame on every run, load a previously stored one from a file so that earlier results are kept and new ones are added to it.

In [6]:
# Hierarchical and field-based parsing:

def _pop_field(field, tokens):
    """Remove and return the token that ``data_processor.field_finder`` reports for ``field``.

    Returns the stripped token, or ``None`` when ``field_finder`` returns
    ``None``. ``tokens`` is mutated in place, so the order of calls matters.
    """
    position = data_processor.field_finder(field, tokens)
    if position is None:
        return None
    return tokens.pop(position).strip()


def _tokens_at(tokens, positions):
    """Return the stripped tokens found at ``positions`` (nothing is removed)."""
    return [tokens[position].strip() for position in positions]


def _value_or_none(value):
    """Map a missing (``None``) field to the ``'None'`` placeholder string."""
    return 'None' if value is None else value


def parse(dataframe):
    """Split every address in ``dataframe`` into fields and append them to ``address_df``.

    For each row the address is run through ``data_processor.pre_processing``;
    element ``[1]`` of its result is used as the address type and element
    ``[2]`` as the token list. The last token is taken as the city and the one
    before it as the neighbourhood. Road, street, apartment, floor, house and
    area tokens are then removed in that order via
    ``data_processor.field_finder``, and whatever
    ``data_processor.probabilistic_identifiers`` reports is merged into the
    area / building fields. Fields that stay empty hold the string ``'None'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``'Address'`` and ``'Ticket#'``.

    Returns
    -------
    The module-level ``address_df`` with one additional row per input row.

    Raises
    ------
    IndexError
        If an address yields fewer than two tokens (city and neighbourhood
        are popped unconditionally).

    Notes
    -----
    * Side effect: the module-level ``address_df`` is rebound, so calling
      this twice with the same input appends the rows twice.
    * Uses the module-level ``columns`` list when building each row frame.
    * The row index delivered by ``data_preprocessor.create_dataframe`` is
      kept as is (no ``ignore_index``).
    """
    global address_df

    list_of_addresses = dataframe['Address'].tolist()
    tickets = dataframe['Ticket#'].tolist()

    # Row frames are collected here and concatenated once after the loop;
    # concatenating inside the loop re-copies the whole table for every row.
    row_frames = []

    for ticketno, address in zip(tickets, list_of_addresses):
        address_info = data_processor.pre_processing(address)
        address_type = address_info[1]
        tokenized_address = address_info[2]
        # Untouched copy of the tokens, required by probabilistic_identifiers.
        reference_tokenized_address = list(tokenized_address)

        # Hierarchical part: the address ends with "..., neighbourhood, city".
        city = tokenized_address.pop(-1).strip()
        neighbourhood = tokenized_address.pop(-1).strip()

        # Field part: every lookup removes its token, so the order is fixed.
        road = _value_or_none(_pop_field('road', tokenized_address))
        street = _value_or_none(_pop_field('street', tokenized_address))

        # Apartment and floor share one output column.
        apartment = _value_or_none(_pop_field('apartment', tokenized_address))
        floor = _pop_field('floor', tokenized_address)
        if floor is not None:
            if apartment == 'None':
                apartment = floor
            else:
                # The floor token is stripped like every other field before
                # joining, which avoids doubled blanks inside the value.
                apartment = ' '.join((apartment, floor)).strip()

        house = _value_or_none(_pop_field('house', tokenized_address))
        area = _value_or_none(_pop_field('area', tokenized_address))

        p_area_index, p_buildingname_index, p_buildingnumber_index = data_processor.probabilistic_identifiers(reference_tokenized_address, tokenized_address)

        # Probabilistic area tokens are appended after an explicit area token.
        if len(p_area_index) > 0:
            area_parts = [] if area == 'None' else [area]
            area_parts.extend(_tokens_at(tokenized_address, p_area_index))
            area = ', '.join(area_parts).strip()

        building_name = 'None'
        building_number = 'None'

        if address_type == 'house':
            # A house has no building fields: building-like tokens are put in
            # front of the area instead.
            if len(p_buildingname_index) + len(p_buildingnumber_index) > 0:
                area_indexes_more = p_buildingnumber_index + p_buildingname_index
                area_parts = _tokens_at(tokenized_address, area_indexes_more)
                if area != 'None':
                    area_parts.append(area)
                area = ', '.join(area_parts).strip()
        else:
            if len(p_buildingname_index) > 0:
                building_name = ', '.join(_tokens_at(tokenized_address, p_buildingname_index)).strip()

            if len(p_buildingnumber_index) > 0:
                building_number = ', '.join(_tokens_at(tokenized_address, p_buildingnumber_index)).strip()

            # Drop the consumed tokens. All three index lists refer to the
            # same, not yet modified token list, so they have to be removed in
            # ONE descending pass; popping list after list works on stale
            # positions and removes wrong tokens or raises IndexError.
            consumed_positions = set(p_area_index) | set(p_buildingname_index) | set(p_buildingnumber_index)
            for position in sorted(consumed_positions, reverse=True):
                tokenized_address.pop(position)

        # Shifting entries: a house without a house number takes over the
        # apartment value.
        if address_type == 'house' and house == 'None' and apartment != 'None':
            house = apartment
            apartment = 'None'

        # Same payload shape as before: one single-element list per column.
        data = {
            'Ticket #': [ticketno],
            'Type': [address_type],
            'House #': [house],
            'Apartment #': [apartment],
            'Building #': [building_number],
            'Building Name': [building_name],
            'Street': [street],
            'Road': [road],
            'Area & Sub Area': [area],
            'Neighbourhood': [neighbourhood],
            'City': [city],
        }

        row_frames.append(data_preprocessor.create_dataframe(columns, data, datacheck=True))

    if row_frames:
        address_df = pd.concat([address_df] + row_frames, axis=0)

    return address_df



# Alternative input: the fixed 20-row slice instead of the random sample.
# parse(test2)

# parse() appends to the module-level address_df, so the return value is not
# needed here. Re-running this cell without re-creating address_df first adds
# the same rows again.
parse(test3)


# Preview of the parsed result.
address_df.head()


Unnamed: 0,Ticket #,Type,House #,Apartment #,Building #,Building Name,Street,Road,Area & Sub Area,Neighbourhood,City
0,107649302947,house,house # d - 75,,,,,shahrah - e - attar,block 4,clifton,karachi
0,100959675094,house,house # a - 100,,,,street 1,,block 12,federal b area,karachi
0,105347134691,house,,,,,house # street - 03 debs ( d education bureau ...,a s usmani road,block 2,gulshan - e - iqbal,karachi
0,105367715493,apartment,,house # flat 16 3rd floor gor 1,,,,bath island road,bath island,clifton,karachi
0,102073951466,house,house # 8 / 1,,,,,khayaban - e - tanzeem,phase 5,defence,karachi


In [7]:
# Earlier export target, kept for reference.
# address_df.to_csv('data/data_a3.csv', index=False)

# Write the parsed addresses to CSV; index=False leaves the row index out of the file.
address_df.to_csv('data/datarand_a4.csv', index=False)

In [None]:
# df.to_excel('data/data.xlsx', sheet_name = 'Sheet1', index=False)