In [1]:
import os
import random

import pandas as pd
from pandarallel import pandarallel
# Start pandarallel with 12 worker processes, progress bars enabled, and the
# memory file system used to move data between the main process and workers.
# NOTE(review): nb_workers=12 is hard-coded — confirm it suits the target machine.
pandarallel.initialize(progress_bar=True, use_memory_fs=True, nb_workers=12)
import numpy as np

from tqdm.auto import tqdm
# Register tqdm's progress-reporting apply helpers on pandas objects.
tqdm.pandas()

from utils.metrics import f1_macro

# Directories used by the later cells (all relative to the working directory).
data_path = "./data/"  # holds the raw address CSV that is read into df_test
mapping_path = "./mapping/"  # holds region/municipality/settlement/location/street mapping CSVs
save_path = "./submissions/"  # where the parsed result CSV is written

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Read the raw address file; every column is loaded as text (dtype=str).
test_csv_path = os.path.join(data_path, "addresses-close.csv")
df_test = pd.read_csv(test_csv_path, dtype=str, encoding='windows-1251')
df_test.shape

(5112420, 1)

In [3]:
def _load_type_mapper(level):
    """Read ``<mapping_path>/<level>.csv`` and give it level-independent column names.

    The level-specific columns ``<level>_type``, ``<level>_type_str`` and
    ``<level>_type_str.1`` are renamed to ``address_type``, ``address_type_str``
    and ``count`` so that all five mappers share one schema.

    Parameters
    ----------
    level : str
        Address level name; it is both the CSV file stem and the column prefix
        (e.g. ``"region"``).

    Returns
    -------
    pandas.DataFrame
        The mapping table with the renamed columns.
    """
    renames = {
        f"{level}_type": "address_type",
        f"{level}_type_str": "address_type_str",
        # NOTE(review): the ".1" suffix looks like the name pandas gives to a
        # repeated header column — confirm against the CSV files.
        f"{level}_type_str.1": "count",
    }
    return pd.read_csv(os.path.join(mapping_path, f"{level}.csv")).rename(columns=renames)


# One mapping table per address level, all with the same three column names.
region_mapper = _load_type_mapper("region")
municipality_mapper = _load_type_mapper("municipality")
settlement_mapper = _load_type_mapper("settlement")
location_mapper = _load_type_mapper("location")
street_mapper = _load_type_mapper("street")

In [4]:
# Hand-written abbreviations appended to the street and settlement mappers.
# Each row is [address_type, address_type_str, count].
mapper_columns = ["address_type", "address_type_str", "count"]

street_mapper_add = pd.DataFrame(
    [
        ["съезд", "сзд", 1],
        ["железнодорожная казарма", "ж/д к-ма.", 2],
    ],
    columns=mapper_columns,
)
# ignore_index=True renumbers the rows 0..n-1 after appending.
street_mapper = pd.concat([street_mapper, street_mapper_add], ignore_index=True)

settlement_mapper_add = pd.DataFrame(
    [
        ["железнодорожный пост", "ж/д п", 1],
    ],
    columns=mapper_columns,
)
settlement_mapper = pd.concat([settlement_mapper, settlement_mapper_add], ignore_index=True)

In [5]:
# Every raw address is a single ';'-separated string; split it into seven
# named fields, one column per field.
field_names = [
    "region_str",
    "municipality_str",
    "settlement_str",
    "location_str",
    "street_str",
    "house",
    "source",
]
split_rows = df_test["address"].str.split(';').tolist()
df_pred = pd.DataFrame(split_rows, columns=field_names)
df_pred.head()

Unnamed: 0,region_str,municipality_str,settlement_str,location_str,street_str,house,source
0,А.обл. Еврейская,,г Биробиджан,,переулок Авангардный,дом 6,1
1,аобл Еврейская,,г. Биробиджан,,пер-к Алмазный,дом 11,2
2,Аобл Еврейская,,г Биробиджан,,пер-к Ангарский,дом 2,2
3,а.обл. Еврейская,,г. Биробиджан,,переулок Апрельский,дом 1Б,2
4,Аобл Еврейская,,Г. Биробиджан,,переулок Апрельский,дом 6,2


In [6]:
# Trim surrounding spaces from the two trailing fields; `source` is then
# converted to an integer column.
house_trimmed = df_pred["house"].str.strip(' ')
source_trimmed = df_pred["source"].str.strip(' ')
df_pred["house"] = house_trimmed
df_pred["source"] = source_trimmed.astype(int)
df_pred.head()

Unnamed: 0,region_str,municipality_str,settlement_str,location_str,street_str,house,source
0,А.обл. Еврейская,,г Биробиджан,,переулок Авангардный,дом 6,1
1,аобл Еврейская,,г. Биробиджан,,пер-к Алмазный,дом 11,2
2,Аобл Еврейская,,г Биробиджан,,пер-к Ангарский,дом 2,2
3,а.обл. Еврейская,,г. Биробиджан,,переулок Апрельский,дом 1Б,2
4,Аобл Еврейская,,Г. Биробиджан,,переулок Апрельский,дом 6,2


In [7]:
def map_str(address):
    """Split one raw address field into an address type and a name.

    The function reads three module-level names that the caller must set
    before applying it: ``address_mapper`` (a DataFrame with the columns
    ``address_type_str``, ``address_type`` and ``count``), ``column_name_type``
    and ``column_name`` (the labels of the two values in the returned Series).

    The field is split on whitespace. By default the first token is the type
    and the remaining tokens, joined by single spaces, are the name (a
    single-token field therefore yields that token as the type and an empty
    name). Every leading run of tokens that leaves at least one token for the
    name is then looked up in ``address_mapper["address_type_str"]``; on a hit
    the type becomes the ``address_type`` of the matching row with the largest
    ``count`` and the name becomes the remaining tokens. Prefixes are tried
    from shortest to longest, so the longest matching prefix wins.

    Parameters
    ----------
    address : str or None
        Raw field text. Empty, whitespace-only and non-string values (for
        example ``None`` from a row with too few ';' separated fields) are
        treated as a blank field.

    Returns
    -------
    pandas.Series
        Two entries labelled ``column_name_type`` and ``column_name``; both
        are ``''`` for a blank field.
    """
    # Blank field: the original check only recognised the exact two-space
    # string, so '' / ' ' / None crashed further down instead of mapping to blanks.
    if not isinstance(address, str) or not address.strip():
        return pd.Series({column_name: '', column_name_type: ''})

    tokens = address.split()

    # Fallback when no prefix is known to the mapper.
    result = {
        column_name_type: tokens[0],
        column_name: ' '.join(tokens[1:]),
    }

    for i in range(1, len(tokens)):
        prefix = ' '.join(tokens[:i])
        # Counts of the mapper rows spelled exactly like this prefix. Missing
        # counts are dropped so idxmax never sees an empty/all-NA selection.
        counts = address_mapper.loc[address_mapper["address_type_str"] == prefix, "count"].dropna()
        if counts.empty:
            continue  # unknown abbreviation: keep the best result so far
        result[column_name_type] = address_mapper.loc[counts.idxmax(), "address_type"]
        result[column_name] = ' '.join(tokens[i:])

    return pd.Series(result)

In [8]:
# One parsing stage per address level, in the order they are applied:
# (mapper table, raw source column, output type column, output name column).
parse_stages = [
    (region_mapper, "region_str", "region_type", "region"),
    (municipality_mapper, "municipality_str", "municipality_type", "municipality"),
    (settlement_mapper, "settlement_str", "settlement_type", "settlement"),
    (location_mapper, "location_str", "location_type", "location"),
    (street_mapper, "street_str", "street_type", "street"),
]

for stage_mapper, source_column, type_column, name_column in parse_stages:
    # map_str reads these three module-level names, so they must be rebound
    # before each parallel_apply call.
    address_mapper = stage_mapper
    column_name_type = type_column
    column_name = name_column

    parsed = df_pred[source_column].parallel_apply(map_str)
    # Put the freshly parsed columns in front, then drop the raw source column.
    df_pred = pd.concat([parsed, df_pred], axis=1).drop(columns=[source_column])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426035), Label(value='0 / 426035')…

  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426035), Label(value='0 / 426035')…

  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426035), Label(value='0 / 426035')…

  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426035), Label(value='0 / 426035')…

  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426035), Label(value='0 / 426035')…

  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(
  return data.apply(


In [9]:
# Replace every missing value with an empty string.
df_pred = df_pred.fillna(value='')

In [10]:
# Write the parsed addresses without the index column, in windows-1251 encoding.
submission_file = os.path.join(save_path, "close_5.csv")
df_pred.to_csv(submission_file, encoding="windows-1251", index=False)