In [1]:
import os
import pandas as pd
import numpy as np
import re
from transliterate import translit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import QuantileTransformer
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score
import pymorphy2

import warnings
warnings.filterwarnings('ignore')
import platform

In [2]:
def normalizeString(s):
    s = re.sub(r'([\/\\\.\-a-zA-Z-а-яА-Я!?]+)([0-9]+)', r" \1 \2", s) 
    return s

In [3]:
%%time
add_data = "./train_dataset_Датасет/additional_data"

if platform.node() == 'vlad-F17':
    add_data = "../input/additional_data"

df_building = pd.read_csv(f"{add_data}/building_20230808.csv", low_memory = False)
df_building = df_building.rename(columns={'id': 'target_building_id'})

path = './train_dataset_Датасет/datasets/'
if platform.node() == 'vlad-F17':
    path = "../input/datasets/"

files = os.listdir(path)

for i, file in enumerate(files):
    if i == 0:
        df = pd.read_csv(path + file, low_memory = False)
    else:
        temp = pd.read_csv(path + file, low_memory = False)
        df = pd.concat([df, temp], ignore_index = True)
        
use = ['address', 'target_address']
df.drop_duplicates(subset = use, inplace = True)
df.drop_duplicates(subset = 'address', inplace = True)

df = df.merge(df_building, how = 'left', on = ['target_building_id'])

df = df[df['is_actual'] == True]
df['target_building_id'] = df['target_building_id'].astype(int)


df['address'] = df['address'].apply(lambda x: translit(x, 'ru'))
df['address'] = df['address'].apply(lambda x: normalizeString(x))

df['target_address'] = df['target_address'].apply(lambda x: normalizeString(x))

CPU times: user 2.51 s, sys: 147 ms, total: 2.66 s
Wall time: 2.77 s


In [4]:
%%time
RANDOM_STATE = 0
TOKEN_PATTERN = '\\w+'
count_char = CountVectorizer(
    analyzer = 'char_wb', 
    ngram_range = (1, 1), 
    token_pattern=TOKEN_PATTERN, 
    max_df = 0.8
)

count_char2 = CountVectorizer(
    analyzer = 'char_wb', 
    ngram_range = (1, 2), 
    token_pattern=TOKEN_PATTERN, 
    max_df = 0.2
)

count_ngram = CountVectorizer(
    ngram_range=(1, 1), 
    token_pattern=TOKEN_PATTERN, 
    max_df = 0.9
)

char_csr = count_char.fit_transform(df['target_address'])
ngram_csr = count_ngram.fit_transform(df['target_address'])
char_csr2 = count_char2.fit_transform(df['target_address'])


csr = hstack([char_csr, ngram_csr, char_csr2])
sca = QuantileTransformer(random_state = RANDOM_STATE)
csr = sca.fit_transform(csr)
csr

CPU times: user 8.66 s, sys: 464 ms, total: 9.13 s
Wall time: 9.15 s


<76655x4296 sparse matrix of type '<class 'numpy.float64'>'
	with 2047755 stored elements in Compressed Sparse Column format>

# ТЕСТ - ВЫБОРКА ИЗ ТРЕЙН ДАТАСЕТА

In [5]:
np.random.seed(0)
TEST_SIZE = 5000

num_tests = np.random.randint(0, df.shape[0], size = TEST_SIZE)
y_test = df['target_building_id'].iloc[num_tests]
char_csr_test = count_char.transform(df['address'])
ngram_csr_test = count_ngram.transform(df['address'])
char_csr_test2 = count_char2.transform(df['address'])

csr_test = hstack([char_csr_test, ngram_csr_test, char_csr_test2])
csr_test = sca.transform(csr_test)
csr_test

<76655x4296 sparse matrix of type '<class 'numpy.float64'>'
	with 1963738 stored elements in Compressed Sparse Column format>

In [6]:
%%time

neigh = NearestNeighbors(n_neighbors=10, metric='cosine')
neigh.fit(csr)

CPU times: user 32.7 ms, sys: 4.04 ms, total: 36.7 ms
Wall time: 36.3 ms


In [7]:
# если 1 из top_n предсказанных - верный
PREDICT_SIZE = 100
pred = neigh.kneighbors(csr_test[num_tests], PREDICT_SIZE, return_distance=False)

def accuracy_score_(y_test, pred, top_n = 10):
    y_pred_ = []
    for i in range(len(pred)):
        y_test_i = y_test.iloc[i]
        y_pred_i = df['target_building_id'].iloc[pred[i].flatten()].drop_duplicates().values[:top_n]
        if (y_test_i in y_pred_i):
            y_pred_.append(y_test_i)
        else:
            y_pred_.append(y_pred_i[0])

    return accuracy_score(y_test, y_pred_)

print('Accuracy_scores:')
print('1 ответ на 1 запрос', accuracy_score_(y_test, pred, top_n = 1))
print('3 ответа на 1 запрос', accuracy_score_(y_test, pred, top_n = 3))
print('10 ответов на 1 запрос', accuracy_score_(y_test, pred, top_n = 10))

Accuracy_scores:
1 ответ на 1 запрос 0.744
3 ответа на 1 запрос 0.8424
10 ответов на 1 запрос 0.882


# ТЕСТОВЫЙ ДАТАСЕТ

In [8]:
if platform.node() == 'vlad-F17':
    df_test = pd.read_csv('../input/test_example.csv', sep = ';')
else:
    df_test = pd.read_csv('test_example.csv', delimiter =';')


df_test['address'] = df_test['address'].apply(lambda x: translit(x, 'ru'))
df_test['address'] = df_test['address'].apply(lambda x: normalizeString(x))

In [9]:
np.random.seed(0)
y_test_ = df_test['target_building_id']
char_csr_test_ = count_char.transform(df_test['address'])
ngram_csr_test_ = count_ngram.transform(df_test['address'])
char_csr_test2_ = count_char2.transform(df_test['address'])

csr_test_ = hstack([char_csr_test_, ngram_csr_test_, char_csr_test2_])
csr_test_ = sca.transform(csr_test_)
csr_test_

<328x4296 sparse matrix of type '<class 'numpy.float64'>'
	with 9347 stored elements in Compressed Sparse Column format>

In [10]:
# если 1 из top_n предсказанных - верный
PREDICT_SIZE = 100
pred_ = neigh.kneighbors(csr_test_, PREDICT_SIZE, return_distance=False)

print('Accuracy_scores:')
print('1 ответ на 1 запрос', accuracy_score_(y_test_, pred_, top_n=1))
print('3 ответа на 1 запрос', accuracy_score_(y_test_, pred_, top_n=3))
print('10 ответов на 1 запрос', accuracy_score_(y_test_, pred_, top_n=10))

Accuracy_scores:
1 ответ на 1 запрос 0.7347560975609756
3 ответа на 1 запрос 0.7865853658536586
10 ответов на 1 запрос 0.7987804878048781


In [11]:
df_test['targe_model'] = 0
for i in tqdm(range(len(df_test))):
    char_csr_test_ = count_char.transform([df_test['address'].iloc[i]])
    ngram_csr_test_ = count_ngram.transform([df_test['address'].iloc[i]])
    char_csr_test2_ = count_char2.transform([df_test['address'].iloc[i]])
    
    csr_test_ = hstack([char_csr_test_, ngram_csr_test_, char_csr_test2_])
    csr_test_ = sca.transform(csr_test_)
    
    PREDICT_SIZE = 1
    pred_ = neigh.kneighbors(csr_test_, PREDICT_SIZE, return_distance = False)
    y_pred = df['target_building_id'].iloc[pred_.flatten()].drop_duplicates().values[:1]
    df_test['targe_model'].iloc[i] = y_pred

100%|█████████████████████████████████████████████| 328/328 [00:23<00:00, 13.88it/s]


In [12]:
print(accuracy_score(df_test['target_building_id'], df_test['targe_model']))

0.7317073170731707
