In [4]:
pip install transliterate

Collecting transliterate
  Using cached transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
Installing collected packages: transliterate
Successfully installed transliterate-1.10.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: D:\Python\Python310\python.exe -m pip install --upgrade pip


In [5]:
import os
import pandas as pd
from transliterate import translit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import QuantileTransformer
from scipy.sparse import csr_matrix, hstack
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [6]:
add_data = "additional_data"

df_building = pd.read_csv(f"{add_data}/building_20230808.csv", low_memory = False)
df_building = df_building.rename(columns={'id': 'target_building_id'})

path = 'datasets/'
files = os.listdir(path)

for i, file in enumerate(files):
    if i == 0:
        df = pd.read_csv(path + file, low_memory = False)
    else:
        temp = pd.read_csv(path + file, low_memory = False)
        df = pd.concat([df, temp], ignore_index = True)
        
use = ['address', 'target_address']
df.drop_duplicates(subset = use, inplace = True)
df.drop_duplicates(subset = 'address', inplace = True)

df = df.merge(df_building, how = 'left', on = ['target_building_id'])

df = df[df['is_actual'] == True]
df['target_building_id'] = df['target_building_id'].astype(int)

df['address'] = df['address'].apply(lambda x: translit(x, 'ru'))

In [7]:
%%time
count_char = CountVectorizer(analyzer='char_wb', ngram_range=(1, 2), token_pattern='\\w+', max_df=0.9)
count_ngram = CountVectorizer(ngram_range=(1, 1), token_pattern='\\w+', max_df=0.9)

char_csr = count_char.fit_transform(df['target_address'])
ngram_csr = count_ngram.fit_transform(df['target_address'])

csr = hstack([char_csr, ngram_csr])
sca = QuantileTransformer()
csr = sca.fit_transform(csr)
csr

CPU times: total: 15.7 s
Wall time: 15.8 s


<76655x4365 sparse matrix of type '<class 'numpy.float64'>'
	with 2985636 stored elements in Compressed Sparse Column format>

# ТЕСТ - ВЫБОРКА ИЗ ТРЕЙН ДАТАСЕТА

In [8]:
np.random.seed(0)
TEST_SIZE = 100
num_tests = np.random.randint(0, df.shape[0], size=TEST_SIZE)
y_test = df['target_building_id'].iloc[num_tests]
char_csr_test = count_char.transform(df['address'])
ngram_csr_test = count_ngram.transform(df['address'])
csr_test = hstack([char_csr_test, ngram_csr_test])
csr_test = sca.transform(csr_test)
csr_test

<76655x4365 sparse matrix of type '<class 'numpy.float64'>'
	with 2362819 stored elements in Compressed Sparse Column format>

In [9]:
%%time

neigh = NearestNeighbors(n_neighbors=10, metric='cosine')
neigh.fit(csr)

CPU times: total: 125 ms
Wall time: 121 ms


NearestNeighbors(metric='cosine', n_neighbors=10)

In [10]:
# если 1 из top_n предсказанных - верный
PREDICT_SIZE = 100
pred = neigh.kneighbors(csr_test[num_tests], PREDICT_SIZE, return_distance=False)

def accuracy_score_(y_test, pred, top_n=10):
    y_pred_ = []
    for i in range(len(pred)):
        y_test_i = y_test.iloc[i]
        y_pred_i = df['target_building_id'].iloc[pred[i].flatten()].drop_duplicates().values[:top_n]
        if (y_test_i in y_pred_i):
            y_pred_.append(y_test_i)
        else:
            y_pred_.append(y_pred_i[0])

    return accuracy_score(y_test, y_pred_)

print('Accuracy_scores:')
print('1 ответ на 1 запрос', accuracy_score_(y_test, pred, top_n=1))
print('3 ответа на 1 запрос', accuracy_score_(y_test, pred, top_n=3))
print('10 ответов на 1 запрос', accuracy_score_(y_test, pred, top_n=10))

Accuracy_scores:
1 ответ на 1 запрос 0.71
3 ответа на 1 запрос 0.81
10 ответов на 1 запрос 0.9


# ТЕСТОВЫЙ ДАТАСЕТ

In [11]:
df_test = pd.read_csv('test_example.csv', sep=';')

In [12]:
np.random.seed(0)
y_test_ = df_test['target_building_id']
char_csr_test_ = count_char.transform(df_test['address'])
ngram_csr_test_ = count_ngram.transform(df_test['address'])
csr_test_ = hstack([char_csr_test_, ngram_csr_test_])
csr_test_ = sca.transform(csr_test_)
csr_test_

<328x4365 sparse matrix of type '<class 'numpy.float64'>'
	with 10177 stored elements in Compressed Sparse Column format>

In [13]:
# если 1 из top_n предсказанных - верный
PREDICT_SIZE = 100
pred_ = neigh.kneighbors(csr_test_, PREDICT_SIZE, return_distance=False)

print('Accuracy_scores:')
print('1 ответ на 1 запрос', accuracy_score_(y_test_, pred_, top_n=1))
print('3 ответа на 1 запрос', accuracy_score_(y_test_, pred_, top_n=3))
print('10 ответов на 1 запрос', accuracy_score_(y_test_, pred_, top_n=10))

Accuracy_scores:
1 ответ на 1 запрос 0.6280487804878049
3 ответа на 1 запрос 0.7103658536585366
10 ответов на 1 запрос 0.7896341463414634
