In [1]:
import numpy as np
import pandas as pd
import re
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from time import process_time

# Stop-word list: whitespace-separated tokens read from a text file.
# The context manager closes the handle instead of leaking it. The encoding is
# made explicit to match the dataset reader in open_data.
# NOTE(review): assumes russian.txt is UTF-8 like the dataset files - confirm.
with open('/content/drive/MyDrive/Colab Notebooks/russian.txt', encoding='utf-8') as stops_file:
    stops = stops_file.read().split()

def open_data(size):
    """Load the train/test split stored for one dataset size.

    The file ``data_{size}.txt`` must contain exactly four sections separated
    by a line holding ``&&&``. The sections are returned in file order as
    ``y_train, x_train, y_test, x_test``; each one is split into lines.

    Args:
        size: Value substituted into the file name.

    Returns:
        Tuple ``(y_train, x_train, y_test, x_test)`` of lists of strings.

    Raises:
        ValueError: If the file does not split into exactly four sections.
    """
    path = f'/content/drive/MyDrive/Colab Notebooks/data_spacy/data_{size}.txt'
    # Read inside a context manager so the file handle is always closed.
    with open(path, encoding='utf-8') as data_file:
        sections = data_file.read().split('\n&&&\n')
    y_train, x_train, y_test, x_test = (section.split('\n') for section in sections)
    return y_train, x_train, y_test, x_test

In [2]:
def ap(relev, k):    # average precision
    """Compute average precision over ranks ``1 .. k-1`` of a result list.

    Args:
        relev: Relevance flags in rank order (1 = relevant, 0 = not relevant).
        k: Exclusive upper bound on the rank; ``relev[0] .. relev[k-2]`` are
            inspected.

    Returns:
        The sum of precision@i over the relevant ranks divided by the total
        number of relevant flags in ``relev``; 0.0 when that total is zero.
    """
    # Compare by value: `is 1` tests object identity, which is only true for
    # CPython's cached small ints and is false for True, 1.0 or numpy integers.
    precisions = [sum(relev[:i]) / i for i in range(1, k) if relev[i - 1] == 1]
    total_relevant = sum(relev)
    if total_relevant == 0:
        # No relevant documents at all: average precision is defined as 0 here.
        return 0.0
    return sum(precisions) / total_relevant

def evaluation(query, relev, index=0, k=10):    # precision and average precision at k
    """Build a one-row frame with precision and average precision for a query.

    Args:
        query: The query itself; accepted but not used in the computation.
        relev: Relevance flags of the ranked results.
        index: Label used as the index of the returned row.
        k: Cut-off rank.

    Returns:
        ``pd.DataFrame`` with the columns ``precision`` and
        ``average_precision`` and a single row labelled ``index``.
    """
    # Plain precision: the rank of the hits is not taken into account.
    precision_at_k = round(sum(relev) / k, 4)
    # Average precision: the rank is taken into account. ap() uses its second
    # argument as an exclusive bound, hence k + 1.
    average_precision = ap(relev, k + 1)
    row = {'precision': precision_at_k,
           'average_precision': average_precision}
    return pd.DataFrame(row, index=[index])

def retrieval(fit, predict, y_train, x_train, y_test, x_test):
    """Fit a model, run every test query through it and score the rankings.

    Args:
        fit: Callable ``fit(x_train)`` returning the fitted representation.
        predict: Callable ``predict(x, query, y_train)`` returning ranked
            results; element ``[1]`` of each result is compared to the
            expected label. At most 10 results are supported.
        y_train: Labels of the training items, passed through to ``predict``.
        x_train: Training items passed to ``fit``.
        y_test: Expected label for each query, aligned with ``x_test``.
        x_test: Queries to evaluate.

    Returns:
        Tuple ``(df, train_time, test_time)``. ``df`` has the columns
        ``query``, ``precision`` and ``average_precision`` with one row per
        query, indexed by the query position; the ``query`` column is never
        filled. The two times are CPU seconds measured with ``process_time``.
    """
    columns = ['query', 'precision', 'average_precision']

    train_start = process_time()
    x = fit(x_train)
    train_time = process_time() - train_start

    # Collect the per-query rows and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call, which made the loop quadratic.
    rows = []
    test_start = process_time()
    for index, query in enumerate(x_test):
        predictions = predict(x, query, y_train)
        relev = [0] * 10
        for i, pred in enumerate(predictions):
            if pred[1] == y_test[index]:
                relev[i] = 1
        if any(relev):
            rows.append(evaluation(query, relev, index))
        else:
            # No hit among the results: both metrics are zero.
            rows.append(pd.DataFrame({'precision': 0.0,
                                      'average_precision': 0.0},
                                     index=[index]))
    test_time = process_time() - test_start

    if rows:
        # reindex restores the (empty) 'query' column and the column order.
        df = pd.concat(rows).reindex(columns=columns)
    else:
        df = pd.DataFrame(columns=columns)
    return df, train_time, test_time
# map = sum(ap)/Q    # Q - number of queries
# recall = (number of queries with precision > 0)/Q
# Results table, created empty; the evaluation loops add one summary row per
# (embedding, dataset size) run.
MAP = pd.DataFrame(
    columns=['embedding', 'train size', 'test size',
             'training time', 'inference time', 'recall', 'MAP'],
)

In [3]:
# One independent CountVectorizer per dataset size. List multiplication
# (`[vectorizer] * n`) repeats a single shared instance, so every size would
# refit the very same object; the comprehensions create separate estimators.
vectorizers = [CountVectorizer(stop_words=stops) for _ in range(2)]  # 100, 500
vectorizers += [
    CountVectorizer(max_df=0.7, min_df=0.1, max_features=15000, stop_words=stops)
    for _ in range(6)
]  # 837, 1000, 5000, 10000, 20000, 30000

# fitting
def CV_fit(x_train):
    """Fit the module-level ``vectorizer`` on ``x_train``.

    Returns the result of ``fit_transform`` converted with ``toarray()``.
    """
    return vectorizer.fit_transform(x_train).toarray()

# similarity
def CV_predict(x, query, y_train):
    """Rank the training items by cosine similarity to ``query``.

    Args:
        x: Vectors of the training items, one row per item.
        query: Query passed to the module-level ``vectorizer``.
        y_train: Labels aligned with the rows of ``x``.

    Returns:
        Up to 10 ``[similarity, label]`` pairs, sorted in descending order
        (ties on similarity are ordered by label, also descending).
    """
    # Flatten to 1-D: scipy's cosine distance is defined for 1-D vectors and
    # newer scipy versions reject the (1, n) array returned by toarray().
    vec = vectorizer.transform([query]).toarray().ravel()
    query_is_zero = not np.any(vec)

    simil = []
    for vector, film in zip(x, y_train):
        if query_is_zero or not np.any(vector):
            # Cosine similarity is undefined for a zero vector; scipy yields
            # NaN there, and NaN breaks the ordering of the sort below.
            score = 0.0
        else:
            score = 1 - spatial.distance.cosine(vector, vec)
        simil.append([score, film])
    simil.sort(reverse=True)
    return simil[:10]

# Evaluate CountVectorizer on every dataset size and record one summary row
# per size in MAP (row labels 0..7).
# `vectorizer` is deliberately the loop variable: CV_fit and CV_predict read it
# as a global.
for i, (size, vectorizer) in enumerate(
        zip((100, 500, 837, 1000, 5000, 10000, 20000, 30000), vectorizers)):
    y_train, x_train, y_test, x_test = open_data(size)
    df, train_time, test_time = retrieval(CV_fit, CV_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')
    n_queries = len(y_test)
    # Queries whose result list contained at least one relevant item.
    n_answered = sum(1 for prec in df.precision if prec != 0)
    summary_row = pd.DataFrame({'embedding': 'CountVectorizer',
                                'train size': len(y_train),
                                'test size': n_queries,
                                'training time': round(train_time, 2),
                                'inference time': round(test_time / n_queries, 2),
                                'recall': round(n_answered / n_queries, 4),
                                'MAP': round(sum(df.average_precision) / n_queries, 4)},
                               index=[i])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    MAP = pd.concat([MAP, summary_row])
    # The CSV is rewritten after every size, so it always holds the rows
    # accumulated so far.
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/data_spacy/MAP_kernel_plot.csv', index=False)

done 100
done 500


  dist = 1.0 - uv / np.sqrt(uu * vv)


done 837
done 1000
done 5000
done 10000
done 20000
done 30000


In [None]:
# One independent TfidfVectorizer per dataset size. List multiplication
# (`[vectorizer] * n`) repeats a single shared instance, so every size would
# refit the very same object; the comprehensions create separate estimators.
vectorizers = [TfidfVectorizer(stop_words=stops) for _ in range(2)]  # 100, 500
vectorizers += [
    TfidfVectorizer(max_df=0.7, min_df=0.1, max_features=15000, stop_words=stops)
    for _ in range(6)
]  # 837, 1000, 5000, 10000, 20000, 30000

# fitting
def tfidf_fit(x_train):
    """Fit the module-level ``vectorizer`` on ``x_train``.

    Returns the result of ``fit_transform`` converted with ``toarray()``.
    """
    fitted_matrix = vectorizer.fit_transform(x_train)
    dense_matrix = fitted_matrix.toarray()
    return dense_matrix

# similarity
def tfidf_predict(x, query, y_train):
    """Rank the training items by cosine similarity to ``query``.

    Args:
        x: Vectors of the training items, one row per item.
        query: Query passed to the module-level ``vectorizer``.
        y_train: Labels aligned with the rows of ``x``.

    Returns:
        Up to 10 ``[similarity, label]`` pairs, sorted in descending order
        (ties on similarity are ordered by label, also descending).
    """
    # Flatten to 1-D: scipy's cosine distance is defined for 1-D vectors and
    # newer scipy versions reject the (1, n) array returned by toarray().
    vec = vectorizer.transform([query]).toarray().ravel()
    query_is_zero = not np.any(vec)

    simil = []
    for vector, film in zip(x, y_train):
        if query_is_zero or not np.any(vector):
            # Cosine similarity is undefined for a zero vector; scipy yields
            # NaN there, and NaN breaks the ordering of the sort below.
            score = 0.0
        else:
            score = 1 - spatial.distance.cosine(vector, vec)
        simil.append([score, film])
    simil.sort(reverse=True)
    return simil[:10]


# Evaluate TF-IDF on every dataset size and record one summary row per size in
# MAP (row labels 8..15).
# `vectorizer` is deliberately the loop variable: tfidf_fit and tfidf_predict
# read it as a global.
for i, (size, vectorizer) in enumerate(
        zip((100, 500, 837, 1000, 5000, 10000, 20000, 30000), vectorizers),
        start=8):
    y_train, x_train, y_test, x_test = open_data(size)
    df, train_time, test_time = retrieval(tfidf_fit, tfidf_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')
    n_queries = len(y_test)
    # Queries whose result list contained at least one relevant item.
    n_answered = sum(1 for prec in df.precision if prec != 0)
    summary_row = pd.DataFrame({'embedding': 'TF-IDF',
                                'train size': len(y_train),
                                'test size': n_queries,
                                'training time': round(train_time, 2),
                                'inference time': round(test_time / n_queries, 2),
                                'recall': round(n_answered / n_queries, 4),
                                'MAP': round(sum(df.average_precision) / n_queries, 4)},
                               index=[i])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    MAP = pd.concat([MAP, summary_row])
    # The CSV is rewritten after every size, so it always holds the rows
    # accumulated so far.
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/data_spacy/MAP_kernel_plot.csv', index=False)