In [1]:
import numpy as np
import pyximport
pyximport.install(pyimport=True)


In [2]:
%load_ext Cython

In [149]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp

import numpy as np
from cython.parallel import parallel, prange

ctypedef fused number_t:
    int
    double
    float
    long long

ctypedef fused float_t:
    double
    float

cimport cython


def swap_fitness(
    float[:,::1] population,
    float[:] population_fitness,
    int index,
    float[:,::1] population2,
    float[:] population2_fitness,
    int index2,
):
    """Swap positions between two populations."""
    cdef float[:] temp_swap = population[index].copy()
    cdef float temp_swap_fitness = population_fitness[index]
    population[index] = population2[index2]
    population_fitness[index] = population2_fitness[index2]
    population2[index2] = temp_swap
    population2_fitness[index2] = temp_swap_fitness


@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function 
def best_generation(
    float[:,::1] population,
    float[:] population_fitness,
    float[:,::1] offspring_population,
    float[:] offspring_population_fitness,
    int n_population,
):
    """Get best generation.
    Get best population from population and offspring by fitness.
    Similat to tournament selection.

    """
    cdef int index
    for index in range(n_population):
        # Horizontal swap
        if index + 1 < n_population:
            if population_fitness[index] < population_fitness[index + 1]:
                swap_fitness(
                    population,
                    population_fitness,
                    index,
                    population,
                    population_fitness,
                    index + 1,
                )
            if (
                offspring_population_fitness[index]
                > offspring_population_fitness[index + 1]
            ):
                swap_fitness(
                    offspring_population,
                    offspring_population_fitness,
                    index,
                    offspring_population,
                    offspring_population_fitness,
                    index + 1,
                )
            if (
                offspring_population_fitness[index + n_population]
                > offspring_population_fitness[index + n_population + 1]
            ):
                swap_fitness(
                    offspring_population,
                    offspring_population_fitness,
                    index + n_population,
                    offspring_population,
                    offspring_population_fitness,
                    index + n_population + 1,
                )
        # Vertical swap
        if population_fitness[index] < offspring_population_fitness[index]:
            swap_fitness(
                population,
                population_fitness,
                index,
                offspring_population,
                offspring_population_fitness,
                index,
            )
        if (
            population_fitness[index]
            < offspring_population_fitness[index + n_population]
        ):
            swap_fitness(
                population,
                population_fitness,
                index,
                offspring_population,
                offspring_population_fitness,
                index + n_population,
            )

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef inline float cosine_similarity(float[:] vector_a, float[:] vector_b) nogil:
    """Calculate cosine similarity between two vectors."""
    cdef float dot_product = 0.0
    cdef float norm_a = 0.0
    cdef float norm_b = 0.0
    cdef Py_ssize_t i
    for i in range(vector_a.shape[0]):
        dot_product += vector_a[i] * vector_b[i]
        norm_a += vector_a[i] ** 2
        norm_b += vector_b[i] ** 2
    return dot_product / (norm_a**0.5 * norm_b**0.5)


@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function 
def select(
    float[:,::1] population,
    float[:] population_fitness,
    float[:,::1] offspring_population,
    float[:] offspring_population_fitness,
    int n_population,
    float threshold,
) -> np.ndarray:
    """Select individuals.
    Use shared fitness.
    The fitness is shared using cosine similarity.
    """
    cdef int num_population = population.shape[0]
    # Group individuals by cosine similarity
    cdef int total_population = num_population + offspring_population.shape[0]
    cdef float[:] num_members = np.zeros(total_population, dtype=np.float32)
    population[:, 1] = -100
    offspring_population[:, 1] = -100
    cdef Py_ssize_t index_start_ind = 2
    cdef float simil = 0.0
    # Fitness shared using cosine similarity
   
    cdef Py_ssize_t index, index2
    
    for index in prange(total_population, nogil=True):
        # Calculate cosine similarity for population
        for index2 in range(total_population):
            # Population with itself
            if index < num_population and index2 < num_population:
                simil = cosine_similarity(
                    population[index, index_start_ind:],
                    population[index2, index_start_ind:],
                )
                if simil > threshold:
                    num_members[index] += simil
            # Population with offspring
            elif index < num_population and index2 >= num_population:
                simil = cosine_similarity(
                    population[index, index_start_ind:],
                    offspring_population[index2 - num_population, index_start_ind:],
                )
                if simil > threshold:
                    num_members[index] += simil
            # Offspring with offspring
            elif index >= num_population and index2 >= num_population:
                simil = cosine_similarity(
                    offspring_population[index - num_population, index_start_ind:],
                    offspring_population[index2 - num_population, index_start_ind:],
                )
                if simil > threshold:
                    num_members[index] += simil
            # Offspring with population
            elif index >= num_population and index2 < num_population:
                simil = cosine_similarity(
                    offspring_population[index - num_population, index_start_ind:],
                    population[index2, index_start_ind:],
                )
                if simil > threshold:
                    num_members[index] += simil

    for index in range(population.shape[0]):
        population_fitness[index] = population_fitness[index] / num_members[index]

    for index in range(offspring_population.shape[0]):
        offspring_population_fitness[index] = (
            offspring_population_fitness[index] / num_members[index + num_population]
        )

    best_generation(
        population,
        population_fitness,
        offspring_population,
        offspring_population_fitness,
        n_population,
    )
    

In [150]:
population = np.array([[1,2,3,5], [7,8,9,10]], dtype=np.float32)
population_fitness = np.array([1.,2.], dtype=np.float32)
index = 0
population2 = np.array([[11,12,13,14], [15,16,17,18], [21,22,23,24], [25,26,27,28]], dtype=np.float32)
population2_fitness = np.array([4.,3.,2.,1.], dtype=np.float32)
index2 = 0

select(population, population_fitness, population2, population2_fitness, 2, 0.9)



In [151]:
cosine_similarity(
    np.array([-1,13,-1,13,-1,13], dtype=np.float32), 
    np.array([1,13,2,13,2,13], dtype=np.float32)
)

0.9785742163658142

In [152]:
np.log10([1,13,1,42,42,420])

array([0.        , 1.11394335, 0.        , 1.62324929, 1.62324929,
       2.62324929])

In [153]:
0.5*(1+cosine_similarity(
    np.array([-1.11394335, 1.62324929, 1.62324929,2.62324929]), 
    np.array([1.11394335, 1.62324929, 1.62324929,2.62324929])))

ValueError: Buffer dtype mismatch, expected 'float' but got 'double'

In [154]:
def ned(array_1, array_2):
    return (1-((2*np.sum(array_1 * array_2))/(np.sum(array_1**2) + np.sum(array_2**2))))*0.5

In [155]:
1-ned(np.array([-13,-42,42,420]), np.array([13,42,42,420]))

0.9892668950621054

In [156]:
population

array([[  15., -100.,   17.,   18.],
       [  11., -100.,   13.,   14.]], dtype=float32)

In [157]:
population2

array([[   7., -100.,    9.,   10.],
       [   1., -100.,    3.,    5.],
       [  25., -100.,   27.,   28.],
       [  21., -100.,   23.,   24.]], dtype=float32)

In [158]:
population_fitness

array([0.50198543, 0.66912055], dtype=float32)

In [159]:
population2_fitness

array([0.33444494, 0.16989033, 0.16740428, 0.3347596 ], dtype=float32)

In [160]:
array_1 = np.random.uniform(0, 1000, size=(3000)).astype(np.float32)
array_2 = np.random.uniform(0, 1000, size=(3000)).astype(np.float32)

In [161]:
timeit_result = %timeit -o cosine_similarity(array_1, array_2)

2.67 µs ± 8.54 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [37]:
cosine_similarity(array_1, array_2)

0.7446993768686008

In [None]:
ned(array_1, array_2)

0.12371255471209491

In [None]:
timeit_result = %timeit -o cosine_similarity_sklearn(array_1.reshape(1, -1), array_2.reshape(1, -1))

145 µs ± 1.62 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
import pyximport
pyximport.install(pyimport=True)

(None, None)

In [15]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp


import numpy as np
from cython.parallel import parallel, prange

cimport cython

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
cpdef slice_doc(
    float[:,::1] doc,
    int[:,::1] target,
    int windows, 
    int doc_size
):
    """Generate slice of doc."""
    # slices: List[Tuple[np.ndarray, np.ndarray]] = []
    # for index in range(0, doc_size - windows + 2):
    #     slices.append((doc[index : index + windows], target[index : index + windows]))
    # return slices



cpdef inline float perfomance_by_doc(
    float[:] individual,
    float[:,::1] doc,
    int[:] target,
    int doc_size,
    int individual_size,
    int unknown_id,

)nogil:
    """Performance of individual on doc."""

    #individual_data = individual[:individual_size].reshape(-1, 1)
    #entity_mask: np.ndarray = individual_data > 0
    #mask_unknown: np.ndarray = individual_data == unknown_id
    #union_doc: int = 0
    #intercep_doc: int = 0
    #retrive_doc: int = 0
    cdef float[:] individual_data = individual[:individual_size]
    cdef int[:] entity_mask = np.zeros(individual_size, dtype=int)
    for index in range(individual_size):
        if individual_data[index] > 0:
            entity_mask[index] = 1
    cdef int[:] mask_unknown = np.zeros(individual_size, dtype=int)
    for index in range(individual_size):
        if individual_data[index] == unknown_id:
            mask_unknown[index] = 1
    cdef int union_doc = 0
    cdef int intercep_doc = 0
    cdef int retrive_doc = 0

         #if individual_size > doc_size + 2:
         #    return 0.0
    if individual_size > doc_size + 2:
        return 0.0

         #for sliced_doc, sliced_targed in slice_doc(doc, target, individual_size, doc_size):
         #    macth_tokens: np.ndarray = (sliced_doc == np.abs(individual_data)) | (
         #        mask_unknown
         #    )
    for index in range(0, doc_size - individual_size + 2):
        cdef int[:] sliced_doc = doc[index : index + individual_size]
        cdef int[:] sliced_targed = target[index : index + individual_size]
        # cdef int[:] macth_tokens = np.zeros(individual_size, dtype=int)
        # for index2 in range(individual_size):
        #     if sliced_doc[index2] == np.abs(individual_data[index2]) or mask_unknown[index2] == 1:
        #         macth_tokens[index2] = 1
    #for i in range(0, doc_size - individual_size + 2):

         #    any_match = np_any_axis1(macth_tokens)
         #    total_match = np.sum(any_match)

         #    predict_span: np.ndarray = np.zeros(individual_size, dtype=np.bool8)
         #    if total_match == individual_size:
         #        # predict_span: np.ndarray = np_sum_axis1(macth_tokens * entity_mask)
         #        # np_sum_axis1(macth_tokens * entity_mask)
         #        for i in range(individual_size):
         #            predict_span[i] = np.sum(macth_tokens[i] * entity_mask[i])

         #    intercep_doc += (predict_span * sliced_targed.flatten()).sum()
         #    union_doc += (predict_span | sliced_targed.flatten()).sum()
         #    retrive_doc += predict_span.sum()
         #    if intercep_doc > retrive_doc:
         #        print(f"{intercep_doc} {retrive_doc}")

         #if union_doc == 0:
         #    return -1.0

         #if intercep_doc == 0:
         #    return 0.0

         #if intercep_doc / retrive_doc > 1:
         #    print(intercep_doc / retrive_doc, intercep_doc, retrive_doc)

         #return (intercep_doc / retrive_doc) * np.log2(intercep_doc)



# def fitness_by_individual(self, individual: np.ndarray) -> float:
#     """Fitness function.
#     Return fitness of individual.
#     F1 score
#     F(R) = frac{2*S_p*S_r,S_p + S_r)
#     """
#     entity_type = self.map_inv_entity[individual[2]]
#     perfomance_doc = np.zeros(len(self.data[entity_type]["input"]))
#     individual_size: int = int(individual[0])
#     indivual_rep = individual[3:]

#     for index_doc, doc in enumerate(self.data[entity_type]["input"]):
         #perfomance = self.perfomance_by_doc(
         #    indivual_rep,
         #    doc,
         #    self.data[entity_type]["target"][index_doc],
         #    int(self.data[entity_type]["meta"][index_doc, 0]),
         #    individual_size,
         #)
         #perfomance_doc[index_doc] = perfomance

#     # if perfomance_doc[perfomance_doc >= 0].mean() > 1:
#     #     print(perfomance_doc[perfomance_doc >= 0].mean())
#     return perfomance_doc[perfomance_doc >= 0].mean()

# def fitness(self, population: np.ndarray) -> np.ndarray:
#     """Fitness function.
#     Return fitness of population.
#     F1 score
#     F(R) = frac{2*S_p*S_r,S_p + S_r)
#     """
#     return np.array(
         #[self.fitness_by_individual(individual) for individual in population],
         #dtype=np.float32,
#     )


Error compiling Cython file:
------------------------------------------------------------
...
         #for sliced_doc, sliced_targed in slice_doc(doc, target, individual_size, doc_size):
         #    macth_tokens: np.ndarray = (sliced_doc == np.abs(individual_data)) | (
         #        mask_unknown
         #    )
    for index in range(0, doc_size - individual_size + 2):
        cdef int[:] sliced_doc = doc[index : index + individual_size]
            ^
------------------------------------------------------------

/home/jraba/.cache/ipython/cython/_cython_magic_6eab27922a6593776746341fc97fa021.pyx:64:13: cdef statement not allowed here
