In [1]:
import pandas as pd 

In [2]:
# Load the pre-processed test split; the fastparquet engine is requested explicitly.
df = pd.read_parquet("../files/processed/test.parquet",
    engine="fastparquet")


In [3]:
# City reference table; later cells read its `id` and `fourier_features` columns.
cities = pd.read_json("../files/processed/final_datasets/cities_with_fourier.json")

In [4]:
# Number of cities whose feature vector is missing.
int(cities["fourier_features"].isna().sum())

0

In [5]:
# Candidate-side view of the city table, renamed so it can be joined onto `df`.
cities_cand = cities[["id", "fourier_features"]].rename(
    columns={
        "id": "candidate_city_id",
        "fourier_features": "candidate_fourier_features",
    }
)

In [6]:
# Inner join (the pandas default, spelled out here): rows whose
# candidate_city_id has no match in `cities_cand` are dropped.
df = df.merge(cities_cand, how="inner", on="candidate_city_id")

In [7]:
# Rows still lacking a candidate feature vector after the merge.
df.loc[df["candidate_fourier_features"].isna()]

Unnamed: 0,candidate_id,vacant_id,t_apply,stage_max,publish_date,label,scenario,vacant_city_loc,vacant_full_text,vacant_city_ids,vacant_remote,candidate_full_text,candidate_city_loc,candidate_city_id,candidate_fourier_features


In [3]:
import numpy as np
from ast import literal_eval

def _to_array32(x):
    # Parse "[0.1, 0.2, ...]" or Python lists into float32 arrays
    if isinstance(x, str):
        x = literal_eval(x)
    return np.asarray(x, dtype=np.float32) if x is not None else None

def _cosine_distance(a, b):
    # Same semantics as before, but no prints
    if a is None or b is None:
        return np.inf
    an = np.linalg.norm(a)
    bn = np.linalg.norm(b)
    if an == 0 or bn == 0:
        return np.inf
    return 1.0 - float(np.dot(a, b) / (an * bn))

def _to_list_of_str(x):
    if isinstance(x, str):
        try:
            x = literal_eval(x)
        except Exception:
            x = [x]
    return [str(i) for i in (x or [])]

def select_vacancy_city_by_features(df, cities,
                                    id_col="id",
                                    city_feat_col="fourier_features",
                                    prefer_exact=True):
    """
    Pick one vacancy city per row by comparing city feature vectors.

    Every ID in a row's ``vacant_city_ids`` is scored with the cosine
    distance between its feature vector and the feature vector of the row's
    ``candidate_city_id``. Both vectors are looked up in ``cities``.

    Selection priority within a row:
      1. an ID equal to the candidate's city ID (only if ``prefer_exact``),
      2. the smallest distance,
      3. the earliest position in the row's list.

    Parameters
    ----------
    df : DataFrame
        Must contain ``candidate_city_id`` and ``vacant_city_ids``.
    cities : DataFrame
        Must contain ``id_col`` and ``city_feat_col``.
    id_col, city_feat_col : str
        Names of the ID and feature-vector columns in ``cities``.
    prefer_exact : bool
        Whether an ID identical to the candidate's city wins outright.

    Returns
    -------
    DataFrame
        A copy of ``df`` with ``candidate_city_id`` cast to str,
        ``vacant_city_ids`` normalised to lists of str, and three added
        columns: ``selected_city_id`` (None when the row has no options),
        ``selected_distance`` (float32; inf when no distance could be
        computed) and ``exact_match`` (bool).
    """
    result = df.copy()

    # Lookup table: stringified city ID -> float32 feature vector (or None).
    city_keys = cities[id_col].astype(str).values
    city_vectors = cities[city_feat_col].map(_to_array32).values
    vector_by_city = dict(zip(city_keys, city_vectors))

    # Bring both ID columns to a common string representation.
    result["candidate_city_id"] = result["candidate_city_id"].astype(str)
    result["vacant_city_ids"] = result["vacant_city_ids"].map(_to_list_of_str)

    total = len(result)
    chosen_ids = np.empty(total, dtype=object)
    chosen_dists = np.empty(total, dtype=np.float32)
    chosen_exact = np.zeros(total, dtype=bool)  # stays False unless set below

    row_pairs = zip(result["candidate_city_id"].to_numpy(),
                    result["vacant_city_ids"].to_numpy())

    for row, (candidate_id, options) in enumerate(row_pairs):
        if not options:
            # Nothing to choose from; exact flag keeps its initial False.
            chosen_ids[row] = None
            chosen_dists[row] = np.inf
            continue

        candidate_vec = vector_by_city.get(candidate_id, None)
        winner, winner_dist, winner_exact = None, np.inf, False

        # Walk the options in their given order so that ties fall back to
        # the first one seen.
        for option in options:
            dist = _cosine_distance(candidate_vec, vector_by_city.get(option, None))
            exact = prefer_exact and (option == candidate_id)

            if exact and not winner_exact:
                # First exact match always takes over.
                takes_over = True
            else:
                # Same exactness class: only a strictly smaller distance wins.
                takes_over = exact == winner_exact and dist < winner_dist

            if takes_over:
                winner, winner_dist, winner_exact = option, dist, exact

        if winner is None:
            # No option produced a finite distance: default to the first one.
            winner = str(options[0])
            winner_dist = np.inf
            winner_exact = prefer_exact and (winner == candidate_id)

        chosen_ids[row] = winner
        chosen_dists[row] = winner_dist
        chosen_exact[row] = winner_exact

    result["selected_city_id"] = chosen_ids
    result["selected_distance"] = chosen_dists
    result["exact_match"] = chosen_exact

    return result


In [9]:
from ast import literal_eval

def _to_list_of_str(x):
    if isinstance(x, str):
        try:
            x = literal_eval(x)
        except Exception:
            x = [x]
    return [str(i) for i in (x or [])]

# City IDs known to the reference table, as strings.
valid_ids = set(cities["id"].astype(str).tolist())

# Normalise each row's options to a list of str, then keep only known cities.
df = df.assign(
    vacant_city_ids=df["vacant_city_ids"]
    .map(_to_list_of_str)
    .map(lambda ids: [city_id for city_id in ids if city_id in valid_ids])
)

# Flag rows whose option list became empty after filtering.
df = df.assign(no_valid_vacant_city_ids=df["vacant_city_ids"].map(len).eq(0))
print("Rows with no valid options after filtering:",
      int(df["no_valid_vacant_city_ids"].sum()))



Rows with no valid options after filtering: 169


In [10]:
# First line: rows of `df` without candidate features; second: cities without features.
print("count na", int(df["candidate_fourier_features"].isna().sum()))
print("count na", int(cities["fourier_features"].isna().sum()))

count na 0
count na 0


In [11]:
from itertools import chain
# Every remaining option must refer to a city present in `valid_ids`.
all_opts = set()
all_opts.update(chain.from_iterable(df["vacant_city_ids"]))
missing = sorted(all_opts.difference(valid_ids))
assert len(missing) == 0, (
    f"Unknown city IDs: {missing}"
    if len(missing) <= 10
    else f"Unknown city IDs: {missing[:10]} (+{len(missing)-10} more)"
)



In [12]:
# Keep only rows that still have at least one valid vacancy city.
df = df.loc[~df["no_valid_vacant_city_ids"]]

In [13]:
# Choose one vacancy city per row (default column names and prefer_exact=True).
df_out = select_vacancy_city_by_features(df=df, cities=cities)


In [14]:
# List the columns available in the city reference table.
cities.columns

Index(['id', 'code', 'name', 'department_id', 'created_at', 'updated_at',
       'location', 'slug', 'deleted_at', 'department_name', 'country_id',
       'lat', 'lon', 'fourier_features'],
      dtype='object')

In [15]:
# Vacancy-side view of the city table, keyed by a string `selected_city_id`
# so it matches the string IDs stored in `df_out`.
cities_vac = (
    cities[["id", "fourier_features"]]
    .rename(columns={"id": "selected_city_id", "fourier_features": "vacant_fourier_feature"})
    .assign(selected_city_id=lambda frame: frame["selected_city_id"].astype(str))
)

In [16]:
# Inner join (the pandas default, spelled out here): rows whose
# selected_city_id has no match in `cities_vac` are dropped.
df_out = df_out.merge(cities_vac, how="inner", on="selected_city_id")

In [17]:
# Row count of the final test dataset.
df_out.shape[0]

42734

In [18]:
# Persist the enriched test split under final_datasets/, a different path from the input file.
df_out.to_parquet("../files/processed/final_datasets/test.parquet")

In [4]:
from itertools import chain

import pandas as pd

# Validation split pipeline: load, attach candidate features, filter the
# vacancy city options, select one city per row, attach its features, save.
# Requires `_to_list_of_str` and `select_vacancy_city_by_features` to be
# defined already in the kernel.

# --- Load -------------------------------------------------------------------
df = pd.read_parquet("../files/processed/val.parquet",
    engine="fastparquet")
cities = pd.read_json("../files/processed/final_datasets/cities_with_fourier.json")

# --- Attach candidate-city features ------------------------------------------
cities_cand = cities[["id", "fourier_features"]].rename(
    columns={"id": "candidate_city_id", "fourier_features": "candidate_fourier_features"}
)
# Default inner join: rows whose candidate_city_id is not in `cities` are dropped.
df = df.merge(cities_cand, on="candidate_city_id")

# --- Keep only vacancy city IDs that exist in the city table -----------------
valid_ids = set(cities["id"].astype(str))

df = df.copy()
df["vacant_city_ids"] = df["vacant_city_ids"].map(_to_list_of_str)
df["vacant_city_ids"] = df["vacant_city_ids"].map(lambda ids: [i for i in ids if i in valid_ids])

# Optional: quick visibility into rows that lost all options
df["no_valid_vacant_city_ids"] = df["vacant_city_ids"].map(len).eq(0)
print("Rows with no valid options after filtering:",
      int(df["no_valid_vacant_city_ids"].sum()))

# First line: rows of `df` without candidate features; second: cities without features.
print("count na", len(df[df["candidate_fourier_features"].isna()]))
print("count na", len(cities[cities["fourier_features"].isna()]))

# Sanity check: every remaining option must be a known city ID.
all_opts = set(chain.from_iterable(df["vacant_city_ids"]))
missing = sorted(all_opts - valid_ids)
assert not missing, f"Unknown city IDs: {missing[:10]} (+{len(missing)-10} more)" if len(missing) > 10 else f"Unknown city IDs: {missing}"

# --- Select one vacancy city per row ------------------------------------------
df = df[~df["no_valid_vacant_city_ids"]]
df_out = select_vacancy_city_by_features(df, cities)

# --- Attach the selected city's features and save -----------------------------
cities_vac = cities[['id', 'fourier_features']].rename(columns={"id":"selected_city_id", "fourier_features":"vacant_fourier_feature"})
# selected_city_id in df_out holds strings, so the join key must be str too.
cities_vac["selected_city_id"] = cities_vac["selected_city_id"].astype(str)
df_out = df_out.merge(cities_vac, on="selected_city_id")
df_out.to_parquet("../files/processed/final_datasets/val.parquet")

Rows with no valid options after filtering: 4137
count na 0
count na 0


In [5]:
# Row count of the final validation dataset.
print(df_out.shape[0])

834855
