In [38]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# Составление датасета

Делаем таблицу с полями

categorial: [cold_sensitivity, user_style | item_type, item_subtype, item_style, item_color]

numeric: [item_warmth, is_waterproof, is_windproof, has_hood | weather_warmth, is_rain, is_snow, is_windy]

таргетом будет понравилась пользователю вещь для такой погоды или нет

In [39]:
cold_sensitivity_values = ["low", "medium", "high"]

styles = ["casual", "sport", "official", "street", "party", "outdoor"]

item_types = ["outerwear", "top", "bottom"]

top_subtypes = [
    # Onepiece (если считаются топами)
    "dress",
    "jumpsuit",

    # Layered tops
    "blazer",
    "zip_hoodie",

    # Transformable tops
    "hoodie",
    "longsleeve",
    "cardigan",
    "shirt",

    # Final layer tops
    "turtleneck",
    "bodysuit",

    # Base tops
    "tshirt",
    "tank_top"
]

bottom_subtypes = [
    "jeans",
    "trousers",
    "sweatpants",
    "shorts",
    "skirt",
    "joggers"
]

outerwear_subtypes = [
    "coat",
    "jacket",
    "bomber",
    "trench",
    "puffer",
    "fur_coat",
    "sheepskin_coat",
    "windbreaker",
    "jeans_jacket",
    "biker_jacket",
]

colors = ["black", "white", "grey", "beige", "brown",
          "blue", "navy", "green", "red", "yellow", "pink",
          "purple", "orange", "khaki", "multicolor"]

warmth_levels = ["light", "mid", "warm", "very_warm"]

warmth_to_num = {
    "light": 1,
    "mid": 2,
    "warm": 3,
    "very_warm": 4,
}

weather_warmth_levels = ["very_cold", "cold", "mild", "hot"]
weather_warmth_to_num = {
    "very_cold": 4,
    "cold": 3,
    "mild": 2,
    "hot": 1,
}

waterproof_items = {
    "coat": 0.3,              # пальто — немного защищает, но не особо
    "jacket": 0.6,            # обычная куртка — средняя влагозащита
    "bomber": 0.2,            # бомбер почти не защищает
    "trench": 0.7,            # тренч — хорошо от дождя
    "puffer": 0.5,            # пуховик держит влагу, но намокает
    "fur_coat": 0.0,          # шуба — почти нет влагозащиты
    "sheepskin_coat": 0.1,    # дублёнка — слабая влагозащита
    "windbreaker": 1.0,       # ветровка — максимум waterproof
    "jeans_jacket": 0.0,     # джинсовка — не держит воду
    "biker_jacket": 0.1,      # косуха — кожа держит капли, но не дождь
}

windproof_items = {
    "coat": 0.6,              # пальто — неплохо держит ветер
    "jacket": 0.7,            # куртка — типично хорошо защищает
    "bomber": 0.5,            # бомбер держит ветер средне
    "trench": 0.5,            # тренч — средняя защита
    "puffer": 0.9,            # пуховик — отличная защита от ветра
    "fur_coat": 0.8,          # шуба — высокая защита
    "sheepskin_coat": 0.8,    # дублёнка — отличная защита
    "windbreaker": 1.0,      # ветровка — максимальная защита
    "jeans_jacket": 0.3,      # джинсовка — слабая защита
    "biker_jacket": 0.5,      # косуха — кожа хорошо держит ветер
}

In [40]:
allowed_styles = {
    # ONEPIECE
    "dress": ["casual", "official", "party"],
    "jumpsuit": ["casual", "street", "party"],

    # LAYERED TOPS
    "blazer": ["official", "party"],
    "zip_hoodie": ["casual", "sport", "street", "outdoor"],

    # TRANSFORMABLE TOPS
    "hoodie": ["casual", "sport", "street", "outdoor"],
    "longsleeve": ["casual", "street", "sport"],
    "cardigan": ["casual", "official"],
    "shirt": ["casual", "official", "party"],

    # FINAL LAYER TOPS
    "turtleneck": ["casual", "official"],
    "bodysuit": ["casual", "party"],

    # BASE TOPS
    "tshirt": ["casual", "sport", "street", "party"],
    "tank_top": ["casual", "sport", "street", "party"],

    # BOTTOMS
    "jeans": ["casual", "street", "party"],
    "trousers": ["official", "casual", "party"],
    "sweatpants": ["sport", "street", "outdoor"],
    "shorts": ["casual", "sport", "street", "outdoor"],
    "skirt": ["casual", "official", "party"],
    "joggers": ["casual", "sport", "street"],

    # OUTERWEAR
    "coat": ["official", "casual"],
    "jacket": ["casual", "street", "outdoor"],
    "bomber": ["casual", "street", "party"],
    "trench": ["official", "casual"],
    "puffer": ["casual", "street", "outdoor"],
    "fur_coat": ["official", "party"],
    "sheepskin_coat": ["casual", "street"],
    "windbreaker": ["sport", "outdoor", "street"],
    "jeans_jacket": ["casual", "street"],
    "biker_jacket": ["casual", "street", "party"],
}

warmth_rules = {
    # ONEPIECE
    "dress": ["light", "mid"],
    "jumpsuit": ["light", "mid"],

    # LAYERED TOPS
    "blazer": ["light", "mid"],
    "zip_hoodie": ["mid", "warm"],

    # TRANSFORMABLE TOPS
    "hoodie": ["mid", "warm"],
    "longsleeve": ["light", "mid"],
    "cardigan": ["light", "mid"],
    "shirt": ["light"],

    # FINAL LAYER TOPS
    "turtleneck": ["mid"],
    "bodysuit": ["light"],

    # BASE TOPS
    "tshirt": ["light"],
    "tank_top": ["light"],

    # BOTTOMS
    "jeans": ["mid"],                  
    "trousers": ["light", "mid"],
    "sweatpants": ["mid", "warm"],
    "shorts": ["light"],
    "skirt": ["light", "mid"],
    "joggers": ["mid"],

    # OUTERWEAR
    "coat": ["warm", "very_warm"],
    "jacket": ["mid", "warm"],
    "bomber": ["mid", "warm"],
    "trench": ["light", "mid"],
    "puffer": ["warm", "very_warm"],
    "fur_coat": ["warm", "very_warm"],
    "sheepskin_coat": ["warm", "very_warm"],
    "windbreaker": ["light", "mid"],
    "jeans_jacket": ["mid"],
    "biker_jacket": ["mid"],
}

similar_styles = {
    "casual":     {"casual", "official", "street", "sport"},
    "sport":      {"sport", "casual", "street"},
    "street":     {"street", "casual", "sport", "party"},
    "official":   {"official", "casual"},
    "party":      {"party", "official", "street", "casual"},
    "outdoor":    {"outdoor", "sport", "street"},
}

preferred_colors_by_style = {
    "casual": [
    "black", "white", "grey", "beige", "brown",
    "navy", "blue",
    "green", "khaki",
    "red", "orange"],
    
    "sport": [
    "red", "yellow", "green", "blue", "black",
    "orange", "pink", "purple",
    "multicolor"],
    
    "official": [
    "white", "black", "grey", "navy", "beige",
    "brown", "blue", "khaki"],
    
    "street": [
    "black", "white", "grey",
    "red", "green", "blue", "navy",
    "pink", "purple", "yellow", "orange",
    "khaki", "brown",
    "multicolor"],
    
    "party": [
    "red", "yellow", "pink", "purple",
    "orange", "multicolor",
    "black"],
    
    "outdoor": [
    "brown", "green", "khaki",
    "beige", "navy", "black",
    "orange"]
}

bright_colors = [
    "red", "yellow", "pink", "purple", "green",
    "orange", "multicolor"
]
neutral_colors = [
    "black", "white", "grey", "beige", "brown",
    "blue", "navy", "khaki"
]

color_weights_by_style = {
    "casual":  {"bright": 0.25, "neutral": 0.75},
    "sport":   {"bright": 0.5, "neutral": 0.5},
    "official": {"bright": 0.1, "neutral": 0.9},
    "party":   {"bright": 0.5, "neutral": 0.5},
    "street":  {"bright": 0.5, "neutral": 0.5},
    "outdoor": {"bright": 0.4, "neutral": 0.6},
}

In [41]:
def choose_color_for_style(item_style: str) -> str:
    weights = color_weights_by_style[item_style]

    group = random.choices(
        population=["bright", "neutral"],
        weights=[weights["bright"], weights["neutral"]],
        k=1
    )[0]

    if group == "bright":
        palette = bright_colors
    else:
        palette = neutral_colors

    return random.choice(palette)

In [42]:
def generate_one_row(row_id: int) -> dict:
    cold_sens = random.choice(cold_sensitivity_values)
    user_style = random.choice(styles)

    # выбор типа одежды
    item_type = random.choices(
        population=["outerwear", "top", "bottom"],
        weights=[0.2, 0.4, 0.4],
        k=1,
    )[0]

    if item_type == "top":
        item_subtype = random.choice(top_subtypes)
    elif item_type == "bottom":
        item_subtype = random.choice(bottom_subtypes)
    else:
        item_subtype = random.choice(outerwear_subtypes)

    # выбор настроек одежды
    valid_styles = allowed_styles[item_subtype]
    item_style = random.choice(valid_styles)
    item_color = choose_color_for_style(item_style)

    valid_warmths = warmth_rules[item_subtype]
    item_warmth_level = random.choice(valid_warmths)
    item_warmth_num = warmth_to_num[item_warmth_level]

    is_waterproof = int(random.random() < waterproof_items.get(item_subtype, 0))
    is_windproof = int(random.random() < windproof_items.get(item_subtype, 0))

    # выбор погоды
    weather_warmth = random.choices(weather_warmth_levels, weights=[0.25, 0.25, 0.3, 0.2], k=1)[0]
    weather_coldness_num = weather_warmth_to_num[weather_warmth]
    # здесь колдесс потому что шкала в обратную сторону (чем холоднее тем больше)

    # логика осадков и ветра
    if weather_coldness_num >= 3:  # холодно
        is_snow = int(random.random() < 0.3)
        is_rain = 0 if is_snow else int(random.random() < 0.3)
    else:
        is_rain = int(random.random() < 0.3)
        is_snow = 0

    is_windy = int(random.random() < 0.4)

    # считаем скор, который превращается в liked
    score = 0.0

    # стиль вещи совпадает с любимым стилем пользователя или похожим
    if item_style == user_style:
        score += 1.8         # полное совпадение — супер
    elif item_style in similar_styles[user_style]:
        score += 0.8         # стиль близкий → небольшой бонус
    else:
        score -= 1.0         # совсем не подходит → штраф

    # цвет в любимой палитре user_style
    if item_color in preferred_colors_by_style[user_style]:
        score += 0.6
    else:
        score -= 0.3

    # одежда и погода по теплоте (оба в шкале 1..4)
    diff = item_warmth_num - weather_coldness_num

    # идеально, если diff == 0
    if diff == 0:
        score += 0.7
    # чуть теплее/чуть холоднее
    elif abs(diff) == 1:
        score += 0.5
    # одежда слишком холодная для погоды
    elif diff <= -2:
        score -= 2.5
    # одежда слишком тёплая для погоды
    elif diff >= 2:
        score -= 2.0
    # отдельный штраф за верхнюю одежду в жару
    if item_type == "outerwear" and weather_coldness_num == 1:
        score -= 2.0

    # мерзлявость пользователя
    if cold_sens == "high":
        if diff >= 0:
            score += 0.5
        if diff <= -1:            # холодновато, мерзляк страдает
            score -= 0.7
    elif cold_sens == "low":
        if diff >= 1:             # слишком тепло для немёрзнущего
            score -= 0.5

    # дождь/снег и защищённость
    if is_rain or is_snow:
        if is_waterproof:
            score += 1.0
        else:
            score -= 1.0
        if (is_windy and is_windproof):
            score += 0.5
        else:
            score -= 0.5

    liked = int(score > 0)

    return {
        "id": row_id,
        "cold_sensitivity": cold_sens,
        "user_style": user_style,
        "item_type": item_type,
        "item_subtype": item_subtype,
        "item_style": item_style,
        "item_color": item_color,
        "item_warmth": item_warmth_num,
        "is_waterproof": is_waterproof,
        "is_windproof": is_windproof,
        "weather_coldness": weather_coldness_num,
        "is_rain": is_rain,
        "is_snow": is_snow,
        "is_windy": is_windy,
        "liked": liked,
    }


def generate_dataset(n_rows: int = 1500) -> pd.DataFrame:
    rows = [generate_one_row(i) for i in range(n_rows)]
    return pd.DataFrame(rows)

In [43]:
df = generate_dataset(20000)
df

Unnamed: 0,id,cold_sensitivity,user_style,item_type,item_subtype,item_style,item_color,item_warmth,is_waterproof,is_windproof,weather_coldness,is_rain,is_snow,is_windy,liked
0,0,high,outdoor,bottom,jeans,casual,navy,2,0,0,3,0,0,0,0
1,1,low,street,top,turtleneck,casual,blue,2,0,0,4,0,0,1,0
2,2,low,sport,outerwear,windbreaker,sport,red,2,1,1,2,0,0,0,1
3,3,low,sport,outerwear,coat,casual,navy,4,0,0,1,0,0,1,0
4,4,low,official,outerwear,bomber,casual,grey,3,0,1,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19995,high,casual,top,tank_top,street,red,1,0,0,3,1,0,1,0
19996,19996,medium,street,outerwear,trench,official,black,1,1,0,2,0,0,1,1
19997,19997,high,casual,bottom,joggers,casual,blue,2,0,0,2,0,0,1,1
19998,19998,medium,casual,bottom,jeans,casual,black,2,0,0,2,0,0,0,1


In [45]:
df["liked"].value_counts()

liked
0    11359
1     8641
Name: count, dtype: int64

In [46]:
df.to_csv('items_data.csv')

# Подготовка

In [47]:
X = df.drop(columns=["liked", "id"])
y = df["liked"]

In [48]:
categorical_features = [
    "cold_sensitivity",
    "user_style",
    "item_type",
    "item_subtype",
    "item_style",
    "item_color",
]

binary_features = [
    "is_waterproof",
    "is_windproof",
    "is_rain",
    "is_snow",
    "is_windy",
]

numeric_features = [
    "item_warmth",
    "weather_coldness",
]

# Создание модели

In [49]:
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("bin", "passthrough", binary_features),
        ("num", "passthrough", numeric_features),
    ]
)

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    random_state=42,
)

clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", model),
    ]
)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('bin', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,12
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2840
           1       0.89      0.87      0.88      2160

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

ROC-AUC: 0.970868544600939


In [52]:
joblib.dump(clf, "items_recommender.pkl")

['items_recommender.pkl']

# Пример использования

In [53]:
import joblib
import pandas as pd

clf = joblib.load("items_recommender.pkl")

def score_item(features: dict) -> float:
    """
    features — словарь с теми же ключами, что и колонки X:
    cold_sensitivity, user_style, item_type, item_subtype
    """
    df = pd.DataFrame([features])
    proba = clf.predict_proba(df)[0, 1]
    return float(proba)

## Примеры

In [59]:
features = {"cold_sensitivity": "medium",
        "user_style": "official",
        "item_type": "bottom",
        "item_subtype": "sweatpants",
        "item_style": "street",
        "item_color": "yellow",
        "item_warmth": 2,
        "is_waterproof": 0,
        "is_windproof": 0,
        "weather_coldness": 2,
        "is_rain": 0,
        "is_snow": 0,
        "is_windy": 0}

score_item(features)

0.5195024797418184

In [36]:
features = {"cold_sensitivity": "low",
        "user_style": "official",
        "item_type": "top",
        "item_subtype": "hoodie",
        "item_style": "street",
        "item_color": "red",
        "item_warmth": 3,
        "is_waterproof": 0,
        "is_windproof": 0,
        "weather_coldness": 2,
        "is_rain": 0,
        "is_snow": 0,
        "is_windy": 0}

score_item(features)

0.46233590362501364

In [37]:
features = {"cold_sensitivity": "medium",
        "user_style": "official",
        "item_type": "outerwear",
        "item_subtype": "jacket",
        "item_style": "outdoor",
        "item_color": "yellow",
        "item_warmth": 2,
        "is_waterproof": 0,
        "is_windproof": 0,
        "weather_coldness": 2,
        "is_rain": 0,
        "is_snow": 0,
        "is_windy": 0}

score_item(features)

0.41457661298289544