In [2]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# Составление датасета

Делаем таблицу с полями

categorical: [cold_sensitivity, user_style | item_type, item_subtype, item_style, item_color]

numeric: [item_warmth, is_waterproof, is_windproof, has_hood | weather_coldness, is_rain, is_snow, is_windy]

Таргетом (`liked`) будет то, понравилась ли пользователю вещь для такой погоды.

In [19]:
# --- User attributes ---------------------------------------------------------
cold_sensitivity_values = ["low", "medium", "high"]
styles = ["casual", "sport", "classic", "street", "business", "party", "outdoor"]

# --- Item taxonomy: item type and the subtypes available for each type -------
item_types = ["outerwear", "top", "bottom"]
top_subtypes = ["hoodie", "sweatshirt", "blazer", "shirt", "tshirt"]
bottom_subtypes = ["jeans", "trousers", "sweatpants"]
outerwear_subtypes = ["coat", "jacket"]

colors = [
    "black", "white", "grey", "beige", "brown", "blue",
    "navy", "green", "red", "yellow", "pink", "purple",
]

# --- Item warmth: listed from lightest to warmest, encoded as 1..4 -----------
warmth_levels = ["light", "mid", "warm", "very_warm"]
warmth_to_num = {level: rank for rank, level in enumerate(warmth_levels, start=1)}

# --- Weather: listed from coldest to hottest ---------------------------------
# The numeric scale is inverted relative to the list order: the colder the
# weather, the larger the number (very_cold -> 4 ... hot -> 1).
weather_warmth_levels = ["very_cold", "cold", "mild", "hot"]
weather_warmth_to_num = {
    level: len(weather_warmth_levels) - position
    for position, level in enumerate(weather_warmth_levels)
}

# --- Per-subtype probability that an item has a protective property ----------
# Subtypes that are not listed are looked up with a default of 0.
waterproof_items = dict(jacket=0.7, coat=0.5, hoodie=0.1)
windproof_items = dict(jacket=0.7, coat=0.5, hoodie=0.1)
hood_items = dict(jacket=0.4, coat=0.4, hoodie=1.0)

In [20]:
# Styles an item of a given subtype may have.
allowed_styles = {
    # tops
    "tshirt": ["casual", "sport", "street", "party"],
    "shirt": ["casual", "classic", "business", "party"],
    "hoodie": ["casual", "sport", "street"],
    "sweatshirt": ["casual", "sport", "street"],
    "blazer": ["classic", "business", "party"],

    # bottoms
    "jeans": ["casual", "street", "party"],
    "trousers": ["classic", "business", "casual", "party"],
    "sweatpants": ["sport", "street", "outdoor"],

    # outerwear
    "coat": ["classic", "business", "casual"],
    "jacket": ["casual", "sport", "street", "outdoor"],
}

# Warmth levels an item of a given subtype may have.
warmth_rules = {
    # tops
    "tshirt": ["light"],
    "shirt": ["light"],
    "hoodie": ["mid", "warm"],
    "sweatshirt": ["mid", "warm"],
    "blazer": ["mid"],

    # bottoms
    "jeans": ["mid"],
    # NOTE: "shorts" has a warmth rule but is listed neither in bottom_subtypes
    # nor in allowed_styles, so it is never generated at the moment.
    "shorts": ["light"],  # the comma after this entry was missing (SyntaxError)
    "trousers": ["light", "mid"],
    "sweatpants": ["mid", "warm"],

    # outerwear
    "jacket": ["warm"],
    "coat": ["warm", "very_warm"],
}

# For each user style: the item styles considered "close enough" to it.
# Every set also contains the style itself.
similar_styles = {
    "casual":     {"casual", "business", "street", "sport", "classic"},
    "sport":      {"sport", "casual", "street"},
    "classic":    {"classic", "business", "casual"},
    "street":     {"street", "casual", "sport", "party"},
    "business":   {"business", "classic", "casual"},
    "party":      {"party", "street", "casual"},
    "outdoor":    {"outdoor", "sport", "street"},
}

# Colours a user with a given style prefers.
preferred_colors_by_style = {
    "casual":  ["grey", "beige", "navy", "blue", "black"],
    "sport":   ["red", "blue", "yellow", "green", "black"],
    "classic": ["white", "black", "grey", "navy", "beige"],
    "street":  ["black", "red", "green", "purple", "blue", "pink"],
    "business":["white", "grey", "navy", "black", "beige"],
    "party":   ["red", "yellow", "pink", "purple"],
    "outdoor": ["brown", "green", "beige", "navy"]
}

# Two disjoint colour groups that item colours are sampled from.
bright_colors = ["red", "yellow", "pink", "purple", "green"]
neutral_colors = ["black", "white", "grey", "beige", "brown", "navy", "blue"]


# Relative weights of the two colour groups for an item of a given style.
# They are passed to random.choices as weights, so they do not have to sum
# to 1 (the "casual" pair sums to 1.1 and is kept as in the original data).
color_weights_by_style = {
    "casual":  {"bright": 0.35, "neutral": 0.75},
    "sport":   {"bright": 0.5, "neutral": 0.5},
    "classic": {"bright": 0.2, "neutral": 0.8},
    "business":{"bright": 0.2, "neutral": 0.8},
    "party":   {"bright": 0.7, "neutral": 0.3},
    "street":  {"bright": 0.5, "neutral": 0.5},
    "outdoor": {"bright": 0.4, "neutral": 0.6},
}

In [21]:
def choose_color_for_style(item_style: str) -> str:
    """Draw a random colour for an item of the given style.

    The colour group ("bright" or "neutral") is sampled first, using the
    relative weights stored for the style in ``color_weights_by_style``; the
    colour itself is then picked uniformly from that group's palette.

    Args:
        item_style: Style of the item; must be a key of ``color_weights_by_style``.

    Returns:
        A colour name taken from ``bright_colors`` or ``neutral_colors``.

    Raises:
        KeyError: If ``item_style`` has no entry in ``color_weights_by_style``.
    """
    style_weights = color_weights_by_style[item_style]
    groups = ["bright", "neutral"]

    # First draw: which colour group the item belongs to.
    (picked_group,) = random.choices(
        groups,
        weights=[style_weights[name] for name in groups],
        k=1,
    )

    # Second draw: a concrete colour from that group's palette.
    palette = bright_colors if picked_group == "bright" else neutral_colors
    return random.choice(palette)

In [22]:
def generate_one_row(row_id: int) -> dict:
    """Generate one synthetic (user, item, weather) sample with a rule-based label.

    User traits, item attributes and weather are drawn with the module-level
    ``random`` generator. A heuristic score is then accumulated from style,
    colour, warmth, cold-sensitivity and precipitation rules, and the sample
    is labelled ``liked = 1`` when that score is strictly positive.

    Args:
        row_id: Value stored in the ``"id"`` field of the returned row.

    Returns:
        A flat dict with the keys ``id``, ``cold_sensitivity``, ``user_style``,
        ``item_type``, ``item_subtype``, ``item_style``, ``item_color``,
        ``item_warmth``, ``is_waterproof``, ``is_windproof``, ``has_hood``,
        ``weather_coldness``, ``is_rain``, ``is_snow``, ``is_windy`` and ``liked``.
    """
    cold_sens = random.choice(cold_sensitivity_values)
    user_style = random.choice(styles)

    # pick the clothing type
    item_type = random.choices(
        population=["outerwear", "top", "bottom"],
        weights=[0.2, 0.4, 0.4],
        k=1,
    )[0]

    if item_type == "top":
        item_subtype = random.choice(top_subtypes)
    elif item_type == "bottom":
        item_subtype = random.choice(bottom_subtypes)
    else:
        item_subtype = random.choice(outerwear_subtypes)

    # pick the item's attributes
    valid_styles = allowed_styles[item_subtype]
    item_style = random.choice(valid_styles)
    item_color = choose_color_for_style(item_style)

    valid_warmths = warmth_rules[item_subtype]
    item_warmth_level = random.choice(valid_warmths)
    item_warmth_num = warmth_to_num[item_warmth_level]

    # Each flag is 1 with the subtype's probability; unlisted subtypes get 0.
    is_waterproof = int(random.random() < waterproof_items.get(item_subtype, 0))
    is_windproof = int(random.random() < windproof_items.get(item_subtype, 0))
    has_hood = int(random.random() < hood_items.get(item_subtype, 0))

    # pick the weather
    weather_warmth = random.choices(weather_warmth_levels, weights=[0.25, 0.25, 0.3, 0.2], k=1)[0]
    weather_coldness_num = weather_warmth_to_num[weather_warmth]
    # named "coldness" because the scale is inverted (the colder, the larger the number)

    # precipitation and wind logic
    if weather_coldness_num >= 3:  # cold
        is_snow = int(random.random() < 0.3)
        # rain is only drawn when it is not snowing, so the two never co-occur
        is_rain = 0 if is_snow else int(random.random() < 0.3)
    else:
        is_rain = int(random.random() < 0.3)
        is_snow = 0

    is_windy = int(random.random() < 0.4)

    # compute the score that is turned into `liked`
    score = 0.0

    # item style matches the user's favourite style or a similar one
    if item_style == user_style:
        score += 1.2         # exact match — great
    elif item_style in similar_styles[user_style]:
        score += 0.8         # close style → small bonus
    else:
        score -= 1.0         # does not fit at all → penalty

    # colour is in the preferred palette of user_style
    if item_color in preferred_colors_by_style[user_style]:
        score += 0.6
    else:
        score -= 0.3

    # item warmth vs. weather (both on a 1..4 scale)
    diff = item_warmth_num - weather_coldness_num

    # ideal when diff == 0
    if diff == 0:
        score += 0.7
    # slightly too warm / slightly too cold
    elif abs(diff) == 1:
        score += 0.5
    # item is too cold for the weather
    elif diff <= -2:
        score -= 2.5
    # item is too warm for the weather
    elif diff >= 2:
        score -= 2.0
    # extra penalty for outerwear in hot weather
    if item_type == "outerwear" and weather_coldness_num == 1:
        score -= 2.0

    # user's cold sensitivity
    if cold_sens == "high":
        if diff >= 0:
            score += 0.5
        if diff <= -1:            # a bit too cold, a cold-sensitive user suffers
            score -= 0.5
    elif cold_sens == "low":
        if diff >= 1:             # too warm for someone who does not get cold
            score -= 0.5

    # rain/snow and protection
    if is_rain or is_snow:
        if is_waterproof:
            score += 1.0
        else:
            score -= 1.0
        # NOTE(review): this bonus is nested under the rain/snow check, so wind
        # protection and hoods are only rewarded during precipitation — confirm
        # that this is intended.
        if (is_windy and is_windproof) or has_hood:
            score += 0.5

    # positive score → the user liked the item
    liked = int(score > 0)

    return {
        "id": row_id,
        "cold_sensitivity": cold_sens,
        "user_style": user_style,
        "item_type": item_type,
        "item_subtype": item_subtype,
        "item_style": item_style,
        "item_color": item_color,
        "item_warmth": item_warmth_num,
        "is_waterproof": is_waterproof,
        "is_windproof": is_windproof,
        "has_hood": has_hood,
        "weather_coldness": weather_coldness_num,
        "is_rain": is_rain,
        "is_snow": is_snow,
        "is_windy": is_windy,
        "liked": liked,
    }


def generate_dataset(n_rows: int = 1500, seed: "int | None" = None) -> pd.DataFrame:
    """Build a synthetic dataset of ``n_rows`` samples.

    Args:
        n_rows: Number of rows to generate; rows get ids ``0 .. n_rows - 1``.
        seed: Optional seed for the module-level ``random`` generator. When
            given, the generator is re-seeded before any row is drawn, so the
            same seed always yields the same dataset. When ``None`` (default)
            the current generator state is used, as before.

    Returns:
        A DataFrame with one row per generated sample.
    """
    if seed is not None:
        # Row generation draws from the global `random` state, so seeding it
        # here is what makes the whole dataset reproducible.
        random.seed(seed)

    rows = [generate_one_row(i) for i in range(n_rows)]
    return pd.DataFrame(rows)

In [23]:
# Seed the stdlib RNG before generating: every row is drawn from the global
# `random` state, so without a seed the dataset (and every metric computed
# from it below) changes on each Restart & Run All.
random.seed(42)
df = generate_dataset(20000)
df

Unnamed: 0,id,cold_sensitivity,user_style,item_type,item_subtype,item_style,item_color,item_warmth,is_waterproof,is_windproof,has_hood,weather_coldness,is_rain,is_snow,is_windy,liked
0,0,high,sport,bottom,trousers,business,purple,2,0,0,0,3,0,1,1,0
1,1,medium,street,bottom,jeans,casual,black,2,0,0,0,3,0,1,1,1
2,2,medium,casual,outerwear,jacket,outdoor,grey,3,1,1,0,2,0,0,1,1
3,3,high,street,top,tshirt,sport,brown,1,0,0,0,3,0,0,1,0
4,4,medium,outdoor,top,shirt,party,navy,1,0,0,0,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19995,medium,street,bottom,trousers,casual,yellow,1,0,0,0,2,0,0,1,1
19996,19996,medium,casual,top,shirt,classic,white,1,0,0,0,3,0,0,0,0
19997,19997,low,outdoor,outerwear,coat,casual,purple,3,0,1,1,4,0,1,1,0
19998,19998,medium,business,top,shirt,casual,yellow,1,0,0,0,1,0,0,0,1


In [24]:
# The `id` column already identifies every row, so the DataFrame index is not
# written: otherwise it lands in the file as an extra unnamed first column
# that duplicates `id`.
df.to_csv('items_data.csv', index=False)

# Подготовка

In [25]:
# Target: the binary `liked` label.
y = df["liked"]
# Features: every remaining column except `id`, which is just the row counter.
X = df.drop(["liked", "id"], axis="columns")

In [26]:
# String-valued columns (user traits first, then item attributes).
categorical_features = [
    "cold_sensitivity", "user_style",
    "item_type", "item_subtype", "item_style", "item_color",
]

# 0/1 flags (item protection first, then weather conditions).
binary_features = [
    "is_waterproof", "is_windproof", "has_hood",
    "is_rain", "is_snow", "is_windy",
]

# Ordinal columns on the 1..4 scale.
numeric_features = ["item_warmth", "weather_coldness"]

# Создание модели

In [27]:
# Classifier: a random forest with a fixed seed.
model = RandomForestClassifier(n_estimators=400, max_depth=12, random_state=42)

# Preprocessing: one-hot encode the string columns and pass the 0/1 flags and
# the ordinal columns through unchanged. Categories unseen during fitting are
# ignored at prediction time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, categorical_features),
        ("bin", "passthrough", binary_features),
        ("num", "passthrough", numeric_features),
    ]
)

# Full pipeline: raw feature frame -> encoded matrix -> forest.
clf = Pipeline([("preprocess", preprocessor), ("model", model)])

In [28]:
# Hold out 25% of the rows for evaluation; stratifying on the label keeps the
# liked / not-liked ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

# Fit the preprocessing and the forest on the training part only.
clf.fit(X=X_train, y=y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['cold_sensitivity',
                                                   'user_style', 'item_type',
                                                   'item_subtype', 'item_style',
                                                   'item_color']),
                                                 ('bin', 'passthrough',
                                                  ['is_waterproof',
                                                   'is_windproof', 'has_hood',
                                                   'is_rain', 'is_snow',
                                                   'is_windy']),
                                                 ('num', 'passthrough',
                                                  ['item_warmth',
                              

In [29]:
# Hard class predictions and the predicted probability of the positive class
# (column 1 of predict_proba) on the held-out rows.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1, followed by the ROC-AUC of the probabilities.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=y_proba))

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


              precision    recall  f1-score   support

           0       0.91      0.95      0.93      2859
           1       0.93      0.88      0.91      2141

    accuracy                           0.92      5000
   macro avg       0.92      0.92      0.92      5000
weighted avg       0.92      0.92      0.92      5000

ROC-AUC: 0.9785769562722111


In [30]:
# Persist the whole fitted pipeline (preprocessing + forest) as one artifact.
joblib.dump(value=clf, filename="items_recommender.pkl")

['items_recommender.pkl']

# Пример использования

In [1]:
import joblib
import pandas as pd

# Load the pipeline saved by the training section. joblib files are unpickled
# on load, so only open artifacts that come from a trusted source.
clf = joblib.load(filename="items_recommender.pkl")

# Columns the pipeline was trained on (every dataset column except "liked" and "id").
_REQUIRED_FEATURES = (
    "cold_sensitivity", "user_style",
    "item_type", "item_subtype", "item_style", "item_color",
    "item_warmth", "is_waterproof", "is_windproof", "has_hood",
    "weather_coldness", "is_rain", "is_snow", "is_windy",
)


def score_item(features: dict) -> float:
    """Return the predicted probability that the user likes the item.

    Args:
        features: Mapping with the same keys as the training columns:
            ``cold_sensitivity``, ``user_style``, ``item_type``,
            ``item_subtype``, ``item_style``, ``item_color`` (strings);
            ``item_warmth`` and ``weather_coldness`` (1..4);
            ``is_waterproof``, ``is_windproof``, ``has_hood``, ``is_rain``,
            ``is_snow``, ``is_windy`` (0/1).

    Returns:
        Probability of the positive class (``liked == 1``) as a plain float.

    Raises:
        ValueError: If any required key is missing from ``features``.
    """
    # Fail early with a readable message instead of an error raised from
    # inside the pipeline.
    missing = [name for name in _REQUIRED_FEATURES if name not in features]
    if missing:
        raise ValueError(f"score_item: missing feature keys: {missing}")

    # The pipeline expects a DataFrame, so wrap the single sample in one row.
    sample = pd.DataFrame([features])
    proba = clf.predict_proba(sample)[0, 1]
    return float(proba)

## Примеры

In [3]:
# Example 1: light casual trousers in a preferred colour for a casual user, but
# worn in the coldest weather level (weather_coldness 4) with item_warmth 1.
features = {
    # user
    "cold_sensitivity": "low", "user_style": "casual",
    # item
    "item_type": "bottom", "item_subtype": "trousers",
    "item_style": "casual", "item_color": "blue",
    "item_warmth": 1,
    "is_waterproof": 0, "is_windproof": 0, "has_hood": 0,
    # weather
    "weather_coldness": 4,
    "is_rain": 0, "is_snow": 0, "is_windy": 0,
}

score_item(features)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


0.13958078958363296

In [33]:
# Example 2: a warm red street-style hoodie offered to a classic-style user in
# the hottest weather level (weather_coldness 1).
features = {
    # user
    "cold_sensitivity": "medium", "user_style": "classic",
    # item
    "item_type": "top", "item_subtype": "hoodie",
    "item_style": "street", "item_color": "red",
    "item_warmth": 3,
    "is_waterproof": 0, "is_windproof": 0, "has_hood": 1,
    # weather
    "weather_coldness": 1,
    "is_rain": 0, "is_snow": 0, "is_windy": 0,
}

score_item(features)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


0.2477891076143283