<a href="https://colab.research.google.com/github/lilyariver-design/Zarplata/blob/main/%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D0%B0%D0%B5%D1%82_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import statsmodels.api as sm

In [None]:
# Dataset coordinates in the R datasets index:
item = 'gss_wages'     # value of the "Item" field for your dataset
package = 'stevedata'  # value of the "Package" field for your dataset

# Fetch the dataset (cached locally after the first download) and keep only
# its data table; the last expression previews the first rows.
df = sm.datasets.get_rdataset(dataname=item, package=package, cache=True).data
df.head()

Unnamed: 0,year,realrinc,age,occ10,occrecode,prestg10,childs,wrkstat,gender,educcat,maritalcat
0,1974,4935.0,21.0,5620.0,Office and Administrative Support,25.0,0.0,School,Male,High School,Married
1,1974,43178.0,41.0,2040.0,Professional,66.0,3.0,Full-Time,Male,Bachelor,Married
2,1974,,83.0,,,,2.0,Housekeeper,Female,Less Than High School,Widowed
3,1974,,69.0,,,,2.0,Housekeeper,Female,Less Than High School,Widowed
4,1974,18505.0,58.0,5820.0,Office and Administrative Support,37.0,0.0,Full-Time,Female,High School,Never Married


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import warnings
warnings.filterwarnings("ignore")

# === 1. LOAD DATA ===
# NOTE(review): no cell visible here creates "gss_wages.csv"; it has to exist in
# the working directory already -- confirm where the file comes from.
df = pd.read_csv("gss_wages.csv", header=None)

# The file is read without a header row, so the column names are assigned here.
df.columns = [
    "id", "year", "realrinc", "age", "occ10", "occupation", "prestg10",
    "childs", "wrkstat", "gender", "educcat", "maritalcat"
]

# Because of header=None, a header line in the file (if it has one) is read as
# an ordinary data row and leaves the column as text. Coercing the target to
# numbers turns any such non-numeric entry into NaN, so it is removed together
# with the genuinely missing incomes and the target ends up with a numeric dtype.
df["realrinc"] = pd.to_numeric(df["realrinc"], errors="coerce")

# Drop rows without an income value
df = df.dropna(subset=["realrinc"]).copy()
print(f"Размер после удаления NaN в realrinc: {df.shape}")

# === 2. EDUCATION ===
# Numeric code assigned to each education category.
edu_map = {
    "Less Than High School": 10,
    "High School": 12,
    "Junior College": 14,
    "Bachelor": 16,
    "Graduate": 18,
}

# Map educcat -> education_num (categories missing from edu_map become NaN),
# discard the rows that could not be mapped, then store the result as integers.
df = (
    df.assign(education_num=df["educcat"].map(edu_map))
    .dropna(subset=["education_num"])
    .astype({"education_num": int})
)

# === 3. GENDER / 4. EMPLOYMENT ===
# Work statuses that are counted as "employed".
employed_statuses = ["Full-Time", "Part-Time", "Temporarily Not Working"]

# Both indicators are stored as 0/1 integers.
df = df.assign(
    is_male=df["gender"].eq("Male").astype(int),
    is_employed=df["wrkstat"].isin(employed_statuses).astype(int),
)

# === 5. DERIVED FEATURES ===
# Make sure the raw numeric inputs really are numbers; anything that cannot be
# parsed becomes NaN. `childs` is included because it is used as a model
# feature as well and would otherwise keep whatever dtype the CSV reader gave it.
numeric_inputs = ["age", "prestg10", "childs"]
df[numeric_inputs] = df[numeric_inputs].apply(pd.to_numeric, errors="coerce")

# Drop rows where age or prestg10 is NaN after the conversion
# (rows with a missing childs value are not removed at this point).
df = df.dropna(subset=["age", "prestg10"]).copy()

# Now the arithmetic is safe.
# work_experience = age - education_num - 6, floored at zero.
df["work_experience"] = (df["age"] - df["education_num"] - 6).clip(lower=0)
df["age_squared"] = df["age"] ** 2
df["experience_squared"] = df["work_experience"] ** 2

# === 6. ONE-HOT FOR MARITAL STATUS ===
# Indicator columns the model expects, in this order.
needed_marital = ["marital_Married", "marital_Never Married", "marital_Widowed", "marital_Separated"]

# Build the indicators, keep only the expected columns and create any that are
# absent from the data as all-zero columns.
marital_dummies = (
    pd.get_dummies(df["maritalcat"], prefix="marital")
    .reindex(columns=needed_marital, fill_value=0)
)
df = pd.concat([df, marital_dummies], axis=1)

# === 7. FEATURE SELECTION ===
# Model inputs; the order of this list is the column order of X.
feature_columns = [
    "prestg10",
    "education_num",
    "is_male",
    "childs",
    "age",
    "experience_squared",
    "work_experience",
    "age_squared",
    "marital_Married",
    "is_employed",
    "marital_Never Married",
    "marital_Widowed",
    "marital_Separated",
]

# Keep only rows with a complete feature vector and take the target values for
# exactly those rows, so X and y stay aligned.
X = df[feature_columns].dropna()
y = df.loc[X.index, "realrinc"]

print(f"Финальный размер датасета: {X.shape}")

# === 8. SPLIT ===
# First hold out 40% of the rows, then cut that hold-out in half:
# 60% train / 20% validation / 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

# === 9. TRAINING ===
print("Обучение модели...")

# Build the forest and fit it on the training split in one expression
# (fit returns the estimator itself).
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
).fit(X_train, y_train)

# === 10. EVALUATION ===
def evaluate(name, X, y, model=None):
    """Print R² and RMSE of a fitted model on one data split.

    Parameters
    ----------
    name : str
        Label printed in front of the metrics (e.g. "Train").
    X : feature matrix passed to ``model.predict``.
    y : true target values for the rows of ``X``.
    model : fitted estimator with a ``predict`` method, optional
        When omitted, the notebook-level ``rf`` is used, which keeps existing
        three-argument calls working unchanged.

    Returns
    -------
    None. The metrics are only printed.
    """
    if model is None:
        # Fall back to the forest trained earlier in the notebook.
        model = rf
    pred = model.predict(X)
    r2 = r2_score(y, pred)
    rmse = np.sqrt(mean_squared_error(y, pred))
    print(f"{name} → R²: {r2:.4f}, RMSE: {rmse:,.2f}")

# Report the metrics for every split, in train -> validation -> test order.
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
):
    evaluate(split_name, split_X, split_y)

# === 11. SAVING ===
# Persist the fitted forest to disk in the working directory.
joblib.dump(value=rf, filename="random_forest_model.pkl")
print("\n✅ Модель сохранена в random_forest_model.pkl")

# === 12. FEATURE IMPORTANCE ===
# Pair every feature name with its importance from the fitted forest,
# then order the table from most to least important.
importances = pd.DataFrame(
    {"Признак": feature_columns, "Важность": rf.feature_importances_}
)
importances = importances.sort_values(by="Важность", ascending=False)

# Show the five strongest features and write the full table to CSV.
print("\nТОП-5 признаков:")
print(importances.head(5))
importances.to_csv("feature_importance_generated.csv", index=False)

Размер после удаления NaN в realrinc: (37888, 12)
Финальный размер датасета: (37189, 13)
Обучение модели...
Train → R²: 0.4553, RMSE: 22,354.14
Val → R²: 0.2682, RMSE: 22,289.28
Test → R²: 0.2520, RMSE: 22,331.85

✅ Модель сохранена в random_forest_model.pkl

ТОП-5 признаков:
         Признак  Важность
0       prestg10  0.262060
1  education_num  0.186668
2        is_male  0.100612
4            age  0.099558
7    age_squared  0.097445
