In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('train.csv')
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [2]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['Status'] = df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
df['Sex'] = encoder.fit_transform(df['Sex'])
df['Status'] = encoder.fit_transform(df['Status'])
df = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'])

In [3]:
known_age = df[df['Age'].notnull()]

# Строки, где возраст отсутствует (NaN)
unknown_age = df[df['Age'].isnull()]
print(known_age.shape)
X = known_age.drop(columns=['Age'])
y = known_age['Age']

(714, 7)


In [4]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth': 30, #the depth of the tree
    'eta': 0.5, #the learning rate
    'objective': 'reg:squarederror', #defines the type of learning problem and objective function
}
epochs = 50 #the number of boosting rounds

In [5]:
model = xgb.train(param, train, epochs)

In [28]:
# Predict ages for rows with missing values
# Recreate unknown_age with current df structure
unknown_age = df[df['Age'].isnull()]

unknown_age_features = unknown_age.drop(columns=['Age'])
# Create DMatrix and predict
unknown_dmatrix = xgb.DMatrix(unknown_age_features)
predicted_ages = model.predict(unknown_dmatrix)
# Round and convert to int
predicted_ages = np.round(predicted_ages).astype(int)
# Get indices where Age is null
null_indices = df[df['Age'].isnull()].index
# Assign predicted ages
for i, idx in enumerate(null_indices):
    df.at[idx, 'Age'] = predicted_ages[i]
# Save complete dataset with filled ages
df.to_csv('titanic_ages.csv', index=False)
print(f"✓ Predicted {len(predicted_ages)} missing ages")
print("✓ Saved to 'titanic_ages.csv'")

✓ Predicted 0 missing ages
✓ Saved to 'titanic_ages.csv'


  return func(**kwargs)


In [9]:
df2 = pd.read_csv('titanic_ages.csv')
df2.head

<bound method NDFrame.head of      Pclass  Sex   Age  SibSp  Parch     Fare  Status
0         3    1  22.0      1      0   7.2500      11
1         1    0  38.0      1      0  71.2833      12
2         3    0  26.0      0      0   7.9250       8
3         1    0  35.0      1      0  53.1000      12
4         3    1  35.0      0      0   8.0500      11
..      ...  ...   ...    ...    ...      ...     ...
886       2    1  27.0      0      0  13.0000      14
887       1    0  19.0      0      0  30.0000       8
888       3    0   5.0      1      2  23.4500       8
889       1    1  26.0      0      0  30.0000      11
890       3    1  32.0      0      0   7.7500      11

[891 rows x 7 columns]>