In [144]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [145]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Feature Engineering (dummy&poly)

In [147]:
def dummy(data, obj_features):
    dummy_data = pd.get_dummies(data[obj_features])
    dummy_data['PassengerId'] = data['PassengerId']
    df = dummy_data.merge(data[["PassengerId", "Fare", "Age"]], on = 'PassengerId',  how = 'inner')
    return df

y = train_data["Survived"] # уберем Survived, т.к. в тестовом df его нет
object_features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]

X_train = dummy(train_data, object_features)
X_test = dummy(test_data, object_features)

In [148]:
X_train.sample(5)

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,PassengerId,Fare,Age
443,2,0,0,1,0,0,0,1,444,13.0,28.0
854,2,1,0,1,0,0,0,1,855,26.0,44.0
554,3,0,0,1,0,0,0,1,555,7.775,22.0
489,3,1,1,0,1,0,0,1,490,15.9,9.0
123,2,0,0,1,0,0,0,1,124,13.0,32.5


In [149]:
X_train.info()
print('\n::::::::::::::::::::::::::::::::::::::::::::\n')
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       891 non-null    int64  
 1   SibSp        891 non-null    int64  
 2   Parch        891 non-null    int64  
 3   Sex_female   891 non-null    uint8  
 4   Sex_male     891 non-null    uint8  
 5   Embarked_C   891 non-null    uint8  
 6   Embarked_Q   891 non-null    uint8  
 7   Embarked_S   891 non-null    uint8  
 8   PassengerId  891 non-null    int64  
 9   Fare         891 non-null    float64
 10  Age          714 non-null    float64
dtypes: float64(2), int64(4), uint8(5)
memory usage: 53.1 KB

::::::::::::::::::::::::::::::::::::::::::::

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       418 non-null    int64  
 1   SibSp

In [150]:
X_train.Age.fillna(value = X_train.Age.mean(), inplace = True)
X_test.Age.fillna(value = X_test.Age.mean(), inplace = True)
X_test.Fare.fillna(value = X_test.Age.std(), inplace = True)

In [151]:
def polyfeatures(df):
    poly = PolynomialFeatures(2, interaction_only=True)
    x = poly.fit(df)
    df = pd.DataFrame(x.transform(df), columns=x.get_feature_names(df.columns))
    return df

X_test = polyfeatures(X_test)
X_train = polyfeatures(X_train)

## Machine learning

In [152]:
forest = RandomForestClassifier(n_estimators = 2500, max_depth = 7, random_state = 42)

x_train, x_test, y_train, y_test = train_test_split(X_train, y, test_size = 0.3, random_state = 77)
forest.fit(x_train, y_train)
predictions = forest.predict(x_test)

print(f"accuracy = {np.round(accuracy_score(predictions, y_test), 4)}")

accuracy = 0.8097


## Output

In [153]:
forest.fit(X_train, y)
predictions = forest.predict(X_test)

output = pd.DataFrame({'PassengerId': X_test["PassengerId"], 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)