# 데이터 전처리

In [17]:
import pandas as pd

train = pd.read_csv("train.csv")
train['Age']= train['Age'].fillna(train['Age'].mean())

features = ['Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Fare', 'Embarked']
X = pd.get_dummies(train[features], drop_first=True).values

y = train['Survived'].values

In [18]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np


class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt', max_samples=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.max_samples = max_samples
        self.estimators = []

        for i in range(n_estimators):
            dt = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
            self.estimators.append(dt)

    def fit(self, X, y):
        for i in range(self.n_estimators):
            x_sample, y_sample = self.sample(X, y)
            self.estimators[i].fit(x_sample, y_sample)
    
    def sample(self, X, y):
        if self.max_samples is None:
            n_samples = X.shape[0]
        else:
            n_samples = min(self.max_samples, X.shape[0])
        
        indices = np.random.choice(X.shape[0], n_samples, replace=True)
        return X[indices], y[indices]
    
    def predict(self, X):
        all_predictions = np.zeros((self.n_estimators, X.shape[0]), dtype=np.int64)
        for i in range(self.n_estimators):
            all_predictions[i] = self.estimators[i].predict(X)

        predictions = np.zeros(X.shape[0], dtype=np.int64)
        for i in range(X.shape[0]):
            predictions[i] = np.bincount(all_predictions[:, i]).argmax()
            
        return predictions

In [19]:
rf = RandomForestClassifier(n_estimators=50, max_depth=10, max_features=2)
rf.fit(X, y)
y_pred = rf.predict(X)

# train accuracy 출력
print("train accuracy:", (y==y_pred).mean())

train accuracy: 0.9236812570145904


In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=20, max_features=2, oob_score=True)
rf.fit(X, y)
y_pred = rf.predict(X)

print("train accuracy:", (y==y_pred).mean())
print("out-of-bag score:", rf.oob_score_)


train accuracy: 0.8821548821548821
out-of-bag score: 0.8305274971941639


In [22]:
test = pd.read_csv("test.csv")

test['Age'] = test['Age'].fillna(train['Age'].mean())

features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']
X_test = pd.get_dummies(test[features], drop_first=True).values
y_pred = rf.predict(X_test)

with open("rf_result.csv", "w") as f:
    f.write("PassengerId,Survived\n")

    for a, b in zip(test['PassengerId'].values, y_pred):
        f.write(f"{a},{b}\n")

In [24]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the training and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Add 'Survived' column to the test set for consistent processing
test['Survived'] = np.nan

# Combine the datasets
combined = pd.concat([train, test], ignore_index=True, sort=False)

# Extract titles from names
def extract_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

combined['Title'] = combined['Name'].apply(extract_title)

# Simplify titles
title_mapping = {
    "Mlle": "Miss", "Ms": "Miss", "Lady": "Mrs", "Mme": "Mrs",
    "Countess": "Mrs", "Dona": "Mrs", "Dr": "Rare", "Rev": "Rare",
    "Col": "Rare", "Major": "Rare", "Sir": "Rare", "Jonkheer": "Rare",
    "Don": "Rare", "Capt": "Rare"
}
combined['Title'] = combined['Title'].replace(title_mapping)

# Fill missing 'Age' values based on median age for each title
combined['Age'] = combined.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))

# Fill any remaining missing 'Age' values with overall median
combined['Age'].fillna(combined['Age'].median(), inplace=True)

# Fill missing 'Embarked' with the most common port
combined['Embarked'].fillna(combined['Embarked'].mode()[0], inplace=True)

# Fill missing 'Fare' values with the median fare
combined['Fare'].fillna(combined['Fare'].median(), inplace=True)

# Extract deck information from 'Cabin'
combined['Deck'] = combined['Cabin'].str[0]
combined['Deck'] = combined['Deck'].fillna('U')  # U for Unknown

# Create 'FamilySize' feature
combined['FamilySize'] = combined['SibSp'] + combined['Parch'] + 1

# Create 'IsAlone' feature
combined['IsAlone'] = 1  # Initialize to 1 (true)
combined.loc[combined['FamilySize'] > 1, 'IsAlone'] = 0  # Set to 0 if family size >1

# Categorize 'Age' into bands
combined['AgeBand'] = pd.cut(combined['Age'], 5)
age_label = LabelEncoder()
combined['AgeBand'] = age_label.fit_transform(combined['AgeBand'])

# Categorize 'Fare' into bands
combined['FareBand'] = pd.qcut(combined['Fare'], 4)
fare_label = LabelEncoder()
combined['FareBand'] = fare_label.fit_transform(combined['FareBand'])

# Drop unnecessary columns
combined.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# One-Hot Encoding for categorical variables
combined = pd.get_dummies(combined, columns=['Sex', 'Embarked', 'Title', 'Deck'], drop_first=True)

# Split back into train and test sets
train = combined[combined['Survived'].notnull()]
test = combined[combined['Survived'].isnull()].drop(['Survived'], axis=1)

# Define features and target variable
features = train.columns.drop(['Survived', 'PassengerId'])
X = train[features]
y = train['Survived'].astype(int)
X_test = test[features]

# Initialize and train the Random Forest classifier
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42
)
rf.fit(X, y)

# Predict survival on the test set
y_pred = rf.predict(X_test)

# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'].astype(int),
    'Survived': y_pred.astype(int)
})

# Write the submission DataFrame to a CSV file
submission.to_csv('submission1.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Age'].fillna(combined['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Embarked'].fillna(combined['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme