In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [14]:
# Load the train data set
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Can simplify the Titles with this dictionary:
Title_Dictionary = {"Capt"  : "Officer", 
                    "Col"   : "Officer",
                    "Major" : "Officer",
                    "Dr"    : "Officer", 
                    "Rev"   : "Officer", 
                    "Don"   : "Royalty", 
                    "Sir"   : "Royalty", 
                    "Jonkheer": "Royalty",
                    "the Countess":"Royalty", 
                    "Mr"    : "Mr", 
                    "Mme"   : "Mrs", 
                    "Mrs"   : "Mrs", 
                    "Ms"    : "Miss", 
                    "Miss"  : "Miss", 
                    "Mlle"  : "Miss", 
                    "Master" : "Master", 
                    "Lady"  : "Royalty", 
                    "*" : "Other" 
                    }


# ----------------------------------------------------------------------
# TRAIN modifications
# ----------------------------------------------------------------------
train["Title"] = train['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
train["Title"] = train["Title"].map(Title_Dictionary)
# print(data["Title"].value_counts())

# fill in NA values for Fare and Title
avg_fare = round(train['Fare'].mean())
train['Fare'] = train['Fare'].fillna(avg_fare)

avg_age = round(train['Age'].mean())
train['Age'] = train['Age'].fillna(avg_age)

train['Title'] = train['Title'].fillna("Other")


for i in ('Sex','Embarked', 'Title'):
    # Create a one-hot encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Encode the "Sex" column
    encoder.fit(train[[i]])
    sex_encoded = encoder.transform(train[[i]])
    sex_encoded_df = pd.DataFrame(sex_encoded.toarray(), columns=encoder.get_feature_names_out([i]))

    # Join the encoded column to the original dataframe
    train = pd.concat([train, sex_encoded_df], axis=1)

    # Drop the original "Sex" column
    train = train.drop(columns=[i])

train = train.drop(columns=["Name", "Ticket", "Cabin", "Title_Royalty", 'Embarked_nan'])
print(train.info())

# ----------------------------------------------------------------------
# TEST modifications
# ----------------------------------------------------------------------
test["Title"] = test['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
test["Title"] = test["Title"].map(Title_Dictionary)
# print(data["Title"].value_counts())

# fill in NA values for Fare and Title
avg_fare = round(test['Fare'].mean())
test['Fare'] = test['Fare'].fillna(avg_fare)

avg_age = round(test['Age'].mean())
test['Age'] = test['Age'].fillna(avg_age)


test['Title'] = test['Title'].fillna("Other")

for i in ('Sex','Embarked', 'Title'):
    # Create a one-hot encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Encode the "Sex" column
    encoder.fit(test[[i]])
    sex_encoded = encoder.transform(test[[i]])
    sex_encoded_df = pd.DataFrame(sex_encoded.toarray(), columns=encoder.get_feature_names_out([i]))

    # Join the encoded column to the original dataframe
    test = pd.concat([test, sex_encoded_df], axis=1)

    # Drop the original "Sex" column
    test = test.drop(columns=[i])

test = test.drop(columns=["Name", "Ticket", "Cabin"])
print(test.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Survived       891 non-null    int64  
 2   Pclass         891 non-null    int64  
 3   Age            891 non-null    float64
 4   SibSp          891 non-null    int64  
 5   Parch          891 non-null    int64  
 6   Fare           891 non-null    float64
 7   Sex_female     891 non-null    float64
 8   Sex_male       891 non-null    float64
 9   Embarked_C     891 non-null    float64
 10  Embarked_Q     891 non-null    float64
 11  Embarked_S     891 non-null    float64
 12  Title_Master   891 non-null    float64
 13  Title_Miss     891 non-null    float64
 14  Title_Mr       891 non-null    float64
 15  Title_Mrs      891 non-null    float64
 16  Title_Officer  891 non-null    float64
dtypes: float64(12), int64(5)
memory usage: 118.5 KB
None
<

In [15]:
print(train.isnull().sum())
print()
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Age              0
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Title_Master     0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
dtype: int64

PassengerId      0
Pclass           0
Age              0
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Title_Master     0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
Title_Other      0
dtype: int64


In [16]:
# ----------------------------------------------------------------------
# Checking my work 
# ----------------------------------------------------------------------
print(train.columns)
print(test.columns)


# Split the data into y_train and x_train
y_train = train["Survived"]
x_train = train.drop(columns=["Survived"])

x_test = test.drop(columns=["Title_Other"])

# print(y_train)
# print(y_test.columns)
# print(x_train.columns)
# print(x_test.columns)
# print(x_train.shape)
# print(x_test.shape)

print(x_train.info())
print(x_test.info())

# print(train.head())


Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Other'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Age            891 non-null    float64
 3   SibSp          891 non-null    int64  
 4   Parch          891 non-null    int64  
 5   Fare           891 non-null    float64
 6   Sex_female     891 non-null

In [17]:
# Create a Random Forest model
RF = RandomForestClassifier(random_state=1)

# Use RFE (Recursive Factor Elimination) to select the top 4 features
my_model = RFE(estimator=RF, n_features_to_select=5)
my_model.fit(x_train, y_train)

# Print the selected features
print(x_train.columns[my_model.support_])

# Fit the model to the x_train and y_train data
my_model.fit(x_train, y_train)

# Make predictions for the test data
y_test_pred = my_model.predict(x_test)

# Create Submission csv
df = pd.DataFrame(y_test_pred, columns = ['Survived'])
submission = pd.DataFrame({"PassengerId" : test['PassengerId'], "Survived" : y_test_pred}).astype('int32')
submission.info()
submission.to_csv("submission_RF.csv", index=False)

Index(['PassengerId', 'Age', 'Fare', 'Sex_female', 'Title_Mr'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int32
 1   Survived     418 non-null    int32
dtypes: int32(2)
memory usage: 3.4 KB


In [18]:
# Create an XGBoost model
xgb = XGBClassifier(random_state=1)

# Use RFE (Recursive Factor Elimination) to select the top 4 features
my_model = RFE(estimator=xgb, n_features_to_select=3)
my_model.fit(x_train, y_train)

# Print the selected features
print(x_train.columns[my_model.support_])

# Fit the model to the x_train and y_train data
my_model.fit(x_train, y_train)

# Make predictions for the test data
y_test_pred = my_model.predict(x_test)

# Create Submission csv
df = pd.DataFrame(y_test_pred, columns = ['Survived'])
submission = pd.DataFrame({"PassengerId" : test['PassengerId'], "Survived" : y_test_pred}).astype('int32')
submission.info()
submission.to_csv("submission_XGB.csv", index=False)

Index(['Pclass', 'Sex_female', 'Title_Mr'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int32
 1   Survived     418 non-null    int32
dtypes: int32(2)
memory usage: 3.4 KB
