In [667]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score ,classification_report ,confusion_matrix

In [668]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [669]:
class CONFIG:
    title_list = [
                    'Mr', 'Mrs', 'Miss', 'Rev', 'Ms', 'Dr', 'Lady', 'Master', 'Don', 'Mme', 
                    'Major', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer'
                ]
    title_mapping = {
                        'Mr': ('male', 18, np.inf),
                        'Mrs': ('female', 18, np.inf),
                        'Miss': ('female', 0, 18),
                        'Master': ('male', 0, 18),
                        'Rev': ('male', 18, np.inf),
                        'Ms': ('female', 18, np.inf),
                        'Dr': ('male', 18, np.inf),
                        'Dr': ('female', 18, np.inf),
                        'Lady': ('female', 18, np.inf),
                        'Don': ('male', 18, np.inf),
                        'Mme': ('female', 18, np.inf),
                        'Major': ('male', 18, np.inf),
                        'Sir': ('male', 18, np.inf),
                        'Mlle': ('female', 18, np.inf),
                        'Col': ('male', 18, np.inf),
                        'Capt': ('male', 18, np.inf),
                        'Countess': ('female', 18, np.inf),
                        'Jonkheer': ('male', 18, np.inf)
                    }
    INPUT_COLS = ['Sex', 'Pclass','AgeGroup', 'FarePerPerson', 'CabinLetter','FamilyCategory']
    TARGET_COL = ['Survived']
    CAT_COLS = ['Sex', 'Pclass','AgeGroup', 'CabinLetter','FamilyCategory']
    NUM_COLS = ['FarePerPerson']
    TEST_SIZE = 0.25
    RANDOM_STATE = 42
    LBL_ENC = LabelEncoder()
    OHE_ENC = OneHotEncoder(sparse=False, handle_unknown='ignore')
    SCALER = StandardScaler()
    
    

In [670]:
def extract_tiles_from_names(df):
    df['FamilyName'] = df['Name'].apply(lambda x: x.split(',')[0])
    df['FirstName'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[1].strip())
    df['MappedTitle'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    return df

def map_title(row):
    if pd.isnull(row['Age']):
        mapped_title = row['MappedTitle']
        sex = row['Sex']
        for title, (title_sex, min_age, max_age) in CONFIG.title_mapping.items():
            if title == 'Dr':
                if sex in ['male', 'female'] and mapped_title == 'Dr':
                    mapped_title = 'Mr' if sex == 'male' else 'Mrs'
                    break
            elif title_sex == sex and min_age <= row['Age'] <= max_age:
                mapped_title = title
                break
    else:
        sex = row['Sex']
        age = row['Age']
        for title, (title_sex, min_age, max_age) in CONFIG.title_mapping.items():
            if title == 'Dr':
                if sex in ['male', 'female'] and min_age <= age <= max_age:
                    mapped_title = 'Mr' if sex == 'male' else 'Mrs'
                    break
            elif title_sex == sex and min_age <= age <= max_age:
                mapped_title = title
                break
    return mapped_title

def group_age(title):
    if title == "Mr" or  title == "Mr":
        return "Adult"
    else:
        return "Children"

def fill_age_null_values(df):
    df.loc[(df['AgeGroup'] == 'Children') & (df['Age'].isnull()), 'Age'] = float(round(df[df['AgeGroup'] == 'Children']['Age'].median()))
    df.loc[(df['AgeGroup'] == 'Adult') & (df['Age'].isnull()), 'Age'] = float(round(df[df['AgeGroup'] == 'Adult']['Age'].median()))
    df.drop('MappedTitle', axis=1, inplace=True)
    return df

def handle_age_outliers_for_age(df, age_group):
    mask = df['AgeGroup'] == age_group
    
    q1 = df.loc[mask, 'Age'].quantile(0.25)
    q3 = df.loc[mask, 'Age'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    
    df.loc[mask & (df['Age'] < lower_bound), 'Age'] = lower_bound
    df.loc[mask & (df['Age'] > upper_bound), 'Age'] = upper_bound
    return df

def update_family_counts(row, df):
    family_name = row['FamilyName']
    ticket = row['Ticket']
    num_children = df[(df['FamilyName'] == family_name) & (df['Title'].isin(['Miss', 'Master'])) | (df['Ticket'] == ticket) & (df['Title'].isin(['Miss', 'Master']))].shape[0]
    num_adults = df[(df['FamilyName'] == family_name) & (df['Title'].isin(['Mr', 'Mrs'])) | (df['Ticket'] == ticket) & (df['Title'].isin(['Mr', 'Mrs']))].shape[0]
    row['SibSp'] = num_children
    row['Parch'] = num_adults
    row['FamilySize'] = num_children + num_adults 
    return row

def categorize_family_size(size):
    if size == 1:
        return "Alone"
    elif 1 <= size <= 3:
        return "Medium"
    else:
        return "Large"
    
    
def create_ticket_to_fare_mapping(df):
    duplicate_tickets = df[df['Fare'] != 0].groupby('Ticket')['Fare'].max()
    ticket_to_fare = duplicate_tickets.to_dict()
    return ticket_to_fare

def fill_fare(row, ticket_to_fare ,df):
    if row['Fare'] == 0:
        if row['Ticket'] in ticket_to_fare:
            return ticket_to_fare[row['Ticket']]
        elif row['Pclass'] == 1:
            return df[df['Pclass'] == 1]['Fare'].mode().iloc[0]
        elif row['Pclass'] == 2:
            return df[df['Pclass'] == 2]['Fare'].mode().iloc[0]
        elif row['Pclass'] == 3:
            return df[df['Pclass'] == 3]['Fare'].mode().iloc[0]
    elif pd.isnull(row['Fare']):
        return df[df['Pclass'] == row['Pclass']]['Fare'].mode().iloc[0]
    return row['Fare']

def fare_per_person(df):
    ticket_counts = df['Ticket'].value_counts()
    df['FarePerPerson'] = df['Fare'] / ticket_counts[df['Ticket']].values
    return df

def outliers_using_iqr_fare(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3-q1
    lower_limit = q1 - 1.5*iqr
    upper_limit = q3 + 1.5*iqr
    df[col] = np.where(df[col] > upper_limit, upper_limit, df[col])
    df[col] = np.where(df[col] < lower_limit, lower_limit, df[col])
    return df

In [671]:
def fill_cabin(row,df):
    if row['Cabin'] == 'Unknown' and row['Pclass'] in [1, 2, 3]:
        class_condition = row['Pclass']
        fare = row['Fare']
        duplicate_tickets = df[(df['Ticket'] == row['Ticket']) & (df['Cabin'] != 'Unknown')]['Cabin']
        family_members_cabins = df[(df['FamilyName'] == row['FamilyName']) & (df['Cabin'] != 'Unknown')]['Cabin']
        if duplicate_tickets.any():
            return duplicate_tickets.iloc[0]
        
        elif family_members_cabins.any():
            return family_members_cabins.iloc[0]
        else:
            if class_condition == 1:
                if fare <= 55:
                    return 'C123'
                elif (55 < fare) & (fare <= 75):
                    return np.random.choice(['B22','C2','E44'])
                elif (75 < fare) & (fare <= 91):
                    return np.random.choice(['C78','C83','E67'])
                else:
                    return np.random.choice(['B58 B60','C22 C26','C23 C25 C27','C65','C68'])
            elif class_condition == 2:
                if fare > 13:
                    return 'D'
                else:
                    return 'F2'
            elif class_condition == 3:
                return 'G6'
        
    return row['Cabin']

In [672]:
def encoding_with_one_hot_encoder(data, features):
    fitting_encoder = CONFIG.OHE_ENC.fit(data[features]).transform(data[features])
    col_names = CONFIG.OHE_ENC.get_feature_names_out(input_features=features)
    encoder_df = pd.DataFrame(fitting_encoder, columns=col_names,
                              index=data.index)
    data = data.join(encoder_df)
    data = data.drop(features, axis=1)
    return data



def encoder_with_transform(data, features):
    fitting_encoder = CONFIG.OHE_ENC.transform(data[features])
    col_names = CONFIG.OHE_ENC.get_feature_names_out(input_features=features)
    encoder_df = pd.DataFrame(fitting_encoder,
                              columns=col_names,
                              index=data.index)
    data = data.join(encoder_df)
    data = data.drop(features, axis=1)
    return data
    


In [673]:
train_df['FamilyName'] = train_df['Name'].apply(lambda x: x.split(',')[0])
train_df['FirstName'] = train_df['Name'].apply(lambda x: x.split(',')[1].split('.')[1].strip())
train_df['MappedTitle'] = train_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Apply the map_title function to create 'Title' column
train_df['Title'] = train_df.apply(map_title, axis=1)   
train_df['AgeGroup'] = train_df['Title'].apply(lambda x: group_age(x))

train_df = fill_age_null_values(train_df)

train_df = train_df.apply(lambda row: update_family_counts(row, train_df), axis=1)
train_df['FamilyCategory'] = train_df['FamilySize'].apply(categorize_family_size)

train_df['Cabin'] = train_df['Cabin'].fillna('Unknown')
train_df['CabinLetter'] = train_df['Cabin'].apply(lambda x: str(x)[0])

# for pclass in [1, 2, 3]:
#     #survived_cabin = train_df[(train_df['Pclass'] == pclass) & (train_df['Survived'] == 1)]['Cabin'].unique()
#     #died_cabin = train_df[(train_df['Pclass'] == pclass) & (train_df['Survived'] == 0)]['Cabin'].unique()
#     train_df['Cabin'] = train_df.apply(lambda x: fill_cabin(x, train_df), axis=1)
train_df['Cabin'] = train_df.apply(lambda x: fill_cabin(x, train_df), axis=1)
train_df['CabinLetter'] = train_df['Cabin'].apply(lambda cabin: cabin[0] if cabin != 'Unknown' else 'Unknown')



train_df['Embarked'].replace(np.nan,'S',inplace=True)

train_df = handle_age_outliers_for_age(train_df, 'Adult')
train_df = handle_age_outliers_for_age(train_df, 'Children')

ticket_to_fare_mapping = create_ticket_to_fare_mapping(train_df)
train_df['Fare'] = train_df.apply(lambda row: fill_fare(row, ticket_to_fare_mapping, train_df), axis=1)
train_df = fare_per_person(train_df)

train_df = outliers_using_iqr_fare(train_df, 'Fare')

In [674]:
test_df['FamilyName'] = test_df['Name'].apply(lambda x: x.split(',')[0])
test_df['FirstName'] = test_df['Name'].apply(lambda x: x.split(',')[1].split('.')[1].strip())
test_df['MappedTitle'] = test_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())


test_df['Title'] = test_df.apply(map_title, axis=1)   
test_df['AgeGroup'] = test_df['Title'].apply(lambda x: group_age(x))

test_df = fill_age_null_values(test_df)

test_df = test_df.apply(lambda row: update_family_counts(row, test_df), axis=1)
test_df['FamilyCategory'] = test_df['FamilySize'].apply(categorize_family_size)

test_df['Cabin'] = test_df['Cabin'].fillna('Unknown')
test_df['CabinLetter'] = test_df['Cabin'].apply(lambda x: str(x)[0])


test_df['Cabin'] = test_df.apply(lambda x: fill_cabin(x, test_df), axis=1)
test_df['CabinLetter'] = test_df['Cabin'].apply(lambda cabin: cabin[0] if cabin != 'Unknown' else 'Unknown')


test_df['Embarked'].replace(np.nan,'S',inplace=True)

test_df = handle_age_outliers_for_age(test_df, 'Adult')
test_df = handle_age_outliers_for_age(test_df, 'Children')

ticket_to_fare_mapping_test = create_ticket_to_fare_mapping(test_df)
test_df['Fare'] = test_df.apply(lambda row: fill_fare(row, ticket_to_fare_mapping_test, test_df), axis=1)
test_df = fare_per_person(test_df)

test_df = outliers_using_iqr_fare(test_df, 'Fare')

In [675]:
X = train_df[CONFIG.INPUT_COLS]
y = train_df[CONFIG.TARGET_COL]

X_test_df = test_df[CONFIG.INPUT_COLS]

In [676]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=CONFIG.TEST_SIZE)

In [677]:
# for col in CONFIG.CAT_COLS:    
#     CONFIG.LBL_ENC.fit(list(X_train[col].values)) 
#     X_train[col] = CONFIG.LBL_ENC.transform(list(X_train[col].values))
#     X_test[col] = CONFIG.LBL_ENC.transform(list(X_test[col].values))

    
for col in CONFIG.NUM_COLS:
    X_train[col] = CONFIG.SCALER.fit(X_train[col].values.reshape(-1,1)).transform(X_train[col].values.reshape(-1,1))
    X_test[col] = CONFIG.SCALER.transform(X_test[col].values.reshape(-1,1))
    X_test_df[col] = CONFIG.SCALER.transform(X_test_df[col].values.reshape(-1,1))

X_train = encoding_with_one_hot_encoder(X_train, CONFIG.CAT_COLS)
X_test = encoder_with_transform(X_test, CONFIG.CAT_COLS)
X_test_df = encoder_with_transform(X_test_df, CONFIG.CAT_COLS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_df[col] = CONFIG.SCALER.transform(X_test_df[col].values.reshape(-1,1))


In [678]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [679]:
rf = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=8, min_samples_leaf=16, min_samples_split=2, max_features='sqrt' )
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


y_test_pred = rf.predict(X_test_df)

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('submission.csv', index=False)

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

           0       0.78      0.93      0.85       122
           1       0.89      0.67      0.77       101

    accuracy                           0.82       223
   macro avg       0.84      0.80      0.81       223
weighted avg       0.83      0.82      0.81       223

[[114   8]
 [ 33  68]]
0.8161434977578476


In [680]:
gb_trees = GradientBoostingClassifier( n_estimators=100, random_state=0, learning_rate=0.1, max_depth=6, min_samples_leaf=8, min_samples_split=8, subsample=0.8)
gb_trees.fit(X_train, y_train)
gb_pred = gb_trees.predict(X_test)
print(classification_report(y_test, gb_pred))
print(confusion_matrix(y_test, gb_pred))
print(accuracy_score(y_test, gb_pred))


gb_test_pred = gb_trees.predict(X_test_df)

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": gb_test_pred
    })
submission.to_csv('submission_gb.csv', index=False)


  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.83      0.90      0.87       122
           1       0.87      0.78      0.82       101

    accuracy                           0.85       223
   macro avg       0.85      0.84      0.84       223
weighted avg       0.85      0.85      0.85       223

[[110  12]
 [ 22  79]]
0.8475336322869955
