# Feature Construction, Splitting

In [27]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

In [28]:
# laod dataset with selected columns
df = sns.load_dataset('titanic')[['age', 'pclass', 'sibsp', 'parch', 'survived']]

In [29]:
# display the first few rows of the dataframe
df.head()

Unnamed: 0,age,pclass,sibsp,parch,survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [30]:
# remove missing values
df.dropna(inplace=True)

In [31]:
# split features and target variable
X = df.iloc[:,0:4]
y= df.iloc[:,-1]

In [32]:
# display the first few rows of the features dataframe
X.head()

Unnamed: 0,age,pclass,sibsp,parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [33]:
# evaluate logistic regression model using 10-fold cross-validation
np.mean(cross_val_score(LogisticRegression(), X, y, cv=10, scoring='accuracy'))

np.float64(0.6921165884194054)

# Applying feature construction

In [34]:
# Feature Construction: Create 'Family_size' feature
X['Family_size'] = X['sibsp'] + X['parch'] +1

In [35]:
X.head()

Unnamed: 0,age,pclass,sibsp,parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [36]:
def myfunc(num):
    """Categorizes a family size number into groups."""
    
    if num == 1:
        # 0 = alone
        return 0

    elif num > 1 and num <= 4:
        # 1 = small family (2-4 members)
        return 1

    else:
        # 2 = large family (5+ members)
        return 2

In [37]:
# Apply the function to the 'Family_size' feature
myfunc(4)

1

In [38]:
# Create 'Family_type' feature based on 'Family_size'
X['Family_type'] = X['Family_size'].apply(myfunc)

In [39]:
X.head()

Unnamed: 0,age,pclass,sibsp,parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [40]:
# Drop the original 'sibsp', 'parch', and 'Family_size' features
X.drop(columns=['sibsp', 'parch', 'Family_size'], inplace=True)

In [41]:
X.head()

Unnamed: 0,age,pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [42]:
# evaluate logistic regression model using 20-fold cross-validation
np.mean(cross_val_score(LogisticRegression(), X, y , cv=20, scoring= 'accuracy'))

np.float64(0.7003174603174602)

# Feature Splitting

In [45]:
# Load dataset from CSV file
df = pd.read_csv('../Dataset/train_titanic.csv')

In [46]:
# display the first few rows of the dataframe
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [47]:
# display the 'Name' column
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [48]:
# Create 'Title' feature by extracting titles from the 'Name' column
df['Title']= df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0]

In [49]:
# display the 'Name' and 'Title' columns
df[['Name', 'Title']].head()

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


In [50]:
# display data types of each column
df.dtypes


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Title           object
dtype: object

In [51]:
# analyze survival rates by title
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Title
Lady            1.000000
Ms              1.000000
Sir             1.000000
Mme             1.000000
the Countess    1.000000
Mlle            1.000000
Mrs             0.792000
Miss            0.697802
Master          0.575000
Major           0.500000
Col             0.500000
Dr              0.428571
Mr              0.156673
Capt            0.000000
Jonkheer        0.000000
Don             0.000000
Rev             0.000000
Name: Survived, dtype: float64

In [52]:
# Create 'Is_Married' feature based on 'Title'
df['Is_Married'] = 0
df['Is_Married'].loc[df['Title']== 'Mrs'] =1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Is_Married'].loc[df['Title']== 'Mrs'] =1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Is_Married'].l

In [53]:
# display the 'Is_Married' column
df['Is_Married']

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    0
890    0
Name: Is_Married, Length: 891, dtype: int64