<a href="https://colab.research.google.com/github/jahnavimidde/VsemML/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Data Collection
import pandas as pd
df = pd.read_csv("/content/titanic_train (1).csv")
print(df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
#Handling null values
df.ffill(inplace=True) #Forward fill
df.bfill(inplace=True) #Backward fill
print(df.isnull().any())  #If any null values, returns True

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool


In [3]:
#Handling different columns
#Drop ones that don't contribute
df.drop('PassengerId',axis=1,inplace=True)

In [4]:
#Encode nominal categorical values
df["Sex"]=df["Sex"].map({"male":0,"female":1})

In [5]:
#Extract useful prefixes
import re
df["Title"]=df["Name"].str.extract(r",\s*([^\.]+)\.")
df.drop("Name",axis=1,inplace=True)
df["Deck"] = df["Ticket"].str.extract(r"([A-Za-z\.]+)")
df["Deck"] = df["Deck"].fillna("NoPrefix")
df.drop("Ticket", axis=1, inplace=True)
#But too many prefixes for deck
threshold=10
prefix_counts = df["Deck"].value_counts()
rare_prefixes = prefix_counts[prefix_counts < threshold].index
df["Deck"] = df["Deck"].replace(rare_prefixes, 'Other')
# One-Hot Encode 'Deck' column, drop_first=True avoids dummy variable trap
df = pd.get_dummies(df, columns=["Deck"], drop_first=True)

In [6]:
#After data preprocessing
print(df.head(2))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Cabin Embarked Title  \
0         0       3    0  22.0      1      0   7.2500   C85        S    Mr   
1         1       1    1  38.0      1      0  71.2833   C85        C   Mrs   

   Deck_C.A.  Deck_NoPrefix  Deck_Other  Deck_PC  Deck_SC  Deck_SOTON  \
0      False          False       False    False    False       False   
1      False          False       False     True    False       False   

   Deck_STON  
0      False  
1      False  


#Train and Test Data Splitting

In [7]:
#Data Splitting
from sklearn.model_selection import train_test_split
X=df.drop("Survived",axis=1)
y=df["Survived"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [8]:
#Basic Filter methods
cat_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
X_train=pd.get_dummies(X_train,columns=cat_columns,drop_first=True)
X_train=X_train.astype(int)
X_test=pd.get_dummies(X_test,columns=cat_columns,drop_first=True)
X_test=X_test.astype(int)
#Removing Constant features
const = []
for features in X_train:
  if(X_train[features].std()==0):
    const.append(features)
print("Number of constant features:",len(const))
X_train.drop(labels=const,axis=1,inplace=True)
X_test.drop(labels=const,axis=1,inplace=True)

#Removing quasi constant features
quasi_constant = []
for feature in X_train.columns:
  predominant = (X_train[feature].value_counts()/float(len(X_train))).sort_values(ascending=False).values[0]
  if(predominant>0.999):
    quasi_constant.append(feature)
print("Number of quasi constant features:",len(quasi_constant))
X_train.drop(labels=quasi_constant,axis=1,inplace=True)
X_test.drop(labels=quasi_constant,axis=1,inplace=True)#Apply same removal to X_test

#Duplicated features
duplicates = []
for i in range(len(X_train.columns)):
  col1 = X_train.columns[i]
  for col2 in X_train.columns[i+1:]:
    if(X_train[col1].equals(X_train[col2])): #Not ==, as it won't return a single True of False
      duplicates.append(col2)
print("Number of duplicate features:",len(duplicates))
X_train.drop(labels=duplicates,axis=1,inplace=True)
X_test.drop(labels=duplicates,axis=1,inplace=True)#Apply same removal to X_test


Number of constant features: 0
Number of quasi constant features: 0
Number of duplicate features: 0


In [9]:
#Statistical Filter Methods
num_features=[col for col in X_train.columns if not set(X_train[col].unique()).issubset({0,1})]
cat_features = [col for col in X_train.columns if set(X_train[col].unique()).issubset({0,1})]
from sklearn.feature_selection import f_classif,SelectKBest
#1. Anova
f_values,p_values = f_classif(X_train[num_features],y_train)
anova_df=pd.DataFrame({
    "Numerical Features":num_features,
    "F_values":f_values,
    "P_values":p_values
    })
anova_df.sort_values(by="P_values",inplace=True)
significant_numeric_features = anova_df[anova_df["P_values"]<0.05]["Numerical Features"].tolist()
print("Selected Numerical Features:", significant_numeric_features)
#2.chi2
from sklearn.feature_selection import chi2
chi2_values,p_values=chi2(X_train[cat_features],y_train)
chi2_df=pd.DataFrame({
    "Categorical Features":cat_features,
    "Chi2_values":chi2_values,
    "p_values":p_values
     })
chi2_df.sort_values(by="p_values",inplace=True)
significant_chi2_features=chi2_df[chi2_df["p_values"]<0.05]["Categorical Features"].tolist()
print("Selected Features from Chi2 test:", significant_chi2_features)
#3. Mutual info
from sklearn.feature_selection import mutual_info_classif,SelectKBest
selector = SelectKBest(score_func=mutual_info_classif,k=10)
selector.fit(X_train[cat_features],y_train)
significant_mi_features=X_train[cat_features].columns[selector.get_support()].tolist()
print("Selected Features from MI test:",significant_mi_features)
final_selected_features=list(set(significant_numeric_features+significant_chi2_features+significant_mi_features)) #Set conversion is for removing duplicates, converting to list is to use it as indexer
#Filter
X_train=X_train[final_selected_features]
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X_train.columns]

Selected Numerical Features: ['Pclass', 'Fare', 'Parch', 'Age']
Selected Features from Chi2 test: ['Sex', 'Title_Mr', 'Title_Mrs', 'Title_Miss', 'Deck_PC', 'Embarked_S', 'Cabin_A34', 'Cabin_D33', 'Cabin_C110', 'Title_Master']
Selected Features from MI test: ['Sex', 'Deck_NoPrefix', 'Cabin_B73', 'Cabin_C124', 'Cabin_C99', 'Cabin_D56', 'Title_Don', 'Title_Miss', 'Title_Mr', 'Title_Mrs']


In [10]:
#Dataset after data preprocessing and feature selection
print(X_train.columns)
print(X_train.head(3))

Index(['Title_Mr', 'Cabin_D33', 'Deck_NoPrefix', 'Pclass', 'Fare', 'Title_Mrs',
       'Cabin_B73', 'Cabin_C99', 'Title_Don', 'Age', 'Parch', 'Cabin_A34',
       'Cabin_C110', 'Deck_PC', 'Cabin_C124', 'Embarked_S', 'Title_Miss',
       'Sex', 'Title_Master', 'Cabin_D56'],
      dtype='object')
     Title_Mr  Cabin_D33  Deck_NoPrefix  Pclass  Fare  Title_Mrs  Cabin_B73  \
748         1          0              1       1    53          0          0   
45          1          0              0       3     8          0          0   
28          0          0              1       3     7          0          0   

     Cabin_C99  Title_Don  Age  Parch  Cabin_A34  Cabin_C110  Deck_PC  \
748          0          0   19      0          0           0        0   
45           0          0   19      0          0           0        0   
28           0          0   19      0          0           0        0   

     Cabin_C124  Embarked_S  Title_Miss  Sex  Title_Master  Cabin_D56  
748           0        