In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read both CSVs once and leave the loaded frames untouched; all cleaning
# below happens on the working copies `df` (train.csv) and `tf` (test.csv).
data, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
df, tf = data.copy(), test.copy()

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Remove the Name column; it is not among the features the model uses.
df = df.drop(columns=["Name"])

In [None]:
# Preview the first ten rows of the training frame.
df.head(10)

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Number of missing values in each column of the training frame.
df.isna().sum()

In [None]:
# Remove the Cabin column instead of imputing it.
df = df.drop(columns=["Cabin"])

In [None]:
# Use hot-deck imputation to fill the missing Age values in df
def hot_deck_imputation(df, target_variable, matching_variables, random_state=None):
    """Fill missing ``target_variable`` values by random hot-deck imputation.

    Rows are grouped by ``matching_variables``. Every row whose target value
    is missing receives the target value of a donor drawn at random (with
    replacement) from the rows of the same group. Donors are taken only from
    values that were present when the function was called, so a value written
    by this function is never reused as a donor.

    Rows whose group has no donor, and rows with a missing value in any of
    the matching variables, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    target_variable : str
        Name of the column whose missing values are filled.
    matching_variables : list of str
        Columns that must all be equal between a recipient and its donor.
    random_state : int or other seed accepted by numpy.random.default_rng, optional
        Seed for the donor draw. When None (the default) the draw uses
        NumPy's global random state, so ``np.random.seed`` controls it.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    keys = list(matching_variables)

    # Group a snapshot rather than the live frame, so the values written into
    # `df` below can never end up in a later donor pool.
    snapshot = df[keys + [target_variable]].copy()

    for _, group in snapshot.groupby(keys, sort=False):
        target = group[target_variable]
        recipients = target.index[target.isna()]
        donors = target.dropna().to_numpy()
        if len(recipients) == 0 or len(donors) == 0:
            # Nothing to fill in this group, or nobody to copy from.
            continue
        df.loc[recipients, target_variable] = rng.choice(donors, size=len(recipients))

    return df

# Seed NumPy's global random state so the random donor draws made during
# imputation are identical on every Restart & Run All.
np.random.seed(42)

target_variable = "Age"
# NOTE(review): Survived is the label the model predicts later; matching on it
# puts label information into the Age feature. Confirm this is intended.
matching_variables = ['Sex', "Survived","Pclass"]
# Perform hot-deck imputation (the frame passed in is modified in place and returned)
data_imputed = hot_deck_imputation(df, target_variable, matching_variables)
df = data_imputed

In [None]:
# Re-count missing values per column after the Age imputation.
df.isna().sum()

In [None]:
# Find the mode of the 'Embarked' column
mode_embarked = df['Embarked'].mode().values[0]

# Replace missing values with the mode. The result is assigned back to the
# column: calling fillna(inplace=True) on df['Embarked'] is chained
# assignment, which only modifies a temporary object (and leaves df
# unchanged) when pandas Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

In [None]:
# Re-count missing values per column after filling Embarked.
df.isna().sum()

In [None]:
# Preview the first five rows of the cleaned training frame.
df.head(5)

In [None]:
# lets check multicollinearity
import matplotlib.pyplot as plt
import seaborn as sns

# Create correlation matrix from the numeric columns only. Sex and Embarked
# are still strings at this point (they are encoded in a later cell), and
# pandas >= 2.0 raises on non-numeric columns unless numeric_only=True is
# passed (the argument exists since pandas 1.5).
corr_matrix = df.corr(numeric_only=True)

# Plot correlation matrix heatmap
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
ax.set_title("Correlation between numeric columns")
plt.show()

In [None]:
# Compute eigenvalues. The correlation matrix is symmetric, so use eigvalsh:
# it returns real eigenvalues in ascending order, whereas the general-purpose
# eig solver can return complex values.
eigenvalues = np.linalg.eigvalsh(np.asarray(corr_matrix, dtype=float))
smallest, largest = eigenvalues[0], eigenvalues[-1]

# Calculate condition number as sqrt(largest / smallest). The square root
# puts the value on the scale for which the ">30" rule of thumb below is
# usually quoted. A non-positive smallest eigenvalue means the matrix is
# numerically singular, reported here as infinity.
condition_number = np.sqrt(largest / smallest) if smallest > 0 else np.inf

print(condition_number)

# Eigenvalues or Condition Number:
#     Perform an eigenvalue decomposition of the correlation matrix or
#     calculate the condition number. Eigenvalues close to zero or
#     a large condition number (>30) suggest multicollinearity.

In [None]:
# Preview the training frame before the categorical columns are encoded.
df.head()

In [None]:
# Integer codes for the two categorical columns.
embarked_mapping = {'C': 0, 'Q': 1, 'S': 2}
Sex_mapping = {'male': 0, 'female' : 1}

# Replace each categorical column by its codes; a value that is missing from
# the mapping becomes NaN.
for column, codes in (('Embarked', embarked_mapping), ('Sex', Sex_mapping)):
    df[column] = df[column].map(codes)

In [None]:
# Preview the training frame after encoding Embarked and Sex.
df.head()

In [None]:
# Lets build Decision Tree
from sklearn.model_selection import train_test_split as tts

# Predictors and label for the classifier.
feature_columns = ["Age", "Sex", "Embarked", "Pclass"]
x, y = df[feature_columns], df["Survived"]

# The hold-out split below is disabled, so the training set is every row.
xtrain, ytrain = x, y
#xtrain,xtest,ytrain,ytest = tts(x,y,test_size = 0.2,random_state=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: scikit-learn permutes the features at every split, so
# without a seed two runs on the same data can produce different trees.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(xtrain, ytrain)

In [None]:
# from sklearn.metrics import accuracy_score
# print("Accuracy of training ", accuracy_score(ytrain,ypredtrain))
# print("Accuracy of testing ", accuracy_score(ytest,ypredtest))

In [3]:
# (rows, columns) of the frame loaded from test.csv.
# NOTE(review): the recorded output is (891, 12) and the preview in the next
# cell lists a Survived column, so test.csv looks like the labelled training
# data rather than an unlabelled test set. Confirm the file is the right one.
tf.shape

(891, 12)

In [4]:
# Preview the first rows of the frame loaded from test.csv.
tf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Number of missing values in each column of the test frame.
tf.isna().sum()

In [None]:
# missing values for Age in test
from sklearn.impute import KNNImputer

# Define the imputer
imputer = KNNImputer(n_neighbors=5)

# Prepare the data for imputation. Age has to be one of the columns:
# KNNImputer only fills gaps in the columns it is given, using the other
# columns to find each row's nearest neighbours.
imputation_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
imputation_data = tf[imputation_columns]

# Perform imputation. fit_transform returns one output column per input
# column, so pick out the Age column instead of assigning the whole matrix.
imputed_values = imputer.fit_transform(imputation_data)
tf['Age'] = imputed_values[:, imputation_columns.index('Age')]

In [None]:
# Remove the Cabin column. Name the axis explicitly (axis=True only worked
# because True == 1) and rebind instead of mutating in place.
tf = tf.drop(columns="Cabin")

In [None]:
# Re-count missing values per column of the test frame.
tf.isna().sum()

In [None]:
# Find the mode of the 'Embarked' column
modeembarked = tf['Embarked'].mode().values[0]

# Replace missing values with the mode. The result is assigned back to the
# column: calling fillna(inplace=True) on tf['Embarked'] is chained
# assignment, which only modifies a temporary object (and leaves tf
# unchanged) when pandas Copy-on-Write is active.
tf['Embarked'] = tf['Embarked'].fillna(modeembarked)

In [None]:
# Re-count missing values per column after filling Embarked.
tf.isna().sum()

In [None]:
# Integer codes for the two categorical columns of the test frame.
embarkedmapping = {'C': 0, 'Q': 1, 'S': 2}
Sexmapping = {'male': 0, 'female' : 1}

# Replace each categorical column by its codes; a value that is missing from
# the mapping becomes NaN.
for column, codes in (('Embarked', embarkedmapping), ('Sex', Sexmapping)):
    tf[column] = tf[column].map(codes)

In [None]:
# Select the predictor columns from the test frame.
prediction_columns = ["Age", "Sex", "Embarked", "Pclass"]
xt = tf[prediction_columns]

In [None]:
# Predict a label for every row of the test features with the fitted tree.
ytp = dtc.predict(xt)

In [None]:
# Pair each PassengerId with its predicted label and write the table to CSV
# without the index column.
submission_columns = {'PassengerId': tf['PassengerId'], 'Survived': ytp}
results = pd.DataFrame(submission_columns)
results.to_csv('Full_gender_submission.csv', index=False)

In [None]:
# (rows, columns) of the test frame after cleaning.
tf.shape

In [None]:
# Keep only the first 418 prediction rows and write them to a second CSV.
# NOTE(review): 418 is a hard-coded row count; the rows kept are simply the
# first 418 of `results`. Confirm they are the passengers the submission needs.
results_418 = results.iloc[:418]
results_418.to_csv('gender_submission.csv', index=False)

In [None]:
# (rows, columns) of the truncated results table.
results_418.shape