In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
print("DATA QUALITY:")
print("-" * 50)

print(" COMPLETENESS CHECK:\n")

# Completeness Check (Missing Values)
missing_values = df.isna().sum()[df.isnull().sum() > 0]
missing_percent = (missing_values / len(df)) * 100

# Combine missing count and percentage into a DataFrame
missing_data = pd.DataFrame({
    "Missing Values": missing_values,
    "Percent Missing": missing_percent
})
missing_data = missing_data[missing_data["Missing Values"] > 0]
print(missing_data)
print("-"*50,'\n','DATA INFORAMTION:\n')
df.info()
print("-"*50,'\n','NUMBER OF DUPLICATED VALUES:', df.duplicated().sum())
print("-"*50,'\n','SOME NUMERICAL FEATURES\' SUMMARIES:\n')
df.describe()[["Age","Fare"]]

DATA QUALITY:
--------------------------------------------------
 COMPLETENESS CHECK:

          Missing Values  Percent Missing
Age                  177        19.865320
Cabin                687        77.104377
Embarked               2         0.224467
-------------------------------------------------- 
 DATA INFORAMTION:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 

Unnamed: 0,Age,Fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


In [3]:
for col in ["Pclass","SibSp","Parch","Embarked"]:
    print(f"{col} : ",df[col].unique())
print(df.groupby("Cabin")["PassengerId"].count().sort_values(ascending=False))
print(df.groupby("Embarked")["PassengerId"].count().sort_values(ascending=False))

Pclass :  [3 1 2]
SibSp :  [1 0 3 4 2 5 8]
Parch :  [0 1 2 5 3 4 6]
Embarked :  ['S' 'C' 'Q' nan]
Cabin
C23 C25 C27    4
G6             4
B96 B98        4
F2             3
C22 C26        3
              ..
C101           1
B94            1
B86            1
B82 B84        1
T              1
Name: PassengerId, Length: 147, dtype: int64
Embarked
S    644
C    168
Q     77
Name: PassengerId, dtype: int64


# Preprocessnig:

In [4]:
# Dropping irrelevant features:
df.drop(columns=["PassengerId", "Ticket"], inplace= True)

## 1. Handling some Missing Values: 

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
df.fillna({"Embarked": "S"}, inplace=True)

In [7]:
imputer_age = SimpleImputer(strategy='mean')
df["Age"] = imputer_age.fit_transform(df[["Age"]])

## 2. Handling Categorical / Textual variables:

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])

In [9]:
df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\.")
test["Title"] = test["Name"].str.extract(" ([A-Za-z]+)\.")

df["Cabin_Letter"] = df["Cabin"].str[0].fillna("M")
test["Cabin_Letter"] = test["Cabin"].str[0].fillna("M")

In [10]:
encoder = OneHotEncoder(drop="first", sparse_output=False)

encoded_embarked = encoder.fit_transform(df[["Embarked"]])
encoded_df = pd.DataFrame(encoded_embarked, columns=encoder.get_feature_names_out(["Embarked"]))
df = pd.concat([df, encoded_df], axis=1)

encoded_embarked = encoder.fit_transform(test[["Embarked"]])
encoded_df_test = pd.DataFrame(encoded_embarked, columns=encoder.get_feature_names_out(["Embarked"]))
test = pd.concat([test, encoded_df_test], axis=1)

In [11]:
all_titles_cab = set(df["Cabin_Letter"].unique()).union(set(test["Cabin_Letter"].unique()))

encoder = OneHotEncoder(categories=[list(all_titles_cab)], drop="first", sparse_output=False)

encoded_train = encoder.fit_transform(df[["Cabin_Letter"]])
encoded_df0 = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(["Cabin_Letter"]))
df = pd.concat([df, encoded_df0], axis=1)

encoded_cabin = encoder.fit_transform(test[["Cabin_Letter"]])
encoded_cabin_df = pd.DataFrame(encoded_cabin, columns=encoder.get_feature_names_out(["Cabin_Letter"]))
test = pd.concat([test, encoded_cabin_df], axis=1)


In [12]:
all_titles = set(df["Title"].unique()).union(set(test["Title"].unique()))

encoder = OneHotEncoder(categories=[list(all_titles)], drop="first", sparse_output=False)

encoded_train = encoder.fit_transform(df[["Title"]])
encoded_df0 = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(["Title"]))
df = pd.concat([df, encoded_df0], axis=1)

encoded_test = encoder.transform(test[["Title"]])
encoded_df1 = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(["Title"]))
test = pd.concat([test, encoded_df1], axis=1)

# Logistic Regression:

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [14]:
X = df.drop(columns=["Name", "Cabin","Survived", "Title", "Embarked","Cabin_Letter"]) 
y = df["Survived"]

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y , random_state=42)

In [16]:
model = LogisticRegression(max_iter=800)
model.fit(X, y)

In [17]:
y_pred = model.predict(X_val)

In [18]:
y_proba = model.predict_proba(X_val)

In [19]:
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_val, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       110
           1       0.83      0.77      0.80        69

    accuracy                           0.85       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.85      0.85      0.85       179



# Using the provided test data (after processing it as well):

In [20]:
test["Age"] = imputer_age.fit_transform(test[["Age"]])

In [21]:
test["Sex"] = le.fit_transform(test["Sex"])

In [22]:
test.fillna({"Fare":test["Fare"].mean()}, inplace=True)

In [23]:
X_test = test.drop(columns=["PassengerId","Name", "Cabin","Ticket", "Title", "Embarked","Cabin_Letter"]) 

In [24]:
predictions = model.predict(X_test)

In [25]:
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": predictions})
submission.to_csv("submission.csv", index=False)