In [16]:
import pandas as pd
train_data: pd.DataFrame = pd.read_csv("./train.csv")

---

# Look at the data

* **Cabin** has too many missing values.
* **Ticket** has too many unique values.

In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Gender       891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train_data.corr(numeric_only= True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [6]:
len(train_data["Ticket"].unique())

681

In [18]:
train_data["Cabin"].isna().sum()

687

---

# **Feature Engineering**

#### lets look at **ONE-HOT ENCODING**

In [8]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer

cat_columns: list[str] = ["Gender", "Embarked"]
num_columns: list[str] = ["Pclass", "Age", "SibSp", "Parch"]

# Create a pipeline for the categorical columns using OneHotEncoder
cat_pipeline: Pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


# Create a pipeline for the numerical columns using MinMaxScaler
num_pipeline: Pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler())


In [9]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")).fit_transform(train_data[cat_columns]).toarray()

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [11]:
from sklearn.compose import ColumnTransformer

preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_columns),
    ("cat", cat_pipeline, cat_columns),
])

In [12]:
train_data = pd.read_csv("./train.csv")
train_data = train_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis= 1)
X = train_data.drop('Survived', axis= 1)
y = train_data['Survived']

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, classification_report

log_reg = LogisticRegression()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)
X_train = preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)

log_reg.fit(X_train, y_train)

predictions = log_reg.predict(X_test)

log_loss_ = log_loss(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

print(f"Log loss: {log_loss_}")
print(f"Accuracy score: {acc_score}")
print(classification_report(y_test, predictions))

Log loss: 7.450364108364997
Accuracy score: 0.7932960893854749
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       105
           1       0.76      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.79      0.79      0.79       179



---

# Prediction

In [22]:
test_data: pd.DataFrame = pd.read_csv("./test.csv")
test = test_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis= 1)
test = preprocessing.transform(test)
test_predictions = log_reg.predict(test)

In [24]:
log_reg.predict(test)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [23]:
log_reg.predict_proba(test)

array([[0.87994623, 0.12005377],
       [0.5211397 , 0.4788603 ],
       [0.8325015 , 0.1674985 ],
       [0.8936797 , 0.1063203 ],
       [0.42021351, 0.57978649],
       [0.86736977, 0.13263023],
       [0.33466035, 0.66533965],
       [0.80661282, 0.19338718],
       [0.23692446, 0.76307554],
       [0.91186709, 0.08813291],
       [0.89550028, 0.10449972],
       [0.65856786, 0.34143214],
       [0.0981843 , 0.9018157 ],
       [0.88758826, 0.11241174],
       [0.14751873, 0.85248127],
       [0.14047179, 0.85952821],
       [0.74690478, 0.25309522],
       [0.81465209, 0.18534791],
       [0.42518337, 0.57481663],
       [0.34336665, 0.65663335],
       [0.61300426, 0.38699574],
       [0.86500924, 0.13499076],
       [0.09255513, 0.90744487],
       [0.42993647, 0.57006353],
       [0.11517814, 0.88482186],
       [0.93904637, 0.06095363],
       [0.05442217, 0.94557783],
       [0.81898516, 0.18101484],
       [0.63654131, 0.36345869],
       [0.8742724 , 0.1257276 ],
       [0.

In [15]:
submission = pd.DataFrame(columns= ["PassengerId", "Survived"])
submission["PassengerId"] = test_data["PassengerId"]
submission["Survived"] = test_predictions

submission.to_csv("submission.csv", index= False)