# Pipelined solutions to titanic
Check out the EDA notebook

In [17]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Linear Regression 

In [18]:
train = pd.read_csv("train_eda.csv")
test = pd.read_csv("test_eda.csv")

In [19]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_L,Cabin_N,Title,Surname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,,Mr.,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,85.0,Mrs.,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,,Miss.,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C,123.0,Mrs.,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,,Mr.,Allen


In [20]:
# From the EDA notebook
removeable_columns = ["PassengerId", "Survived"]
categorical_columns = ["Pclass", "Sex", "Parch", "Embarked", "SibSp", 'Cabin_L'] 
continuous_columns = ["Age", "Fare"]
discrete_columns = ["Cabin_N"]

categorical_columns.append("Title")


y = train["Survived"]
X, X_test = train.drop(columns=removeable_columns, axis=1), test.drop(columns=["PassengerId"], axis=1)

In [21]:
set(train.columns) - set(removeable_columns) - set(categorical_columns) - set(continuous_columns) - set(discrete_columns)

{'Cabin', 'Name', 'Surname', 'Ticket'}

## Preprocessing Schema:
this will not be strict 

    - Surname, Ticket, Name, Cabin can be ignored.
    
    Data Imputing:
        - Cabin_N mean will be used to fill in the missing values.  
        - New value for categorical columns (only Embarked has missing values)
        - Mean for continuous data



In [22]:
# Model Definition

continuous_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

discrete_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ("con", continuous_transformer, continuous_columns),
        ("cat", categorical_transformer, categorical_columns),
        ("dis", discrete_transformer, discrete_columns)
    ]
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LinearRegression())
])

In [23]:
# Train/Val Subset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model.fit(X_train, y_train)

# CV 
neg_mses = cross_val_score(model, X, y, 
                        scoring="neg_mean_squared_error",
                        cv=5)
rmse_scores = np.sqrt(-neg_mses)
cv_score = rmse_scores.mean()
print(f"Cross-validation accuracy: {cv_score:.3f}")

# 80/20 Eval
y_val_hat = model.predict(X_val)
score = mean_squared_error(y_val_hat, y_val)
score = score ** (1/2)
print(f"80/20 Validation accuracy:: {score:.3f}")

Cross-validation accuracy: 0.375
80/20 Validation accuracy:: 0.381


In [24]:
preprocessor.fit_transform(X_train).shape

(712, 47)

In [25]:
X_train.shape

(712, 14)