# Linear Regression

In [2]:
import numpy as np
import pandas as pd

import random

In [3]:
pd.options.mode.copy_on_write = True

In [4]:
train = pd.read_csv("train_eda.csv")
test = pd.read_csv("test_eda.csv")

In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_L,Cabin_N,Title,Surname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,,Mr.,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,85.0,Mrs.,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,,Miss.,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C,123.0,Mrs.,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,,Mr.,Allen


## Preprocessing

In [6]:
# From the EDA notebook
removeable_columns = ["PassengerId", "Survived"]
categorical_columns = ["Pclass", "Sex", "Parch", "Embarked", "SibSp", 'Cabin_L'] 
continuous_columns = ["Age", "Fare"]
discrete_columns = ["Cabin_N"]

categorical_columns.append("Title")

feature_columns = categorical_columns + continuous_columns + discrete_columns

In [7]:
# Separate inputs and outputs
y = train["Survived"]
X, X_test = train.drop(columns=removeable_columns, axis=1), test.drop(columns=["PassengerId"], axis=1)

In [8]:
# Shuffle the data
idx = [i for i in range(len(X))]
random.shuffle(idx)

# 80/20 Split for train/val
val_size = 0.2
t_id = int((1-val_size)*len(X))
X_train, X_val = X[:t_id], X[t_id:]
y_train, y_val = y[:t_id], y[t_id:]


In [9]:
continuous_columns + discrete_columns

['Age', 'Fare', 'Cabin_N']

In [20]:
# Impute Categorical Columns
val = "Unknown"
values = [val for val in categorical_columns]
fillna_arg = dict(zip(categorical_columns, values))
X_train.fillna(fillna_arg, inplace=True)
X_val.fillna(fillna_arg, inplace=True)
X_test.fillna(fillna_arg, inplace=True)

# Impute Numerical Columns
numerical_columns = continuous_columns+discrete_columns
vals = X_train[numerical_columns].median()
fillna_arg = dict(zip(numerical_columns, vals))
X_train.fillna(fillna_arg, inplace=True)
X_val.fillna(fillna_arg, inplace=True)
X_test.fillna(fillna_arg, inplace=True)

missing_count = X_train[feature_columns].isna().sum().sum() + X_val[feature_columns].isna().sum().sum() + X_test[feature_columns].isna().sum().sum()

assert missing_count == 0, "Missing Values Need to be handled"

In [23]:
X_train[feature_columns].head()

Unnamed: 0,Pclass,Sex,Parch,Embarked,SibSp,Cabin_L,Title,Age,Fare,Cabin_N
0,3,male,0,S,1,Cabin_L,Mr.,22.0,7.25,38.0
1,1,female,0,C,1,C,Mrs.,38.0,71.2833,85.0
2,3,female,0,S,0,Cabin_L,Miss.,26.0,7.925,38.0
3,1,female,0,S,1,C,Mrs.,35.0,53.1,123.0
4,3,male,0,S,0,Cabin_L,Mr.,35.0,8.05,38.0


In [None]:
# 