## Data Pre-Processing

In [42]:
# Import modules
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importing dataset
dataset = pd.read_csv('datasets/full.csv')

# Separate the feature matrix (X) from the dependent vector (y) and drop the
# columns that are not used as predictors.
X = dataset.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])
y = dataset['Survived']

# Handle missing values
## Rows where every feature is missing are removed from X *and* y, so the two
## stay row-aligned after the index is rebuilt.
has_any_value = X.notna().any(axis=1)
X = X.loc[has_any_value].reset_index(drop=True)
y = y.loc[has_any_value].reset_index(drop=True)

## Numeric gaps are filled with the column mean (numeric_only is required on
## recent pandas because X still holds string columns here); categorical gaps
## are filled with the most frequent value. Assigning the result back avoids
## chained in-place fills, which are not guaranteed to reach X.
X = X.fillna(X.mean(numeric_only=True))
for col in ['Embarked', 'Sex']:
    X[col] = X[col].fillna(X[col].value_counts().index[0])

# Encode categorical data. dtype=float keeps the whole design matrix numeric
# (recent pandas would otherwise emit bool dummies).
dummy_vars = ['Sex', 'Embarked']
X_categorical = pd.get_dummies(X[dummy_vars], dtype=float)
X = pd.concat([X_categorical, X.drop(columns=dummy_vars)], axis=1, sort=False)

# Avoiding the dummy variable trap: one dummy per category is overwritten with
# the constant 1 so it acts as an intercept term, and both are moved to the
# front of the frame.
# NOTE(review): this yields two identical constant columns, which are
# collinear with each other — consider keeping a single intercept column.
constant_cols = ['Embarked_S', 'Sex_male']
X[constant_cols] = 1.0
cols = constant_cols + [c for c in X.columns if c not in constant_cols]
X = X.reindex(columns=cols)

# Splitting Training and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows.
ss_X = StandardScaler()

cols_to_scale = ['Age', 'SibSp', 'Parch', 'Fare']

X_train_scaling = X_train[cols_to_scale]
X_test_scaling = X_test[cols_to_scale]

X_train_not_scaling = X_train.drop(columns=cols_to_scale)
X_test_not_scaling = X_test.drop(columns=cols_to_scale)

# The scaler returns a bare array; re-attach the split's own index so the
# column-wise concat below lines rows up instead of producing NaN padding.
X_train_scaled = pd.DataFrame(ss_X.fit_transform(X_train_scaling),
                              columns=cols_to_scale, index=X_train.index)
X_test_scaled = pd.DataFrame(ss_X.transform(X_test_scaling),
                             columns=cols_to_scale, index=X_test.index)

# Recombine using the *scaled* columns (the unscaled ones were used before,
# which silently discarded the scaling step).
X_train = pd.concat([X_train_not_scaling, X_train_scaled], axis=1, sort=False)
X_test = pd.concat([X_test_not_scaling, X_test_scaled], axis=1, sort=False)


## Linear Regression

### Multiple-Linear Regression

In [63]:
def backwardElimination(x, y, sl):
    """Drop predictors one at a time until every remaining p-value is <= sl.

    On each pass an OLS model is fitted on the current columns of ``x``; the
    column with the largest p-value is removed if that p-value exceeds ``sl``,
    otherwise the search stops. Progress and the summary of the last fitted
    model are printed.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate predictors, one per column. Not modified; columns are
        dropped on a new frame.
    y : array-like
        Dependent variable passed to the OLS fit as ``endog``.
    sl : float
        Significance level a predictor must meet to stay in the model.

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to the predictors that were kept.
    """
    # NOTE(review): `lm` is not imported in the visible cells — presumably
    # statsmodels.regression.linear_model; confirm it is imported up front.
    print('--------------------------')
    print('Started backwardElimination:')
    print('--------------------------')
    ols = None  # stays None when x has no columns, so there is nothing to summarise
    for _ in range(x.shape[1]):
        # 2. Fit the model with all predictors, # 5. Fit the model without the highest P-value predictor
        ols = lm.OLS(endog=y, exog=x).fit()
        # 3. Consider the predictor with the highest P-value, If P > SL, go to 4, otherwise done.
        # Re-key the p-values by column name so the lookup is label-based
        # (integer positions on a label-indexed Series are deprecated);
        # undefined (NaN) p-values are ignored, as a plain `>` comparison would.
        pvalues = pd.Series(list(ols.pvalues), index=x.columns).dropna()
        if pvalues.empty or pvalues.max() <= sl:
            print('All predictors pvalues < sl({}) ... Done'.format(sl))
            break
        # 4. Remove the predictor
        worst_predictor = pvalues.idxmax()
        print('Removing Predictor: {}'.format(worst_predictor))
        x = x.drop(columns=worst_predictor)
    if ols is not None:
        print(ols.summary())
    print('--------------------------')
    print('Ended backwardElimination:')
    print('--------------------------')
    return x

# 1. Select a significance level to stay in the model --> SL = 0.05
SL = 0.05
X_train_be = backwardElimination(X_train, y_train, SL)

# Keep the same surviving predictors on the test side. Both test objects get a
# fresh positional index (by rebinding, not in-place mutation) so truth and
# predictions can be compared row by row.
X_test_be = X_test[X_train_be.columns.values].reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_be, y_train)

# Predicting the Test set results. The regression output is continuous, so it
# is rounded to the nearest class and clipped to the valid labels 0/1
# (otherwise a prediction such as 1.6 would become the non-existent class 2).
y_test_pred = pd.Series(lr.predict(X_test_be), name='Survived').round(0).clip(0, 1)

# Determine Prediction Accuracy
correct_count = int((y_test.to_numpy() == y_test_pred.to_numpy()).sum())
# Scale to a percentage before rounding to avoid float artefacts such as
# 56.99999999999999.
print('prediction accuracy: {}%'.format(round(100 * correct_count / len(y_test), 2)))


--------------------------
Started backwardElimination:
--------------------------
Removing Predictor: Parch
Removing Predictor: Embarked_Q
Removing Predictor: Embarked_C
Removing Predictor: Fare
All predictors pvalues < sl(0.05) ... Done
                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.493
Model:                            OLS   Adj. R-squared:                  0.491
Method:                 Least Squares   F-statistic:                     252.8
Date:                Thu, 09 Apr 2020   Prob (F-statistic):          8.49e-152
Time:                        21:42:36   Log-Likelihood:                -362.71
No. Observations:                1047   AIC:                             735.4
Df Residuals:                    1042   BIC:                             760.2
Df Model:                           4                                         
Covariance Type:            nonrobust             