In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

from my_tools import train_valid_test_split

In [2]:
# Read the preprocessed CSVs, then hand both frames to the project helper,
# which returns the train / validation / test pieces unpacked below.
# NOTE(review): the split logic lives in my_tools and is not visible here.
test_data = pd.read_csv('../data/preprocessed_test.csv')
train_data = pd.read_csv('../data/preprocessed_train.csv')
(X_train, y_train,
 X_valid, y_valid,
 X_test) = train_valid_test_split(train_data, test_data)

## Linear Regression (Score: 0.76315)

In [3]:
# Fit an ordinary least-squares regressor on the training split.
# The model is a regressor, so its raw outputs are unbounded real numbers;
# they are turned into 0/1 labels by the thresholding cell that follows.
model = LinearRegression()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

# Summarise the raw validation predictions: count, smallest, largest.
# ndarray.min()/.max() reduce the whole array in one vectorised call, whereas
# the built-in min()/max() iterate it row by row in Python and only work while
# every row holds a single value. float() keeps the displayed tuple as plain
# Python numbers regardless of the NumPy scalar repr.
len(y_valid_pred), float(y_valid_pred.min()), float(y_valid_pred.max())

(179, -0.1744847788321675, 1.1424559410361828)

In [4]:
# Convert the raw regression outputs into hard class labels: 1 when the score
# is at least 0.5, otherwise 0.
# One vectorised comparison replaces the two sequential in-place masked writes,
# so the result no longer depends on the order of the writes. The original
# dtype is kept so the scoring cell receives the same kind of array as before.
y_train_pred = (y_train_pred >= 0.5).astype(y_train_pred.dtype)
y_valid_pred = (y_valid_pred >= 0.5).astype(y_valid_pred.dtype)

# Summary after thresholding; min/max are 0.0 and 1.0 when both classes occur.
# ndarray.min()/.max() are used instead of the built-in min()/max(), which
# iterate the array row by row and only work for single-value rows.
len(y_valid_pred), float(y_valid_pred.min()), float(y_valid_pred.max())

(179, 0.0, 1.0)

In [5]:
# Display the fitted parameters: the per-feature weights and the bias term
(model.coef_, model.intercept_)

(array([[-1.92987502e-01, -4.92415848e-01, -8.26914063e-03,
         -6.11797479e-02, -1.90656525e-02,  3.27194071e-04,
          1.07197092e-02,  1.48285966e-02, -2.55483058e-02]]),
 array([1.42725482]))

In [6]:
# Report classification accuracy of the thresholded predictions on both splits
print(f"Training Score: {accuracy_score(y_train, y_train_pred)}")
print(f"Validation Score: {accuracy_score(y_valid, y_valid_pred)}")

Training Score: 0.8089887640449438
Validation Score: 0.7932960893854749


## Create Submission File

In [7]:
# Score the held-out test features with the fitted model. These are still raw
# regression outputs, not 0/1 labels.
y_test_pred = model.predict(X_test)

# Summarise the raw test predictions: count, smallest, largest.
# ndarray.min()/.max() reduce the array in one vectorised call; the built-in
# min()/max() iterate it row by row and only work while each row holds a
# single value. float() keeps the displayed tuple as plain Python numbers.
len(y_test_pred), float(y_test_pred.min()), float(y_test_pred.max())

(418, -0.29438731791342776, 1.0642795091276849)

In [8]:
# The submission needs a hard label per passenger: did not survive (0) or
# survived (1). Scores of at least 0.5 become 1, everything else becomes 0.
# One vectorised comparison replaces the two sequential in-place masked writes;
# the original dtype is kept so the later integer cast behaves as before.
y_test_pred = (y_test_pred >= 0.5).astype(y_test_pred.dtype)

# Summary after thresholding; min/max are 0.0 and 1.0 when both classes occur.
# ndarray.min()/.max() are used instead of the built-in min()/max(), which
# iterate the array row by row and only work for single-value rows.
len(y_test_pred), float(y_test_pred.min()), float(y_test_pred.max())

(418, 0.0, 1.0)

In [9]:
# Assemble the two-column submission table in a single constructor call:
# passenger ids taken from the test frame, next to the predicted labels cast
# to 32-bit integers and flattened to one value per row.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_test_pred.astype('int32').ravel(),
})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [10]:
# Persist the submission table as CSV, omitting the DataFrame index column
submission.to_csv(
    path_or_buf='../submissions/linear_regression.csv',
    index=False,
)