In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Linear Regression (Score: 0.76555)

In [2]:
# Load training data
train_data = pd.read_csv('../data/preprocessed_train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,22.0,1,0,7.25,0,0,1
1,2,1,1,0,38.0,1,0,71.2833,1,0,0
2,3,1,3,0,26.0,0,0,7.925,0,0,1
3,4,1,1,0,35.0,1,0,53.1,0,0,1
4,5,0,3,1,35.0,0,0,8.05,0,0,1


In [3]:
# Prepare input values
X = train_data.drop(["PassengerId", "Survived"], axis=1)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.0,1,0,7.25,0,0,1
1,1,0,38.0,1,0,71.2833,1,0,0
2,3,0,26.0,0,0,7.925,0,0,1
3,1,0,35.0,1,0,53.1,0,0,1
4,3,1,35.0,0,0,8.05,0,0,1


In [4]:
# Prepare output values
y = train_data[["Survived"]]
y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [5]:
# Split training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [6]:
# Training linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
len(y_pred), min(y_pred)[0], max(y_pred)[0]

(179, -0.11896379067986507, 1.0860832231227013)

In [7]:
# Linear regression model coefficients and intercept
model.coef_, model.intercept_

(array([[-1.91611786e-01, -4.82346990e-01, -7.34796421e-03,
         -5.85752925e-02, -1.15538639e-02,  2.73046320e-04,
          2.64880945e-02,  2.56913049e-02, -5.21793994e-02]]),
 array([1.40496285]))

In [8]:
# Get model score
model.score(X_train, y_train)

0.42039193903707606

## Create Submission File

In [9]:
# Load test data
test_data = pd.read_csv('../data/preprocessed_test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,34.5,0,0,7.8292,0,1,0
1,893,3,0,47.0,1,0,7.0,0,0,1
2,894,2,1,62.0,0,0,9.6875,0,1,0
3,895,3,1,27.0,0,0,8.6625,0,0,1
4,896,3,0,22.0,1,1,12.2875,0,0,1


In [10]:
# Prepare input value
X = test_data.drop("PassengerId", axis=1)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [11]:
# Get predictions for test data
y_pred = model.predict(X)
len(y_pred), min(y_pred)[0], max(y_pred)[0]

(418, -0.28366406973782854, 1.0835532788580509)

In [12]:
# We need either didn't survived (0) or survive (1)
y_pred[y_pred < 0.5] = 0
y_pred[y_pred >= 0.5] = 1
len(y_pred), min(y_pred)[0], max(y_pred)[0]

(418, 0.0, 1.0)

In [13]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data['PassengerId']
submission['Survived'] = y_pred.astype('int32')
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
# Create submission file
submission.to_csv('../submissions/linear_regression.csv', index=False)