In [1]:
#Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [27]:
#Read the training passengers from train.csv, decoding the file as ISO-8859-1
df = pd.read_csv(
    'train.csv',
    encoding='ISO-8859-1',
)

In [28]:
#Flag passengers whose Name contains the literal title "Mrs. " (married women).
#regex=False stops the "." from acting as a regex wildcard, and na=False maps a
#missing Name to False instead of leaving NaN in the flag column.
df['Mrs'] = df['Name'].str.contains('Mrs. ', regex=False, na=False)

In [29]:
#Flag passengers whose Name contains the literal title "Mr. " (adult men).
#regex=False stops the "." from acting as a regex wildcard, and na=False maps a
#missing Name to False instead of leaving NaN in the flag column.
df['Mr'] = df['Name'].str.contains('Mr. ', regex=False, na=False)

In [30]:
#Flag passengers whose Name contains the literal title "Master. " (male children).
#regex=False stops the "." from acting as a regex wildcard, and na=False maps a
#missing Name to False instead of leaving NaN in the flag column.
df['Master'] = df['Name'].str.contains('Master. ', regex=False, na=False)

In [31]:
#Flag passengers whose Name contains the literal title "Miss. " (single women).
#regex=False stops the "." from acting as a regex wildcard, and na=False maps a
#missing Name to False instead of leaving NaN in the flag column.
df['Miss'] = df['Name'].str.contains('Miss. ', regex=False, na=False)

In [32]:
#Flag passengers whose Name contains any of the four handled titles.
#The pattern stays a regex because it relies on "|" alternation, so each "." is
#escaped to match a literal dot; na=False maps a missing Name to False.
df['Aux_Names'] = df['Name'].str.contains(r'Master\. |Miss\. |Mr\. |Mrs\. ', na=False)

In [33]:
#Flag passengers who are female or who carry the "Master" title
has_master_title = df['Master'].eq(True)
is_female = df['Sex'].eq('female')
df['Female_Master'] = has_master_title | is_female

In [34]:
# Helper that turns a True/False flag into 1/0
def func_true_false(var):
    """Return 1 if `var` equals True, 0 if it equals False, and None otherwise."""
    for flag, code in ((True, 1), (False, 0)):
        if var == flag:
            return code
    return None

In [35]:
#Convert the Female_Master flag from True/False to 1/0
df['Female_Master'] = df['Female_Master'].map(func_true_false)

In [36]:
#Convert the Mrs flag from True/False to 1/0
df['Mrs'] = df['Mrs'].map(func_true_false)

In [37]:
#Convert the Mr flag from True/False to 1/0
df['Mr'] = df['Mr'].map(func_true_false)

In [38]:
#Convert the Miss flag from True/False to 1/0
df['Miss'] = df['Miss'].map(func_true_false)

In [39]:
#Convert the Master flag from True/False to 1/0
df['Master'] = df['Master'].map(func_true_false)

In [40]:
#Convert the Aux_Names flag from True/False to 1/0
df['Aux_Names'] = df['Aux_Names'].map(func_true_false)

In [41]:
#One indicator column per distinct value of Sex
dummies_sex = pd.get_dummies(data=df.Sex)

In [42]:
#Append the Sex indicator columns to the right of the dataset
df = pd.concat(
    objs=[df, dummies_sex],
    axis='columns',
)

In [43]:
#One indicator column per distinct value of Pclass (columns are named by the class value)
dummies_pclass = pd.get_dummies(data=df.Pclass)

In [44]:
#Append the Pclass indicator columns to the right of the dataset
df = pd.concat(
    objs=[df, dummies_pclass],
    axis='columns',
)

In [45]:
#Replace missing Fare values with the mean of the known fares
fare_mean = df['Fare'].mean()
df['Fare'] = df['Fare'].fillna(fare_mean)

In [46]:
#Within the Master group only, replace missing Age values with that group's mean age
master_rows = df['Master'] == 1
master_ages = df.loc[master_rows, 'Age']
df.loc[master_rows, 'Age'] = master_ages.fillna(master_ages.mean())

In [47]:
#Within the Miss group only, replace missing Age values with that group's mean age
miss_rows = df['Miss'] == 1
miss_ages = df.loc[miss_rows, 'Age']
df.loc[miss_rows, 'Age'] = miss_ages.fillna(miss_ages.mean())

In [48]:
#Within the Mrs group only, replace missing Age values with that group's mean age
mrs_rows = df['Mrs'] == 1
mrs_ages = df.loc[mrs_rows, 'Age']
df.loc[mrs_rows, 'Age'] = mrs_ages.fillna(mrs_ages.mean())

In [49]:
#Within the Mr group only, replace missing Age values with that group's mean age
mr_rows = df['Mr'] == 1
mr_ages = df.loc[mr_rows, 'Age']
df.loc[mr_rows, 'Age'] = mr_ages.fillna(mr_ages.mean())

In [50]:
#Any Age still missing after the per-title passes gets the overall mean age
overall_age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(overall_age_mean)

In [51]:
#Separating the feature matrix X and the target y for model creation.
#The Pclass indicator columns are labelled with the integers 1 and 3 while the
#other features have string labels; recent scikit-learn releases refuse a frame
#whose column labels mix types, so every label is cast to str here.
feature_columns = ['Age','SibSp','Parch','Fare','Mrs','Master','Mr',1,3,'Female_Master']
X = df[feature_columns].rename(columns=str)
y = df['Survived']

In [52]:
#Hold out 30% of the rows for evaluation; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [59]:
#Creating and training the Logistic Regression model.
#The solver is pinned to 'liblinear', the solver shown in this cell's recorded
#output, because newer scikit-learn releases pick a different default solver
#when none is given.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
#Predict survival for the held-out rows produced by the train/test split
y_pred = logreg.predict(X=X_test)

In [61]:
#Share of held-out passengers whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.8731343283582089


In [259]:
#Scoring the Kaggle test.csv passengers.
#At this point `df` still holds the training data read from train.csv, so
#test.csv is loaded here and given the same engineered columns before
#predicting. `df` is rebound to the prepared test frame because the following
#cells store the predictions in `df` and export PassengerId from it.
def _prepare_features(frame):
    """Return a copy of `frame` with the engineered model columns added.

    Adds the 0/1 title flags (Mrs, Mr, Master, Miss), the Female_Master flag
    and the Pclass indicator columns, then fills missing Fare and Age values
    using means computed on `frame` itself.
    """
    frame = frame.copy()
    titles = ('Mrs', 'Mr', 'Master', 'Miss')

    # Literal match on "<title>. "; a missing Name counts as "no title".
    for title in titles:
        has_title = frame['Name'].str.contains(title + '. ', regex=False, na=False)
        frame[title] = has_title.astype(int)

    frame['Female_Master'] = (
        (frame['Master'] == 1) | (frame['Sex'] == 'female')
    ).astype(int)

    # Indicator columns are labelled with the raw Pclass values (1, 2, 3).
    frame = pd.concat([frame, pd.get_dummies(frame['Pclass'])], axis=1)

    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())

    # Fill Age per title group first, then fall back to the overall mean.
    for title in ('Master', 'Miss', 'Mrs', 'Mr'):
        in_group = frame[title] == 1
        group_ages = frame.loc[in_group, 'Age']
        frame.loc[in_group, 'Age'] = group_ages.fillna(group_ages.mean())
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())

    return frame


df = _prepare_features(pd.read_csv('test.csv', encoding='ISO-8859-1'))

# Column labels are cast to str because the Pclass indicators are labelled with
# integers and recent scikit-learn releases refuse mixed-type column labels.
X_test = df[['Age','SibSp','Parch','Fare','Mrs','Master','Mr',1,3,'Female_Master']].rename(columns=str)
y_pred = logreg.predict(X_test)

In [260]:
#Store the predicted labels in the Survived column of the frame
df = df.assign(Survived=y_pred)

In [261]:
#Write only PassengerId and Survived, without the index, as the Kaggle submission file
df.to_csv(
    'gender_submission.csv',
    columns=['PassengerId','Survived'],
    index=False,
)