In [1]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test sets from CSV files in the working directory.
data, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Keep the test-set passenger ids aside: clean() drops the PassengerId column,
# and the ids are needed again when the submission frame is built.
ids = test.loc[:, 'PassengerId']

In [3]:
def clean(data, reference=None):
    """Return a model-ready copy of a passenger frame.

    Steps applied:
      * drop the 'Ticket', 'Cabin', 'Name' and 'PassengerId' columns;
      * encode 'Sex' as 0 (male) / 1 (female);
      * fill missing 'Age' / 'Fare' with the rounded column mean, then divide
        by the column maximum;
      * fill missing 'Embarked' with the unknown token 'U' and encode it as
        S=0, C=1, Q=2, U=3.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    reference : pandas.DataFrame, optional
        Frame whose 'Age' and 'Fare' columns supply the fill values and the
        scaling maxima. Defaults to ``data`` itself, which reproduces the
        original behaviour. Pass the training frame when cleaning the test
        frame so that both end up on the same scale; otherwise each frame is
        normalised by its own mean/max and the same raw value maps to
        different feature values in train and test.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. 'Sex' / 'Embarked' values that are not keys of the
        encoding maps become NaN (behaviour of ``Series.map`` with a dict).
    """
    # Essentially a label encoder
    # (can also be done with sklearn.preprocessing.LabelEncoder()).
    gender_map = {'male': 0, 'female': 1}
    embark_map = {'S': 0, 'C': 1, 'Q': 2, 'U': 3}

    stats_source = data if reference is None else reference

    # The maximum is taken *after* filling, exactly as the original
    # fill-then-divide sequence did, so the default path is unchanged.
    age_fill = round(stats_source['Age'].mean())
    age_scale = stats_source['Age'].fillna(age_fill).max()
    fare_fill = round(stats_source['Fare'].mean())
    fare_scale = stats_source['Fare'].fillna(fare_fill).max()

    cleaned = data.drop(['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
    cleaned['Sex'] = cleaned['Sex'].map(gender_map)
    cleaned['Age'] = cleaned['Age'].fillna(age_fill) / age_scale
    cleaned['Fare'] = cleaned['Fare'].fillna(fare_fill) / fare_scale

    # Fill missing ports with the unknown token before encoding.
    cleaned['Embarked'] = cleaned['Embarked'].fillna('U').map(embark_map)

    return cleaned

In [4]:
# Run the same preprocessing over the training frame and the test frame.
cleaned_data, cleaned_test = (clean(frame) for frame in (data, test))

In [5]:
# Separate the target column from the feature matrix.
y = cleaned_data['Survived']
X = cleaned_data.drop(columns=['Survived'])

# Stochastic Gradient Descent

In [6]:
# Hold out part of the training data for evaluation. The split is seeded so
# that Restart & Run All yields the same partition (and therefore the same
# scores) every time; previously only the classifier was seeded.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, random_state = 343)

# Linear classifier trained with stochastic gradient descent.
sgd_clf = SGDClassifier(random_state = 343)
sgd_clf.fit(X_train1, y_train1)

SGDClassifier(random_state=343)

In [7]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the SGD classifier on the training split.
cross_val_score(
    estimator=sgd_clf,
    X=X_train1,
    y=y_train1,
    scoring='accuracy',
    cv=3,
)

array([0.78923767, 0.75336323, 0.44594595])

# Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Rebuild target and features so this cell does not rely on the SGD section.
y = cleaned_data['Survived']
X = cleaned_data.drop(['Survived'], axis = 1)

# Seed the split so the held-out accuracy reported below is reproducible
# across kernel restarts; previously only the estimator was seeded.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state = 343)

log_reg = LogisticRegression(random_state = 343, max_iter = 10000)
log_reg.fit(X_train2,y_train2)

# Predictions on the held-out split, scored in the next cell.
predict = log_reg.predict(X_test2)


In [9]:
from sklearn.metrics import accuracy_score

# Fraction of held-out passengers whose survival was predicted correctly.
accuracy_score(y_true=y_test2, y_pred=predict)

0.7937219730941704

In [10]:
# Predict survival for the cleaned test set and pair each prediction with its
# passenger id. (cleaned_test.isnull().sum() can be used beforehand to confirm
# that no missing values remain.)
submission = log_reg.predict(cleaned_test)
df = pd.DataFrame(
    {
        'PassengerId': ids.values,
        'Survived': submission,
    }
)

In [11]:
# Write the submission file without the DataFrame index column.
df.to_csv(path_or_buf='submission.csv', index=False)