In [37]:
#Student id 1913617
# importing libraries which we will use in future
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn import linear_model
from sklearn import ensemble

In [92]:
# Reading training data
train = pd.read_csv('train.csv') #You should change the path to dataset in your computer
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
# Reading our testing data 
test = pd.read_csv('test.csv') #You should change the path to dataset in your computer
test.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [94]:
# Dropping unused coloumns 
train.drop(columns=['Name','Cabin','Fare','Ticket'], inplace=True)
test.drop(columns=['Name','Cabin','Ticket','Fare'], inplace=True)

In [95]:
# Checking DataFrame for null values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Embarked         2
dtype: int64

In [96]:
# Creating function for doing some major cleanings. Changing embarked coloumn values to numeric , filling Age coloumn null values, changing Sex coloumn values into numeric
# And last dropping all null values.
embarked_port = {"S": 0, "C": 1, "Q": 2}
def clean_data(data):
    data["Age"] = data["Age"].fillna(data["Age"].dropna().median())
    data.loc[data["Sex"] == "male", "Sex"] = 0
    data.loc[data["Sex"] == "female", "Sex"] = 1
    data['Embarked'] = data['Embarked'].map(embarked_port)
train.dropna(inplace=True)

In [97]:
# Re-checking dataframe for null values
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Embarked       0
dtype: int64

In [98]:
# Using our data cleaning function
# Picking up our features for training our model
# And picking our target which should be predicted
clean_data(train)
clean_data(test)
feature_names = train[['Pclass','Age','Sex','SibSp','Embarked','Parch']].values 
target = train['Survived'].values

In [99]:
# Training machine learning algorithm. Doing linear logistic regression
# Fitting our values
# printing score of our algorithm
logistic = linear_model.LogisticRegression()
logistic.fit(feature_names, target)
print(logistic.score(feature_names, target))

0.797752808988764




In [100]:
# Cross validation of our score
scores = model_selection.cross_val_score(logistic, feature_names, target, scoring='accuracy', cv=10)
print(scores)
print(scores.max())

[0.80555556 0.73611111 0.79166667 0.86111111 0.73239437 0.78873239
 0.77464789 0.77464789 0.82857143 0.8       ]
0.8611111111111112




In [101]:
# Using Random Forest classifier
# Splitting our training dataframe into splits for better performance and to avoid overtraining
# Printing our classifier score
forest = ensemble.RandomForestClassifier(
    max_depth = 7,
    min_samples_split = 4,
    n_estimators = 1000,
    random_state = 1,
    n_jobs = -1
)

X_train,X_test,y_train,y_test = model_selection.train_test_split(feature_names,target,test_size=0.2,random_state=0)
forest = forest.fit(X_train, y_train)
print(forest.score(feature_names, target))

0.8679775280898876


In [102]:
# printing cross validation scores
scores = model_selection.cross_val_score(forest, feature_names, target, scoring='accuracy', cv=10)
print(scores)
print(scores.max())

[0.73611111 0.75       0.76388889 0.90277778 0.8028169  0.78873239
 0.81690141 0.76056338 0.88571429 0.84285714]
0.9027777777777778


In [73]:
# defining our test features
# Testing our Random forest classifier. Making a prediction
test_features_forest = test[["Pclass", "Age", "Sex","SibSp", "Parch", "Embarked"]].values
prediction_forest = forest.predict(test_features_forest)

In [74]:
# Creating a function for saving our predictions in a new file
def write_prediction(prediction, name):
    PassengerId = np.array(test["PassengerId"]).astype(int)
    solution = pd.DataFrame(prediction, PassengerId, columns = ["Survived"])
    solution.to_csv(name, index_label = ["PassengerId"])

In [75]:
# Saving our predictions 
write_prediction(prediction_forest, "submission.csv")