In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train_data = pd.read_csv('C:\\Users\\user\\Downloads\\train.csv')
test_data = pd.read_csv('C:\\Users\\user\\Downloads\\test.csv')

train_data['famsize'] = train_data['SibSp'] + train_data['Parch'] 
test_data['famsize'] =  test_data['SibSp'] + test_data['Parch']

train_data = train_data.drop(['PassengerId', 'Name', 'Ticket', 'Embarked', 'Cabin', 'SibSp', 'Parch'], axis=1)
test_data = test_data.drop([ 'Name', 'Ticket', 'Embarked', 'Cabin', 'SibSp', 'Parch'], axis=1)
numeric_features = train_data.select_dtypes(include=[np.number])

print(numeric_features.dtypes)
print(train_data.corr())
train_data.head()

Survived      int64
Pclass        int64
Age         float64
Fare        float64
famsize       int64
dtype: object
          Survived    Pclass       Age      Fare   famsize
Survived  1.000000 -0.338481 -0.077221  0.257307  0.016639
Pclass   -0.338481  1.000000 -0.369226 -0.549500  0.065997
Age      -0.077221 -0.369226  1.000000  0.096067 -0.301914
Fare      0.257307 -0.549500  0.096067  1.000000  0.217138
famsize   0.016639  0.065997 -0.301914  0.217138  1.000000


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,famsize
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,1
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,1
4,0,3,male,35.0,8.05,0


In [2]:
train_data['Age'] =train_data['Age'].fillna(-0.5)
test_data['Age'] = test_data['Age'].fillna(-0.5)


def process_age(df,cut_points,label_names):
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

cut_points = [-1,0,5,12,18,35,60,100]
label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
train_data = process_age(train_data,cut_points,label_names)
test_data = process_age(test_data,cut_points,label_names)


In [3]:
def process_fare(df,cut_points,label_names):
    df["fare_category"] = pd.cut(df["Fare"],cut_points,labels=label_names)
    return df
cut_points = [0,50,100,200,300,500]
label_names= ['free','cheap','low','moderate','high',]
test_data = process_fare(test_data,cut_points,label_names)
train_data = process_fare(train_data,cut_points,label_names)

In [4]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,famsize,Age_categories,fare_category
0,0,3,male,22.0,7.25,1,Young Adult,free
1,1,1,female,38.0,71.2833,1,Adult,cheap
2,1,3,female,26.0,7.925,0,Young Adult,free
3,1,1,female,35.0,53.1,1,Young Adult,cheap
4,0,3,male,35.0,8.05,0,Young Adult,free


In [5]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

for column in ["Pclass","Sex","Age_categories","fare_category",'famsize']:
    train_data = create_dummies(train_data, column)
    test_data = create_dummies(test_data, column)

In [6]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,famsize,Age_categories,fare_category,Pclass_1,Pclass_2,...,fare_category_high,famsize_0,famsize_1,famsize_2,famsize_3,famsize_4,famsize_5,famsize_6,famsize_7,famsize_10
0,0,3,male,22.0,7.25,1,Young Adult,free,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,1,female,38.0,71.2833,1,Adult,cheap,1,0,...,0,0,1,0,0,0,0,0,0,0
2,1,3,female,26.0,7.925,0,Young Adult,free,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,1,female,35.0,53.1,1,Young Adult,cheap,1,0,...,0,0,1,0,0,0,0,0,0,0
4,0,3,male,35.0,8.05,0,Young Adult,free,0,0,...,0,1,0,0,0,0,0,0,0,0


In [7]:
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Age_categories_Missing','Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
      'Age_categories_Senior','famsize_0','famsize_1','famsize_2','famsize_3',
        'famsize_4','famsize_5','famsize_6','famsize_7','famsize_10','fare_category_high',
          'fare_category_cheap','fare_category_low','fare_category_moderate','fare_category_free']

In [8]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors = 10, weights = 'distance',)


from sklearn.model_selection import train_test_split
xtrain, xtesting, ytrain, ytesting = train_test_split(train_data[columns],train_data['Survived'], test_size=0.2, random_state=42)
model.fit(xtrain, ytrain)
prediction = model.predict(xtesting)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytesting, prediction)
print(accuracy)

0.8212290502793296


In [9]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, train_data[columns], train_data['Survived'], cv=10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)


[0.73033708 0.78651685 0.78888889 0.79775281 0.82022472 0.83146067
 0.83146067 0.85393258 0.85393258 0.86516854]
0.8159675405742822


In [10]:
from sklearn.model_selection import GridSearchCV
param_grid = [
 {'n_neighbors': [3, 10, 30], 'weights': ['distance','uniform']},
 {'n_neighbors': [3, 10], 'weights': ['distance','uniform']},
 ]

grid_search = GridSearchCV(model, param_grid, cv=5,scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(xtrain, ytrain)
grid_search.best_estimator_

KNeighborsClassifier(n_neighbors=10, weights='distance')

In [11]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
log_clf = LogisticRegression() 
rnd_clf = RandomForestClassifier() 
svm_clf = SVC()
voting_clf = VotingClassifier(    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],    voting='hard') 


In [12]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier()


from sklearn.model_selection import train_test_split
xtrain, xtesting, ytrain, ytesting = train_test_split(train_data[columns],train_data['Survived'], test_size=0.2, random_state=42)
rnd_clf.fit(xtrain, ytrain)
prediction = rnd_clf.predict(xtesting)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytesting, prediction)
print(accuracy)

from sklearn.model_selection import cross_val_score
rnd_clf = RandomForestClassifier()

scores = cross_val_score(rnd_clf, train_data[columns], train_data['Survived'], cv=10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)



0.8379888268156425
[0.73033708 0.77777778 0.79775281 0.80898876 0.80898876 0.80898876
 0.84269663 0.85393258 0.86516854 0.86516854]
0.8159800249687891


In [13]:

log_clf = LogisticRegression()


from sklearn.model_selection import train_test_split
xtrain, xtesting, ytrain, ytesting = train_test_split(train_data[columns],train_data['Survived'], test_size=0.2, random_state=42)
log_clf.fit(xtrain, ytrain)
prediction =log_clf.predict(xtesting)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytesting, prediction)
print(accuracy)

from sklearn.model_selection import cross_val_score
log_clf = LogisticRegression()

scores = cross_val_score(log_clf, train_data[columns], train_data['Survived'], cv=10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

0.8044692737430168
[0.78651685 0.78651685 0.78651685 0.78651685 0.80898876 0.81111111
 0.83146067 0.83146067 0.83146067 0.85393258]
0.8114481897627964


In [14]:

voting_clf = VotingClassifier(    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],    voting='hard')

from sklearn.model_selection import train_test_split
xtrain, xtesting, ytrain, ytesting = train_test_split(train_data[columns],train_data['Survived'], test_size=0.2, random_state=42)
voting_clf.fit(xtrain, ytrain)
prediction =voting_clf.predict(xtesting)


from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytesting, prediction)
print(accuracy)

from sklearn.model_selection import cross_val_score
voting_clf = VotingClassifier(    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],    voting='hard')

scores = cross_val_score(voting_clf, train_data[columns], train_data['Survived'], cv=10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

0.8156424581005587
[0.75280899 0.78651685 0.79775281 0.79775281 0.81111111 0.82022472
 0.84269663 0.84269663 0.85393258 0.86516854]
0.8170661672908863


In [15]:

rnd_clf.fit(train_data[columns],train_data['Survived'])
testset_predictions = rnd_clf.predict(test_data[columns])

xtest_set_ids =test_data["PassengerId"]
submission_df = {"PassengerId": xtest_set_ids,
                 "Survived": testset_predictions}
submission = pd.DataFrame(submission_df)

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [16]:
submission.to_csv("submission6.csv",index=False)