In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the Titanic training and test splits from the Kaggle input directory.
train_df = pd.read_csv('../input/titanic/train.csv')
test_df = pd.read_csv('../input/titanic/test.csv')
# Bare last expression so the notebook renders the training frame.
train_df

In [4]:
# Print pandas' structural summary of the training frame (DataFrame.info).
train_df.info()

In [5]:
# Show pandas' descriptive statistics for the training frame (DataFrame.describe).
train_df.describe()

In [6]:
import matplotlib.pyplot as plt
# Overview figure on a 2x3 grid: survival counts, class counts, age vs. survival,
# per-class age density, and embarkation-port counts.
fig = plt.figure()
fig.set(alpha=0.2)
grid_shape = (2, 3)

# Top-left: how many passengers fall into each Survived value.
ax_survived = plt.subplot2grid(grid_shape, (0, 0))
train_df.Survived.value_counts().plot(kind='bar', ax=ax_survived)
ax_survived.set_title('survive')

# Top-middle: passenger count per class.
ax_pclass = plt.subplot2grid(grid_shape, (0, 1))
train_df.Pclass.value_counts().plot(kind='bar', ax=ax_pclass)
ax_pclass.set_title('Pclass')

# Top-right: age plotted against the Survived value.
ax_age_scatter = plt.subplot2grid(grid_shape, (0, 2))
ax_age_scatter.scatter(train_df.Survived, train_df.Age)
ax_age_scatter.set_ylabel('Age')
ax_age_scatter.grid(visible=True, which='major', axis='y')

# Bottom-left (two columns wide): age density, one curve per passenger class.
ax_age_kde = plt.subplot2grid(grid_shape, (1, 0), colspan=2)
for pclass in (1, 2, 3):
    train_df.Age[train_df.Pclass == pclass].plot(kind='kde', ax=ax_age_kde)
ax_age_kde.set_xlabel('Age')
ax_age_kde.set_ylabel('Density')
ax_age_kde.legend(('1class','2class','3class'), loc='best')

# Bottom-right: passenger count per embarkation port.
ax_embarked = plt.subplot2grid(grid_shape, (1, 2))
train_df.Embarked.value_counts().plot(kind='bar', ax=ax_embarked)
plt.show()

In [7]:
# Survival vs. passenger class, drawn as side-by-side bars.
S_0 = train_df.Pclass[train_df.Survived == 0].value_counts()
S_1 = train_df.Pclass[train_df.Survived == 1].value_counts()
df = pd.DataFrame({'survived':S_1, 'unsurvived':S_0})

# Create the figure and its axes together and hand the axes to pandas.
# DataFrame.plot() without `ax=` opens its own figure, so a bare plt.figure()
# before it is rendered as a stray empty figure.
fig, ax = plt.subplots()
df.plot(kind='bar', stacked=False, ax=ax)
ax.set_xlabel('class')
ax.set_ylabel('number of people')
plt.show()


In [8]:
# Survival vs. embarkation port, drawn as stacked bars.
S_E0 = train_df.Embarked[train_df.Survived == 0].value_counts()
S_E1 = train_df.Embarked[train_df.Survived == 1].value_counts()
df = pd.DataFrame({'survived':S_E1, 'unsurvived':S_E0})

# Create the figure and its axes together and hand the axes to pandas.
# DataFrame.plot() without `ax=` opens its own figure, so a bare plt.figure()
# before it is rendered as a stray empty figure.
fig, ax = plt.subplots()
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_xlabel('Embarked')
ax.set_ylabel('number of people')
plt.show()

In [9]:
# Survived value counts split by sex, drawn as side-by-side bars.
S_m = train_df.Survived[train_df.Sex == 'male'].value_counts()
S_f = train_df.Survived[train_df.Sex == 'female'].value_counts()
df = pd.DataFrame({'M':S_m, 'F':S_f})

# Create the figure and its axes together and hand the axes to pandas.
# DataFrame.plot() without `ax=` opens its own figure, so a bare plt.figure()
# before it is rendered as a stray empty figure.
fig, ax = plt.subplots()
df.plot(kind='bar', stacked=False, ax=ax)
ax.set_xlabel('Survived')
ax.set_ylabel('number of people')
plt.show()

In [10]:
# Four side-by-side panels: Survived counts for each (sex, class group) combination.
fig = plt.figure()
fig.set(alpha=0.65)
# Figure-level title. plt.title() here would implicitly create an extra
# full-figure axes, because no subplot exists yet.
fig.suptitle(u"situation")

# (sex, is_high_class, legend text, bar colour) for each panel, left to right.
# "High class" means Pclass != 3, "low class" means Pclass == 3.
panels = [
    ('female', True,  u"f/h", '#FA2479'),
    ('female', False, u"f/l", 'pink'),
    ('male',   True,  u"m/h", 'lightblue'),
    ('male',   False, u"m/l", 'steelblue'),
]

axes = []
for position, (sex, high_class, legend_text, color) in enumerate(panels, start=1):
    # Every panel after the first shares its y axis with the first one.
    ax = fig.add_subplot(1, 4, position, sharey=axes[0] if axes else None)
    axes.append(ax)

    class_mask = (train_df.Pclass != 3) if high_class else (train_df.Pclass == 3)
    # Combine both conditions into one full-length mask instead of indexing an
    # already-filtered Series with a mask built on the whole frame.
    counts = train_df.loc[(train_df.Sex == sex) & class_mask, 'Survived'].value_counts()
    # value_counts() orders by frequency, so force the order [0, 1]; otherwise
    # the fixed tick labels below are swapped whenever survivors outnumber deaths.
    counts = counts.reindex([0, 1], fill_value=0)

    counts.plot(kind='bar', ax=ax, color=color)
    ax.set_xticklabels([u"unsurvived", u"survived"], rotation=45)
    ax.legend([legend_text], loc='best')

# Keep the individual axes names available to later cells.
ax1, ax2, ax3, ax4 = axes

plt.show()

In [11]:
# Passenger count for every (SibSp, Survived) combination, shown as a one-column frame.
g = train_df.groupby(['SibSp','Survived'])
df = g.count()['PassengerId'].to_frame()
df

In [12]:
from sklearn.ensemble import RandomForestRegressor

def set_missing_ages(df, n_estimators=2000, random_state=0):
    """Fill missing ``Age`` values with predictions from a random-forest regressor.

    The regressor is fitted on the rows whose ``Age`` is known, using
    ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` (in that order) as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Age``, ``Fare``, ``Parch``, ``SibSp``
        and ``Pclass``. It is modified in place.
    n_estimators : int, default 2000
        Number of trees in the forest.
    random_state : int, default 0
        Seed passed to the regressor.

    Returns
    -------
    (pandas.DataFrame, RandomForestRegressor)
        The same ``df`` object with ``Age`` filled in, and the fitted
        regressor. The regressor is fitted on plain arrays, so anything passed
        to its ``predict`` must use the same feature column order.
    """
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]
    age_missing = age_df.Age.isnull()

    known_age = age_df[~age_missing].values

    # Column 0 is the target (Age); the remaining columns are the features.
    y = known_age[:,0]
    x = known_age[:,1:]

    rfr = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators, n_jobs=-1)
    lr = rfr.fit(x,y)

    # Only predict when something is missing: predict() rejects an empty array.
    if age_missing.any():
        unknown_age = age_df[age_missing].values
        pre_age = lr.predict(unknown_age[:,1:])
        df.loc[age_missing, 'Age'] = pre_age

    return df, lr

def set_Cabin_type(df):
    """Collapse ``Cabin`` into a two-valued flag, in place.

    Rows with a cabin value become ``'Yes'``; rows where it is missing become
    ``'No'``. The same ``df`` object is returned.
    """
    # Take the mask once, before any value is overwritten.
    has_cabin = df['Cabin'].notnull()
    df.loc[~has_cabin, 'Cabin'] = 'No'
    df.loc[has_cabin, 'Cabin'] = 'Yes'
    return df
# Impute the missing ages first (keeping the fitted regressor as `lr` for
# later cells), then collapse Cabin into its Yes/No flag.
imputed_train, lr = set_missing_ages(train_df)
train_df = set_Cabin_type(imputed_train)
train_df
    

In [13]:
# One-hot encode each categorical column, using the column name as the prefix.
dummie_Cabin, dummie_Embarked, dummie_Sex, dummie_Pclass = (
    pd.get_dummies(train_df[column], prefix=column)
    for column in ('Cabin', 'Embarked', 'Sex', 'Pclass')
)

# Append the indicator columns, then drop the raw categorical/text columns.
encoded_parts = [train_df, dummie_Cabin, dummie_Embarked, dummie_Pclass, dummie_Sex]
df = pd.concat(encoded_parts, axis=1).drop(
    columns=['Pclass','Name','Sex','Ticket','Cabin','Embarked']
)
df

In [14]:
import sklearn.preprocessing as preprocessing
# Unfitted scaler kept under its original name for later cells that use it.
scaler = preprocessing.StandardScaler()

# One fitted scaler per column. StandardScaler.fit() returns the scaler itself,
# so reusing a single instance would make `age_scale` and `fare_scale` the same
# object, holding only the statistics of whichever column was fitted last.
age_scale = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale.transform(df[['Age']]).ravel()
fare_scale = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale.transform(df[['Fare']]).ravel()
df

In [15]:
from sklearn import linear_model
import seaborn as sns
# Drop the identifier and the unscaled Age/Fare columns, then draw the
# pairwise correlation matrix of what remains as a heatmap.
final_df = df.drop(columns=['PassengerId', 'Age', 'Fare'])
corr = final_df.corr()
sns.heatmap(corr)

In [16]:
# Every column except the label is a feature.
cols = [name for name in final_df.columns if name != 'Survived']
X_cols = cols
x = final_df[X_cols]
y = final_df['Survived']

# Fit the classifier and report its accuracy on the data it was fitted on.
clf = linear_model.LogisticRegression().fit(x, y)
acc = clf.score(x, y)
acc

In [17]:
# Read the test split again from the Kaggle input directory (it was already loaded once above).
test_df = pd.read_csv('../input/titanic/test.csv')

In [18]:
# Replace any missing Fare with 0 so the age regressor and the scaler get no NaNs.
test_df.loc[(test_df.Fare.isnull()), 'Fare'] = 0

# The age regressor `lr` was fitted on plain arrays with the feature order
# Fare, Parch, SibSp, Pclass (see set_missing_ages). The columns after 'Age'
# must be in exactly that order, or the predictions are silently wrong.
tem_df = test_df[['Age','Fare','Parch','SibSp','Pclass']]
age_missing = test_df.Age.isnull()
if age_missing.any():
    null_age = tem_df[age_missing].values
    x_1 = null_age[:,1:]
    pred_age = lr.predict(x_1)
    test_df.loc[age_missing, 'Age'] = pred_age


test_df = set_Cabin_type(test_df)
dum_Cabin = pd.get_dummies(test_df['Cabin'], prefix='Cabin')
dum_Embarked = pd.get_dummies(test_df['Embarked'], prefix='Embarked')
dum_Pclass = pd.get_dummies(test_df['Pclass'], prefix='Pclass')
dum_Sex = pd.get_dummies(test_df['Sex'], prefix='Sex')

df_test = pd.concat([test_df,dum_Cabin,dum_Embarked,dum_Pclass,dum_Sex], axis=1)
df_test.drop(['Pclass','Name','Cabin','Embarked','Sex','Ticket'], axis=1, inplace=True)

# Scale the test features with statistics learned from the TRAINING frame `df`.
# Calling fit_transform on the test data would standardise it with its own
# mean/std, which differs from what the classifier saw during training.
train_age_scaler = preprocessing.StandardScaler().fit(df[['Age']])
train_fare_scaler = preprocessing.StandardScaler().fit(df[['Fare']])
df_test['Age_scaled'] = train_age_scaler.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = train_fare_scaler.transform(df_test[['Fare']]).ravel()
df_test


In [19]:
# Build the test feature matrix with exactly the training feature columns, in
# the training order. An indicator column absent from the test split (a
# category level that never occurs there) is added as all zeros; a column the
# model never saw is dropped.
final_test = (
    df_test
    .drop(['PassengerId', 'Age', 'Fare'], axis=1)
    .reindex(columns=X_cols, fill_value=0)
)
predictions = clf.predict(final_test)

In [20]:
# Pair each training feature with its fitted coefficient. X_cols is the exact
# column list the model was fitted on, so this does not depend on where
# 'Survived' sits in final_df. ravel() flattens the (1, n_features) coefficient
# matrix so each row shows a plain number rather than a one-element array.
pd.DataFrame({'columns': list(X_cols), 'coef': clf.coef_.ravel()})

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated ROC AUC of the logistic-regression model on the training features.
cross_val_score(estimator=clf, X=x, y=y, scoring='roc_auc', cv=5)

In [22]:
# BaggingRegressor import kept in case other cells rely on it.
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

# Bag the logistic-regression model with a *classifier* ensemble.
# BaggingRegressor would average the 0/1 predictions of the base models into a
# float, and the int cast below would truncate anything under 1.0 to 0, i.e.
# predict survival only when all 20 models agree. BaggingClassifier combines
# the base models and returns a class label directly.
bagging_clf = BaggingClassifier(
    clf,
    n_estimators=20,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=0,  # fixed seed so the submission is reproducible
)
bagging_clf.fit(x,y)

predictions = bagging_clf.predict(final_test)
result = pd.DataFrame({'PassengerId':test_df['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv('./predictions0.csv', index=False)

In [23]:
from sklearn.svm import SVC, LinearSVC
# Fit a support-vector classifier with default settings and report its
# accuracy on the training data as a percentage with two decimals.
svc = SVC().fit(x, y)
acc_svc = round(100 * svc.score(x, y), 2)
acc_svc