In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
# load data in
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# see data types for train data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# see data types for test data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# show basic information of train csv data
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# show table
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
# find mean of train data age, to replace NAs with mean 
age_mean = train['Age'].mean()

In [8]:
train['Age']=train['Age'].fillna(age_mean)

In [9]:
# check that blank ages are filled with mean - 29.69
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [10]:
# drop columns that don't seem important or cannot easily be converted to numeric data
titanic1 = train.drop(['Cabin','Name','Embarked','Ticket','PassengerId'], axis=1)

In [11]:
# convert gender to 0 and 1
titanic1.Sex[titanic1.Sex == 'male'] = 0
titanic1.Sex[titanic1.Sex == 'female'] = 1
print(titanic1)

     Survived  Pclass Sex        Age  SibSp  Parch     Fare
0           0       3   0  22.000000      1      0   7.2500
1           1       1   1  38.000000      1      0  71.2833
2           1       3   1  26.000000      0      0   7.9250
3           1       1   1  35.000000      1      0  53.1000
4           0       3   0  35.000000      0      0   8.0500
..        ...     ...  ..        ...    ...    ...      ...
886         0       2   0  27.000000      0      0  13.0000
887         1       1   1  19.000000      0      0  30.0000
888         0       3   1  29.699118      1      2  23.4500
889         1       1   0  26.000000      0      0  30.0000
890         0       3   0  32.000000      0      0   7.7500

[891 rows x 7 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
# check that gender conversion worked
titanic1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [13]:
titanic1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [14]:
# convert sex from object to integer
demo = titanic1.apply(pd.to_numeric)

In [15]:
demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [16]:
# edit test csv data (in case I need to run test)
test_age_mean = test['Age'].mean()

In [17]:
test_age_mean

30.272590361445783

In [18]:
# repeat all steps as previously done to match train and test data
test['Age']=test['Age'].fillna(test_age_mean)

In [19]:
test1 = test.drop(['Cabin','Name','Embarked','Ticket'], axis=1)

In [20]:
test1

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,1158,1,male,30.27259,0,0,0.00000
1,1264,1,male,49.00000,0,0,0.00000
2,913,3,male,9.00000,0,1,3.17080
3,1008,3,male,30.27259,0,0,6.43750
4,1025,3,male,30.27259,1,0,6.43750
...,...,...,...,...,...,...,...
413,1267,1,female,45.00000,0,0,262.37500
414,945,1,female,28.00000,3,2,263.00000
415,961,1,female,60.00000,1,4,263.00000
416,1235,1,female,58.00000,0,1,512.32920


In [21]:
test1.Sex[test.Sex == 'male'] = 0
test1.Sex[test.Sex == 'female'] = 1
test1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,1158,1,0,30.27259,0,0,0.0
1,1264,1,0,49.0,0,0,0.0
2,913,3,0,9.0,0,1,3.1708
3,1008,3,0,30.27259,0,0,6.4375
4,1025,3,0,30.27259,1,0,6.4375


In [22]:
test1['Age']=test1['Age'].fillna(age_mean)
test1.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,1158,1,0,30.27259,0,0,0.0
1,1264,1,0,49.0,0,0,0.0
2,913,3,0,9.0,0,1,3.1708
3,1008,3,0,30.27259,0,0,6.4375
4,1025,3,0,30.27259,1,0,6.4375


In [23]:
test1 = test1.apply(pd.to_numeric)

In [24]:
# Y_train is the target variable "survived" and X_train are the input variables (everything else) with X_test being used to better predict
X_train = demo.drop("Survived", axis=1)
Y_train = demo["Survived"]
X_test  = test1.drop("PassengerId", axis=1).copy()

In [88]:
# logistic regression code with estimate accuracy percent
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

accuracy_log = round(logreg.score(X_train, Y_train) * 100, 2)
print(round(accuracy_log,2,), "%")

79.8 %


In [104]:
# random forest ML code with estimate accuracy percent
random_forest = RandomForestClassifier(n_estimators=100, oob_score = True)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(round(acc_random_forest,2,), "%")

98.2 %


In [103]:
# comparing models
random_forest_score = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(random_forest_score, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.72222222 0.7752809  0.74157303 0.87640449 0.88764045 0.84269663
 0.82022472 0.7752809  0.86516854 0.84269663]
Mean: 0.8149188514357052
Standard Deviation: 0.055027190513177573


In [45]:
feature_importance = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
feature_importance = feature_importance.sort_values('importance',ascending=False).set_index('feature')
feature_importance.head()

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Fare,0.297
Age,0.269
Sex,0.268
Pclass,0.084
SibSp,0.045


In [80]:
decision_tree = DecisionTreeClassifier()

In [92]:
decision_tree.fit(X_train, Y_train)  

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [93]:
Y_pred = decision_tree.predict(X_test)

In [96]:
# estimate accuracy percent with decision tree
accuracy_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print(round(accuracy_decision_tree,2,), "%")

98.2 %


In [105]:
# comparing models
decision_tree_score = DecisionTreeClassifier() 
scores = cross_val_score(decision_tree_score, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.74444444 0.73033708 0.70786517 0.75280899 0.82022472 0.78651685
 0.80898876 0.75280899 0.80898876 0.82022472]
Mean: 0.7733208489388265
Standard Deviation: 0.038633989068014914


In [117]:
# testing the model using confusion matrix
predictions = cross_val_predict(random_forest, X_train, Y_train)
confusion_matrix(Y_train, predictions)

array([[475,  74],
       [ 91, 251]])

In [113]:
predictions = cross_val_predict(decision_tree, X_train, Y_train)
confusion_matrix(Y_train, predictions)

array([[449, 100],
       [102, 240]])

In [114]:
predictions = cross_val_predict(logreg, X_train, Y_train)
confusion_matrix(Y_train, predictions)

array([[463,  86],
       [106, 236]])

In [99]:
logreg = LogisticRegression()
scores = cross_val_score(logreg, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.78888889 0.78651685 0.74157303 0.82022472 0.80898876 0.76404494
 0.79775281 0.7752809  0.79775281 0.80898876]
Mean: 0.7890012484394506
Standard Deviation: 0.02234674882428341
