## Titanic Data // Logistic Regression

In [1]:
import numpy as np
import pandas as pd

###### >> Load Titanic Train data

In [2]:
Ttrain = pd.read_csv('train.csv')

In [3]:
Ttrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
Ttrain = Ttrain.set_index('PassengerId')

###### Divide Train data Xtrain and Ytrain

In [5]:
# Xtrain is Predictors
Xtrain = Ttrain[[col for col in Ttrain.columns if col != 'Survived']]

In [6]:
# Ytrain is Predictions of survival to train
Ytrain = Ttrain['Survived']

In [7]:
#delete Cabin because Cabin has no influence on Survival
del Xtrain['Cabin']

In [8]:
#function to make sex into numericals

FM = lambda x : 1 if x == 'male' else 0

In [9]:
# apply above function
Xtrain['SEX'] =Xtrain['Sex'].apply(FM)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
del Xtrain['Sex']

In [11]:
#fill na values of Age with interpolating
Xtrain['Age'] = Xtrain.Age.fillna(Xtrain.Age.interpolate())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
# Make a column of family if they contain siblings and parents and spouse
Xtrain['Family'] = Xtrain['SibSp'] + Xtrain['Parch'] + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
 del Xtrain['Ticket']

 del Xtrain['SibSp']

 del Xtrain['Parch']

 del Xtrain['Name']

 del Xtrain['Embarked']

In [14]:
Xtrain.head()

Unnamed: 0_level_0,Pclass,Age,Fare,SEX,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,22.0,7.25,1,2
2,1,38.0,71.2833,0,2
3,3,26.0,7.925,0,1
4,1,35.0,53.1,0,2
5,3,35.0,8.05,1,1


###### >> Modeling

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
Log = LogisticRegression()

In [17]:
Log.fit(Xtrain,Ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
print(f'Co-efficient  : {Log.coef_}')
print(f'Intercept  : {Log.intercept_}')

Co-efficient  : [[-0.80629912 -0.02557107  0.00530489 -2.53898994 -0.20008019]]
Intercept  : [3.84789763]


###### >> Accuracy of Model :

In [27]:
from sklearn import metrics
ACC = round(metrics.accuracy_score(Ytrain,Log.predict(Xtrain)),3)
print(f'Accuracy of the Model "Log" is {ACC*100}% .')

Accuracy of the Model "Log" is 79.7% .


###### >> Loading Test Data

In [28]:
Ttest = pd.read_csv('test.csv')

In [29]:
Ttest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [31]:
# Actually Survived isn't given , so Ttest is Xtest
Xtest = Ttest[[col for col in Ttest.columns if col != 'Survived']]

In [33]:
del Xtest['Name']
del Xtest['Cabin']
del Xtest['Embarked']
del Xtest['Ticket']

In [37]:
Xtest.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,male,34.5,7.8292,1
893,3,female,47.0,7.0,2
894,2,male,62.0,9.6875,1
895,3,male,27.0,8.6625,1
896,3,female,22.0,12.2875,3


In [None]:
#setting Index 
Xtest = Xtest.set_index('PassengerId')

In [35]:
Xtest['Family'] = Xtest['SibSp'] + Xtest['Parch'] + 1
del Xtest['SibSp']
del Xtest['Parch']

In [38]:
Xtest['SEX'] = Xtest['Sex'].apply(FM)

In [39]:
del Xtest['Sex']

In [40]:
Xtest['Age'] = Xtest.Age.fillna(Xtest.Age.interpolate())

In [42]:
Xtest.head(5)

Unnamed: 0_level_0,Pclass,Age,Fare,Family,SEX
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,34.5,7.8292,1,1
893,3,47.0,7.0,2,0
894,2,62.0,9.6875,1,1
895,3,27.0,8.6625,1,1
896,3,22.0,12.2875,3,0


#### testing - Accuracy


In [43]:
Xtest = Xtest.dropna()

In [44]:
Xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417 entries, 892 to 1309
Data columns (total 5 columns):
Pclass    417 non-null int64
Age       417 non-null float64
Fare      417 non-null float64
Family    417 non-null int64
SEX       417 non-null int64
dtypes: float64(2), int64(3)
memory usage: 19.5 KB


In [45]:
Ytest = Log.predict(Xtest)

In [46]:
Xtest['Survived'] = Ytest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [47]:
Xtest.sample(15)

Unnamed: 0_level_0,Pclass,Age,Fare,Family,SEX,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1208,1,57.0,146.5208,2,1,0
1177,3,36.0,7.25,1,1,0
1048,1,29.0,221.7792,1,0,1
957,2,15.5,21.0,1,0,0
1173,3,0.75,13.775,3,1,0
1163,3,36.0,7.75,1,1,0
1210,3,27.0,7.8542,1,1,0
1306,1,39.0,108.9,1,0,1
1184,3,41.5,7.2292,1,1,0
1122,2,14.0,65.0,1,1,0
