In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
import pandas as pd

import warnings
# Silences every warning for the rest of the notebook, including library deprecation notices.
warnings.filterwarnings("ignore")

# NOTE(review): RandomState is not referenced by any later cell shown here;
# the train/test split hard-codes random_state=42 instead.
RandomState = 100000

##### ----> Read the file train.csv into Python 

In [2]:
# Load the training data; a comma is already the default field separator.
titan = pd.read_csv('./Data/train.csv')

In [3]:
titan

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### DataFrame:

* NaN values: Age, Cabin, Embarked
* Age: check the distribution with a histogram, overall and separately for male and female passengers
* PassengerId and Name are tied directly to the individual passengers who did or did not survive, so using them as features could cause data leakage
* look up (google) a method for filling in the NaN values of Cabin; if there is none, just **drop** the column
* fill in the remaining NaN values (this is feature engineering) only after splitting the DataFrame into train and test sets
* drop the feature "Cabin" before splitting the data

##### -----> checking for NaN values in our DF

In [4]:
# Cabin: more than 50% of its values are missing (author's count: 613 NaN), so it is dropped.
# Name and Ticket are dropped as well; PassengerId is kept so that individual
# passengers (and whether they survived) can still be identified.
titan = titan.drop(columns=['Cabin', 'Name', 'Ticket'])

In [5]:
titan[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].isnull().sum()

Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
# Discard the rows whose Embarked value is missing, then confirm the resulting shape.
titan = titan.dropna(axis=0, subset=['Embarked'])
titan.shape

(889, 9)

##### -----> start splitting the dataframe into features and labels --> train and test

In [7]:
# features
# NOTE(review): PassengerId is selected as a feature here; the ColumnTransformer defined
# later uses remainder='passthrough', so it reaches the model unchanged.
X = titan[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
# label: the Survived column
y = titan['Survived']

In [8]:
#check shape
X.shape, y.shape

((889, 8), (889,))

In [9]:
# Hold out 40% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [10]:
#check shape again
X_train.shape, y_train.shape

((533, 8), (533,))

In [11]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
569,570,3,male,32.0,0,0,7.8542,S
786,787,3,female,18.0,0,0,7.4958,S
75,76,3,male,25.0,0,0,7.65,S
114,115,3,female,17.0,0,0,14.4583,C
597,598,3,male,49.0,0,0,0.0,S


In [12]:
# Renumber the training rows from 0; the previous row labels move into a new 'index' column.
X_train = X_train.reset_index()
X_train

Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,569,570,3,male,32.0,0,0,7.8542,S
1,786,787,3,female,18.0,0,0,7.4958,S
2,75,76,3,male,25.0,0,0,7.6500,S
3,114,115,3,female,17.0,0,0,14.4583,C
4,597,598,3,male,49.0,0,0,0.0000,S
...,...,...,...,...,...,...,...,...,...
528,107,108,3,male,,0,0,7.7750,S
529,271,272,3,male,25.0,0,0,0.0000,S
530,862,863,1,female,48.0,0,0,25.9292,S
531,436,437,3,female,21.0,2,2,34.3750,S


In [13]:
# The old row labels kept in the 'index' column are not a feature, so remove them.
X_train = X_train.drop(columns=['index'])
X_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,570,3,male,32.0,0,0,7.8542,S
1,787,3,female,18.0,0,0,7.4958,S
2,76,3,male,25.0,0,0,7.6500,S
3,115,3,female,17.0,0,0,14.4583,C
4,598,3,male,49.0,0,0,0.0000,S
...,...,...,...,...,...,...,...,...
528,108,3,male,,0,0,7.7750,S
529,272,3,male,25.0,0,0,0.0000,S
530,863,1,female,48.0,0,0,25.9292,S
531,437,3,female,21.0,2,2,34.3750,S


In [14]:
# feature engineering for categorical
categorical_features = ['Sex', "Embarked", "Pclass"]

# handle_unknown="ignore": a category that was not seen during fit is encoded as
# all zeros at transform time instead of raising an error
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [15]:
#get all indices = Females
#fem_ind = titan[titan['Sex'] == 'female']['PassengerId'].tolist()
#mal_ind = titan[titan['Sex'] == 'male']['PassengerId'].tolist()

In [35]:
# you can also create custom functions
# instead of a simple imputer for the age i will create a function to take the median by sex
def median_age(df):
    """Fill missing ages with the median age of the passenger's sex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'Sex' and 'Age'.  It is NOT
        modified; the imputation is performed on a copy of the 'Age' column.

    Returns
    -------
    pandas.DataFrame
        A single-column frame ``['Age']`` on the same index as ``df`` in which
        missing ages of 'female' / 'male' rows are replaced by the median age
        of that sex.  Rows with any other 'Sex' value are returned unchanged.

    Notes
    -----
    The medians are computed from the frame passed in on every call, so when
    this function is used for transforming a different data set, that data
    set's own medians are used rather than the ones seen earlier.
    """
    # Work on a copy so the caller's frame is never written to.
    ages = df['Age'].copy()
    for sex in ('female', 'male'):
        in_group = df['Sex'] == sex
        # The groups are disjoint, so filling one never affects the other's median.
        group_median = ages.loc[in_group].median()
        ages.loc[in_group] = ages.loc[in_group].fillna(group_median)
    return ages.to_frame(name='Age')

In [36]:
# Numerical feature engineering: only Fare is handled here.
fare_feature = ["Fare"]

# Fare is cut into 4 quantile bins and each bin becomes a dense one-hot column.
fare_binner = KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile')

# A pipeline chains steps so the output of one feeds the next; this one has a single step.
fare_transformer = make_pipeline(fare_binner)

In [37]:
# (name, transformer, input columns) for every column group that gets special treatment.
column_steps = [
    ("categ", categorical_transformer, categorical_features),
    # median_age needs 'Sex' to pick the median and returns only the 'Age' column
    ("age", FunctionTransformer(median_age), ['Sex', 'Age']),
    ("fare", fare_transformer, fare_feature),
]

# Columns not named above are appended untouched after the transformed ones.
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

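**Order of the column transformer output**
Sex, Embarked, Pclass (each one-hot encoded), Age, Fare (one-hot encoded quantile bins), followed by the passthrough columns PassengerId, SibSp, Parch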

In [38]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the held-out split.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

# Display the transformed training matrix.
X_train_trans

array([[  0.,   1.,   0., ..., 570.,   0.,   0.],
       [  1.,   0.,   0., ..., 787.,   0.,   0.],
       [  0.,   1.,   0., ...,  76.,   0.,   0.],
       ...,
       [  1.,   0.,   0., ..., 863.,   0.,   0.],
       [  1.,   0.,   0., ..., 437.,   2.,   2.],
       [  0.,   1.,   0., ..., 104.,   0.,   0.]])

In [39]:
temp = pd.DataFrame(X_train_trans) # transform to a dataframe to check our output of column-transformers
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,32.0,1.0,0.0,0.0,0.0,570.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,18.0,1.0,0.0,0.0,0.0,787.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,25.0,1.0,0.0,0.0,0.0,76.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,17.0,0.0,0.0,1.0,0.0,115.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,49.0,1.0,0.0,0.0,0.0,598.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,28.5,1.0,0.0,0.0,0.0,108.0,0.0,0.0
529,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,25.0,1.0,0.0,0.0,0.0,272.0,0.0,0.0
530,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,48.0,0.0,0.0,1.0,0.0,863.0,0.0,0.0
531,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,21.0,0.0,0.0,0.0,1.0,437.0,2.0,2.0


In [40]:
temp.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [41]:
m_log = LogisticRegression(max_iter=300)

In [42]:
#pipeline_log

In [43]:
m_log.fit(X_train_trans, y_train)

LogisticRegression(max_iter=300)

In [44]:
# Accuracy of the fitted model on the training split and on the held-out split.
titanic1_train_accuracy = m_log.score(X_train_trans, y_train)
titanic1_test_accuracy = m_log.score(X_test_trans, y_test)

print('Train accuracy: ', titanic1_train_accuracy)
print('Test accuracy: ', titanic1_test_accuracy)

Train accuracy:  0.8048780487804879
Test accuracy:  0.797752808988764


In [None]:
# predicted class labels from the fitted logistic regression on the transformed test split
pred = m_log.predict(X_test_trans)

In [None]:
# Precision vs Recall on the held-out split
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

# Reuse the values computed above instead of scoring a second time.
print("Precision: " + str(precision) + "   Recall: " + str(recall))

In [None]:
# prediction probabilities from the fitted logistic regression on the transformed test split
m_log.predict_proba(X_test_trans)

In [None]:
# Confusion matrix of true labels against predicted labels on the held-out split.
from sklearn.metrics import confusion_matrix
conf = confusion_matrix(y_true=y_test, y_pred=pred)
conf

In [None]:
# Draw the confusion matrix as an annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(6, 6))
fig.set_facecolor('grey')
sns.heatmap(conf, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Prediction')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

In [None]:
# work on TEST.csv for Kaggle
test_tit = pd.read_csv('./Data/test.csv')


In [None]:
C_test_titF = test_tit.copy()
C_test_titF

In [None]:
# Remove the same columns that were removed from the training data.
C_test_titF = C_test_titF.drop(columns=['Cabin', 'Name', 'Ticket'])

In [None]:
C_test_titF[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].isnull().sum(), C_test_titF.shape

In [None]:
# Fill missing Fare values with the column mean and assign the result back.
# Calling fillna(..., inplace=True) on the selected column acts on an intermediate
# object, which does not update the frame under pandas Copy-on-Write.
C_test_titF['Fare'] = C_test_titF['Fare'].fillna(C_test_titF['Fare'].mean())

In [None]:
# transform the TEST data set 
X_testFi_trans = preprocessor.transform(C_test_titF)

In [None]:
#prediction of final TEST dataset
pred_testF = m_log.predict(X_testFi_trans)

In [None]:
#prediction of final TEST dataset
m_log.predict_proba(X_testFi_trans)

In [None]:
# Assemble the submission table: one predicted Survived label per PassengerId.
gender_submission1 = pd.DataFrame({
    "PassengerId": C_test_titF["PassengerId"],
    "Survived": pred_testF,
})
gender_submission1.head(), gender_submission1.shape

In [None]:
gender_submission1.to_csv('Data/KaggleTest_LogisticRegression_corrected_pipeline_sub2.csv',index=False)
