In [1]:
import numpy as np # linear algebra
import pandas as pd 
import seaborn as sns

In [2]:
#load the data set
df = pd.read_csv('../data/train.csv')
#check the first 3 rows of the data
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
#access a single cell (column and row), you can replace the 'Name' column or row number
df['Name'][1]

'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

In [4]:
#we try to understand the distribution of male / female
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [5]:
df[ df['Sex'] == 'male']['Survived'].value_counts()

0    468
1    109
Name: Survived, dtype: int64

In [6]:
df[ ['Sex','Survived']].value_counts()

Sex     Survived
male    0           468
female  1           233
male    1           109
female  0            81
dtype: int64

In [7]:
#lets conver the above summary table to data frame
x = df[ ['Sex','Survived']].value_counts()
result = pd.DataFrame(x, columns = ['Sex/Survived'])
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Sex/Survived
Sex,Survived,Unnamed: 2_level_1
male,0,468
female,1,233
male,1,109
female,0,81


In [8]:
#furthermore you can also create a pivot table 
df[['Sex','Survived','Age']].pivot_table(index = 'Sex', columns = 'Survived')

Unnamed: 0_level_0,Age,Age
Survived,0,1
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2
female,25.046875,28.847716
male,31.618056,27.276022


In [9]:
# now we can convert the Embarked column to numeric
# since we have 3 options like C, Q and S, we can use the one-hot-encoding
# instead of the one hot encoding transformation from pandas we can use get_dummies from sklearn
temp = pd.get_dummies(df['Embarked'])

# now we can add the new columns to the original data frame
df = pd.concat([df,temp],axis= 1)

# we dont need the nominal column anymore, so lets remove it
df.drop('Embarked', axis = 1, inplace = True)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1


In [10]:
# similar to the 'Embarked' column, we transform the Sex column to numeric
# this case we have a risk of dummy variable trap
# so we simply use the pandas replacer
temp = df['Sex'].replace({'male':1,'female':0})

# remove the nominal column from data set
df.drop('Sex',axis = 1,inplace=True)

# add transformed data set to the original dataset
df  = pd.concat([df,temp],axis = 1)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,Sex
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0


In [11]:
#lets check if the name column contains any of these titles.
temp = df['Name'].str.contains('Mr\.') | df['Name'].str.contains('Dr\.') | df['Name'].str.contains('Sir')
# please note that we use \ symbol before . because it is a regular expression

#now lets convert the outcome to numeric
temp = temp.to_frame().replace({True:1,False:0})

#remove the original name column now, we dont neet it anymore
df.drop('Name',axis = 1,inplace = True)

#concatenate the original data frame with the titles
df = pd.concat([df,temp],axis = 1)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,Sex,Name
0,1,0,3,22.0,1,0,A/5 21171,7.25,,0,0,1,1,1
1,2,1,1,38.0,1,0,PC 17599,71.2833,C85,1,0,0,0,0
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,0


In [12]:
df.drop(['Cabin','Ticket'], axis = 1, inplace = True)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
0,1,0,3,22.0,1,0,7.25,0,0,1,1,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1,0,0


In [13]:
# lets see what we have now:
df.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.188552,0.08642,0.722783,0.647587,0.589226
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.391372,0.281141,0.447876,0.47799,0.492251
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,0.0,0.0,1.0,1.0,1.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,0.0,0.0,1.0,1.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0


In [14]:
#lets implement the min-max scaler and set min = 0 and max = 1
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
df = pd.DataFrame( mms.fit_transform(df), columns = df.columns)
#lets check the maximum, avarega values 
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.5,0.383838,0.654321,0.367921,0.065376,0.063599,0.062858,0.188552,0.08642,0.722783,0.647587,0.589226
std,0.289162,0.486592,0.418036,0.18254,0.137843,0.134343,0.096995,0.391372,0.281141,0.447876,0.47799,0.492251
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.5,0.247612,0.0,0.0,0.01544,0.0,0.0,0.0,0.0,0.0
50%,0.5,0.0,1.0,0.346569,0.0,0.0,0.028213,0.0,0.0,1.0,1.0,1.0
75%,0.75,1.0,1.0,0.472229,0.125,0.0,0.060508,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
#now we can implement the knn imputer and fill the blanks with the average of 5 nearest neighbors
from sklearn.impute import KNNImputer
knni = KNNImputer()
df = pd.DataFrame( knni.fit_transform(df), columns = df.columns)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
0,0.0,0.0,1.0,0.271174,0.125,0.0,0.014151,0.0,0.0,1.0,1.0,1.0
1,0.001124,1.0,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0,0.0,0.0,0.0
2,0.002247,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0,0.0,0.0


In [16]:
# lets try to build a linear regression between age and survived columns
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(df[['Age']],df[['Survived']])
lr.coef_

array([[-0.28068267]])

In [17]:
#we can also see the slope between all columns to all columns in a matrix
regmat = []
for col in df.columns:
    satirlar = []
    for row in df.columns:
        lr.fit(df[[row]],df[col])
        x = lr.coef_[0]
        satirlar += [ round(x, 2) ]
    regmat += [satirlar]

sonuc = pd.DataFrame(regmat, columns = df.columns, index = df.columns)
sonuc

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
PassengerId,1.0,-0.0,-0.02,0.07,-0.12,-0.0,0.04,-0.0,-0.03,0.01,0.03,0.03
Survived,-0.01,1.0,-0.39,-0.28,-0.12,0.3,1.29,0.21,0.01,-0.17,-0.55,-0.54
Pclass,-0.05,-0.29,1.0,-0.95,0.25,0.06,-2.37,-0.26,0.33,0.08,0.12,0.1
Age,0.03,-0.04,-0.16,1.0,-0.39,-0.27,0.18,0.01,-0.02,-0.0,0.05,0.09
SibSp,-0.03,-0.01,0.03,-0.24,1.0,0.43,0.23,-0.02,-0.01,0.02,-0.03,-0.07
Parch,-0.0,0.02,0.01,-0.16,0.4,1.0,0.3,-0.0,-0.04,0.02,-0.07,-0.09
Fare,0.0,0.05,-0.13,0.06,0.11,0.16,1.0,0.07,-0.04,-0.04,-0.04,-0.04
C,-0.0,0.14,-0.23,0.04,-0.17,-0.03,1.09,1.0,-0.21,-0.68,-0.07,-0.05
Q,-0.03,0.0,0.15,-0.05,-0.05,-0.17,-0.34,-0.11,1.0,-0.31,-0.04,-0.04
S,0.03,-0.14,0.09,-0.01,0.23,0.21,-0.77,-0.89,-0.79,1.0,0.12,0.1


In [18]:
# or we can create another matrix from the intercept values (which is the b value in y = ax+b formula)
regmat = []
for col in df.columns:
    satirlar = []
    for row in df.columns:
        lr.fit(df[[row]],df[col])
        x = lr.intercept_
        satirlar += [ round(x, 2) ]
    regmat += [satirlar]

sonuc = pd.DataFrame(regmat, columns = df.columns, index = df.columns)
sonuc

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,C,Q,S,Sex,Name
PassengerId,0.0,0.5,0.52,0.47,0.51,0.5,0.5,0.5,0.5,0.49,0.48,0.48
Survived,0.39,-0.0,0.64,0.49,0.39,0.37,0.3,0.34,0.38,0.51,0.74,0.7
Pclass,0.68,0.77,0.0,1.0,0.64,0.65,0.8,0.7,0.63,0.6,0.58,0.59
Age,0.35,0.38,0.47,0.0,0.39,0.38,0.35,0.36,0.37,0.37,0.33,0.31
SibSp,0.08,0.07,0.05,0.15,0.0,0.04,0.05,0.07,0.07,0.05,0.09,0.11
Parch,0.06,0.05,0.06,0.12,0.04,-0.0,0.04,0.06,0.07,0.05,0.11,0.12
Fare,0.06,0.04,0.15,0.04,0.06,0.05,-0.0,0.05,0.07,0.09,0.09,0.08
C,0.19,0.14,0.34,0.17,0.2,0.19,0.12,-0.0,0.21,0.68,0.23,0.22
Q,0.1,0.09,-0.01,0.11,0.09,0.1,0.11,0.11,-0.0,0.31,0.11,0.11
S,0.71,0.78,0.67,0.73,0.71,0.71,0.77,0.89,0.79,-0.0,0.65,0.66


In [19]:
# another technique for the feature importance is using a machine learning and getting the
# feature importances. 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
X = df.drop('Survived', axis = 1)
y = df['Survived']
rfc.fit(X,y)
pred = rfc.predict(X)

In [20]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y,pred)
acc

0.9988776655443322

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [22]:
rfc.fit(X_train,y_train)
pred = rfc.predict(X_test)
acc = accuracy_score(y_test,pred)
acc

0.8440677966101695

In [23]:
rfc.feature_importances_

array([0.17638884, 0.07970809, 0.17628181, 0.04450364, 0.0328105 ,
       0.17606685, 0.01576496, 0.00811256, 0.01657154, 0.14182708,
       0.13196411])

In [24]:
X.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'C', 'Q', 'S',
       'Sex', 'Name'],
      dtype='object')

In [25]:
#lets get cv = 3 so the data set will be divided into 3 parts and 2 of them will be the training set
from sklearn.model_selection import cross_validate
scores = cross_validate(rfc, X, y, cv=3,
                       scoring=('accuracy'),
                       return_train_score=True)
scores

{'fit_time': array([0.11216521, 0.11112404, 0.11086106]),
 'score_time': array([0.00868988, 0.00921512, 0.00847387]),
 'test_score': array([0.82491582, 0.83164983, 0.83164983]),
 'train_score': array([1., 1., 1.])}

In [26]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
from sklearn.svm import SVC
svc = SVC()
from sklearn.cluster import KMeans
km = KMeans()
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()

#we create a list for algorihthms and another list for the name of the algorithms
algos = [lr,gnb,knn,dtc,svc,km,gbc,rfc]
algo_names = ['LogisticRegression','GaussianNB','KNeighborsClassifier',
             'DecisionTreeClassifier','SVC','KMeans','GradientBoostingClassifier',
             'RandomForestClassifier']

# lets get the score of each algorithm by using the accuracy
scores = []
for i in range(len(algos)):
    temp = cross_validate(algos[i], X, y, cv=3,
                       scoring=('accuracy'),
                       return_train_score=True)
    scores += [algo_names[i],temp]
scores



['LogisticRegression',
 {'fit_time': array([0.00495386, 0.00434399, 0.00391722]),
  'score_time': array([0.00099611, 0.00091815, 0.00085711]),
  'test_score': array([0.79124579, 0.81144781, 0.79461279]),
  'train_score': array([0.82659933, 0.81649832, 0.80976431])},
 'GaussianNB',
 {'fit_time': array([0.00117207, 0.00117922, 0.00109887]),
  'score_time': array([0.00089598, 0.00093007, 0.00085783]),
  'test_score': array([0.77777778, 0.7979798 , 0.79124579]),
  'train_score': array([0.81986532, 0.7996633 , 0.7979798 ])},
 'KNeighborsClassifier',
 {'fit_time': array([0.00118971, 0.00123215, 0.0012238 ]),
  'score_time': array([0.00609326, 0.00597405, 0.00593734]),
  'test_score': array([0.78114478, 0.8047138 , 0.80808081]),
  'train_score': array([0.88383838, 0.86700337, 0.85690236])},
 'DecisionTreeClassifier',
 {'fit_time': array([0.00266194, 0.00273299, 0.00242186]),
  'score_time': array([0.00094199, 0.00106907, 0.0009129 ]),
  'test_score': array([0.63299663, 0.74747475, 0.78787879]

In [27]:

df = pd.read_csv('../data/train.csv')
temp = pd.get_dummies(df['Embarked'])
df = pd.concat([df,temp],axis= 1)
df.drop('Embarked', axis = 1, inplace = True)

temp = df['Sex'].replace({'male':1,'female':0})
df.drop('Sex',axis = 1,inplace=True)
df  = pd.concat([df,temp],axis = 1)

temp = df['Name'].str.contains('Mr\.') | df['Name'].str.contains('Dr\.') | df['Name'].str.contains('Sir')
temp = temp.to_frame().replace({True:1,False:0})
df.drop('Name',axis = 1,inplace = True)
df = pd.concat([df,temp],axis = 1)

df.drop(['Cabin','Ticket','PassengerId'], axis = 1 , inplace = True)

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline
pipe = Pipeline([('Min Max Scaler', MinMaxScaler()), ('KNN Imputer', KNNImputer()),
                ('GradientBoostingClassifier',GradientBoostingClassifier())])

X = df.drop('Survived', axis = 1)
y = df['Survived']
pipe.fit(X,y)
y_pred = pipe.predict(X)

from sklearn.metrics import accuracy_score
acc = accuracy_score(y,y_pred)
acc

0.9090909090909091

In [28]:
#load the test data set
df = pd.read_csv('../data/test.csv')
#check the first 3 rows of the data
df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [29]:
temp = pd.get_dummies(df['Embarked'])
df = pd.concat([df,temp],axis= 1)
df.drop('Embarked', axis = 1, inplace = True)

temp = df['Sex'].replace({'male':1,'female':0})
df.drop('Sex',axis = 1,inplace=True)
df  = pd.concat([df,temp],axis = 1)

temp = df['Name'].str.contains('Mr\.') | df['Name'].str.contains('Dr\.') | df['Name'].str.contains('Sir')
temp = temp.to_frame().replace({True:1,False:0})
df.drop('Name',axis = 1,inplace = True)
df = pd.concat([df,temp],axis = 1)

df.drop(['Cabin','Ticket','PassengerId'], axis = 1 , inplace = True)

sonuclar = pipe.predict(df)

In [30]:
df = pd.read_csv('../data/test.csv')
df = df[['PassengerId']]
df['Survived'] = sonuclar
df.to_csv('jeonchan.csv', index = False)