## Step 1: Importing basic libraries

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline


## Step 2: Reading the data

In [None]:
# Load the Titanic training split.
# The folder defaults to the original author's Desktop location, but it can be
# redirected by setting the TITANIC_DATA_DIR environment variable, so the
# notebook runs on other machines without editing this cell.
# (An online copy was previously read from 'http://bit.ly/kaggletrain'.)
train_dir = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    r'C:\Users\rquiles\OneDrive - Healthesystems, LLC\Desktop'))
training = pd.read_csv(train_dir / 'train.csv')
training.head()

In [None]:
# Load the Titanic test split.
# The folder defaults to the original author's Desktop location, but it can be
# redirected by setting the TITANIC_DATA_DIR environment variable, so the
# notebook runs on other machines without editing this cell.
test_dir = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    r'C:\Users\rquiles\OneDrive - Healthesystems, LLC\Desktop'))
test = pd.read_csv(test_dir / 'test.csv')
test.head()

In [None]:
# Tag each split so its rows can be told apart after stacking them together.
training['train_test'] = 1
test['train_test'] = 0
# The test split gets a placeholder label column so both frames share the same
# columns. np.nan is the supported spelling; the np.NaN alias was removed in
# NumPy 2.0.
test['Survived'] = np.nan
all_data = pd.concat([training, test])

In [None]:
# Display the training frame, which now carries the train_test flag column.
# NOTE(review): this renders the whole frame; training.head() may be enough.
training

In [None]:
# Preview the first rows of the stacked training + test frame.
all_data.head()

## Step 3: Data Exploration

In [None]:
# Column dtypes and non-null counts for the training frame.
training.info()

In [None]:
# Number of missing values in each training column.
training.isna().sum()

In [None]:
# Summary statistics for the training frame.
training.describe()

In [None]:
# Split the training columns into a numeric group and a categorical group
# so each can be explored with a suitable kind of plot.
df_num = training.loc[:, ['Age', 'SibSp', 'Parch', 'Fare']]
df_cat = training.loc[:, ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']]

#### Visualization Of The Data

In [None]:
# Draw one histogram per numeric column, each on its own figure.
for col_name in df_num.columns:
    fig, ax = plt.subplots()
    ax.hist(df_num[col_name], edgecolor='black')
    ax.set_title(col_name)
    plt.show()

In [None]:
# Per-outcome average (the pivot_table default aggregation) of each numeric column.
training.pivot_table(
    index='Survived',
    values=['Age', 'SibSp', 'Parch', 'Fare'],
)

In [None]:
#Takeaways from charts
#1.  Avg age of survivors is 28
#2.  People who paid higher fares were more likely to survive
#3.  If you had parents aboard, you had a higher chance of survival
#4.  If you had siblings aboard, you had less of a chance of survival

In [None]:
# One bar chart of category frequencies per categorical column.
for i in df_cat.columns:
    # Compute the frequencies once instead of once per barplot argument.
    counts = df_cat[i].value_counts()
    # seaborn >= 0.12 accepts x and y only as keyword arguments; passing the
    # raw values avoids any index alignment between the two vectors.
    sns.barplot(x=counts.index, y=counts.values).set_title(i)
    plt.show()

In [None]:
# Passenger counts by survival outcome against class, sex and port of
# embarkation, printed one table after another with a blank line in between.
for position, category in enumerate(['Pclass', 'Sex', 'Embarked']):
    if position > 0:
        print()
    print(training.pivot_table(index='Survived', columns=category,
                               values='Ticket', aggfunc='count'))

In [None]:
#1.  Survived: Most of the people died in the shipwreck; only around 300 people survived.
#2.  Pclass: The majority of the people traveling had 3rd-class tickets.
#3.  Sex: There were more males than females aboard the ship, roughly double the amount.
#4.  Embarked: Most of the passengers boarded the ship at Southampton.

## Step 4: Feature Engineering
#### Using various techniques to prepare, extract, and transform features from raw data to provide the best inputs to an ML model.

In [None]:
def _count_cabins(cabin):
    """Return how many space-separated cabin codes a Cabin entry holds.

    A missing entry counts as 0 cabins.
    """
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))


# New feature: number of cabins listed for each passenger.
# (The stray `df_cat.Cabin` expression that used to open this cell was a
# no-op and has been dropped.)
training['cabin_multiple'] = training.Cabin.apply(_count_cabins)
training['cabin_multiple'].value_counts()

In [None]:
# Uncomment the two options below to lift pandas' row/column display limits.
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

# Display the training frame to inspect it at this point in the notebook.
training

In [None]:
# Passenger counts by survival outcome against number of cabins listed.
training.pivot_table(
    index='Survived',
    columns='cabin_multiple',
    values='Ticket',
    aggfunc='count',
)

## Step 5: Data preprocessing for model
#### The process of taking data and transforming it into a format that can be understood and analyzed by computers and ML models

In [None]:
# Keep only the passengers whose embarkation port and age are both known.
training = training.dropna(subset=['Embarked', 'Age'])
training.head()

In [None]:
# Feature matrix: everything except the label and the free-text columns.
# NOTE(review): PassengerId and the train_test flag remain in X as features -
# confirm that is intended.
X = training.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
# Target vector: the survival label.
y = training.loc[:, 'Survived']

In [None]:
# Report the feature-matrix dimensions, then preview its first rows.
# head() has to be the cell's final expression, otherwise its result is
# silently discarded and never rendered.
print(X.shape)
X.head()

In [None]:
# Report the target-vector length, then preview its first values.
# head() has to be the cell's final expression, otherwise its result is
# silently discarded and never rendered.
print(y.shape)
y.head()

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline 

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Candidate classifiers compared later in the notebook.
lr = LogisticRegression(max_iter=3000)
knn = KNeighborsClassifier(n_neighbors=7)
svc = SVC(probability=True)

# One-hot encode the two string columns; every other column is passed
# through to the estimator unchanged.
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough',
)

In [None]:
# Fit the transformer on X and show the encoded matrix as a sanity check;
# the result is displayed only, not stored.
column_trans.fit_transform(X)

## Step 6: Building the Model

### Logistic Regression

In [None]:
# Logistic regression behind the shared preprocessing step,
# scored by 5-fold cross-validated ROC AUC (mean, two decimals).
pipe = make_pipeline(column_trans, lr)
lr_auc_scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
round(lr_auc_scores.mean(), 2)

In [None]:
# Fit the current pipeline on all of X and y (the logistic-regression
# pipeline when the cells are run top to bottom).
pipe.fit(X,y)

In [None]:
# Predictions on the same rows the pipeline was fitted on.
y_pred = pipe.predict(X)
# Column 1 of predict_proba is the probability of the second class.
class_probabilities = pipe.predict_proba(X)
y_pred_probs = class_probabilities[:, 1]
y_pred_probs

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Points along the diagonal, used as the "no skill" reference line.
x1 = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
y1 = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

# roc_curve returns (fpr, tpr, thresholds) in that order; the names now
# match what each array holds. The plotted curve itself is unchanged:
# false positive rate on x, true positive rate on y.
# Note: y_pred_probs were produced on the rows the model was fitted on.
fpr, tpr, _ = roc_curve(y, y_pred_probs)
plt.plot(fpr, tpr)
plt.plot(x1, y1, color='red', marker='_')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.show()

### K Nearest Neighbor

In [None]:
# Sweep the number of neighbours and record the cross-validated accuracy of
# each setting. Previously k was passed as the number of CV folds while the
# classifier stayed fixed at n_neighbors=7, so K was never actually varied.
k_range = range(1, 25)
scores = []
for k in k_range:
    # A fresh classifier per k; the shared `knn` object is left untouched.
    pipe = make_pipeline(column_trans, KNeighborsClassifier(n_neighbors=k))
    # Fixed 5-fold CV so scores are comparable across values of k.
    scores.append(cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean())



In [None]:
# Line plot of the recorded mean accuracies against the swept values.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Testing Accuracy')

In [None]:
# K-nearest-neighbours behind the shared preprocessing step,
# scored by 3-fold cross-validated accuracy.
pipe = make_pipeline(column_trans, knn)
knn_accuracy_scores = cross_val_score(pipe, X, y, cv=3, scoring='accuracy')
knn_accuracy_scores.mean()

### Support Vector Classifier

In [None]:
# Support vector classifier behind the shared preprocessing step,
# scored by 5-fold cross-validated ROC AUC.
pipe = make_pipeline(column_trans, svc)
svc_auc_scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
svc_auc_scores.mean()

## Step 7: Run Model To Make Predictions

In [None]:
# Preview the test frame before preparing it for prediction.
test.head()

In [None]:
def _cabins_listed(cabin_entry):
    """Return the number of space-separated cabin codes; 0 when missing."""
    if pd.isna(cabin_entry):
        return 0
    return len(cabin_entry.split(' '))


# Same engineered feature as on the training frame: cabins per passenger.
test['cabin_multiple'] = test['Cabin'].map(_cabins_listed)

In [None]:
# Remove the placeholder label and the free-text columns, mirroring the
# columns dropped when X was built.
test = test.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])

In [None]:
# Preview the test frame after the column drop.
test.head()

In [None]:
# Rebind `pipe` to the logistic-regression pipeline; earlier cells reused
# the same name for the KNN and SVC pipelines.
pipe = make_pipeline(column_trans, lr)


In [None]:
# Compare the dimensions of the training features and the test frame.
for frame in (X, test):
    print(frame.shape)

In [None]:
# Number of missing values in each test column.
test.isna().sum()

In [None]:
# Keep only test passengers with both Age and Fare present.
# NOTE(review): the dropped rows receive no prediction - confirm that is
# acceptable, or impute the missing values instead.
test = test.dropna(subset=['Age', 'Fare'])


In [None]:
# Fit the pipeline on the full training features and labels.
pipe.fit(X,y)

In [None]:
# Predict a label for every remaining test row and display the result.
# This rebinds y_pred, which earlier held predictions on the training rows.
y_pred = pipe.predict(test)
y_pred