In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
titanic_df=pd.read_csv('/content/train.csv')

In [None]:
titanic_df.head(10)

In [None]:
titanic_df.shape

In [None]:
# Remove the columns that are not used as model inputs. Reassigning instead of
# mutating in place, and ignoring columns that are already gone, keeps this
# cell safe to run more than once (the in-place version raises KeyError on a
# second run).
titanic_df = titanic_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
                             errors='ignore')

In [None]:
titanic_df.head()

In [None]:
# Number of missing values in each column. (Filtering to the rows that contain
# a null and calling .count() reports the NON-null cells of those rows, so the
# columns that actually hold the gaps show the smallest numbers.)
titanic_df.isnull().sum()

In [None]:
titanic_df=titanic_df.dropna()

In [None]:
titanic_df.shape

In [None]:
# Re-check after dropna(): every column should now report 0 missing values.
titanic_df.isnull().sum()

In [None]:
titanic_df.describe()

In [None]:
# Scatter of passenger age against the survival flag, drawn through the Axes
# object returned by plt.subplots.
fig, ax = plt.subplots(figsize=(12, 8))

ax.scatter(titanic_df['Age'], titanic_df['Survived'])

ax.set_xlabel('Age')
ax.set_ylabel('Survived')

In [None]:
pd.crosstab(titanic_df['Sex'], titanic_df['Survived'])

In [None]:
pd.crosstab(titanic_df['Pclass'], titanic_df['Survived'])

In [None]:
# Pairwise correlations over the numeric columns only. 'Sex' and 'Embarked'
# are not encoded until later cells, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older releases silently skipped them).
titanic_data_corr = titanic_df.select_dtypes(include='number').corr()
titanic_data_corr

In [None]:
# Heatmap of the correlation matrix; annot=True writes each value in its cell.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(titanic_data_corr, annot=True, ax=ax)

In [None]:
from sklearn import preprocessing

# Replace the 'Sex' column with integer codes. The fitted encoder stays in
# `label_encoding` so its classes_ can be inspected afterwards.
label_encoding = preprocessing.LabelEncoder()
label_encoding.fit(titanic_df['Sex'].astype(str))
titanic_df['Sex'] = label_encoding.transform(titanic_df['Sex'].astype(str))

titanic_df.head()

In [None]:
label_encoding.classes_

In [None]:
titanic_df=pd.get_dummies(titanic_df, columns=['Embarked'])
titanic_df.head()

In [None]:
# Shuffle all rows and rebuild a fresh 0..n-1 index. The fixed seed makes the
# shuffle, and therefore the CSV written afterwards, identical on every
# Restart & Run All.
titanic_df = titanic_df.sample(frac=1, random_state=42).reset_index(drop=True)
titanic_df.head()

In [None]:
titanic_df.to_csv('/content/train_processed.csv', index=False)

In [None]:
!ls 

# Training the model



### Binary Classification -- Logistic Regression

In [None]:
titanic_df=pd.read_csv('/content/train_processed.csv')
titanic_df.head()

In [None]:
titanic_df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors and hold out 20% of the rows for
# testing. A fixed random_state makes the split, and every metric computed
# from it in the following cells, reproducible across kernel restarts.
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
y_test

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
logistic_model=LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

In [None]:
logistic_model.fit(X_train, y_train)

In [None]:
y_pred=logistic_model.predict(X_test)

In [None]:
pred_results=pd.DataFrame({'y_test':y_test,
                          'y_pred':y_pred})

In [None]:
pred_results.head()

In [None]:
titanic_crosstab=pd.crosstab(pred_results.y_pred, pred_results.y_test)

titanic_crosstab

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Score the held-out predictions and print each metric.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("accuracy_score: ", acc)
print("precision_score: ", prec)
print("recall_score: ", recall)  # label previously misspelled as "recalll_score"

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
titanic_df.head()

In [None]:
# Every column except the target is a feature. Selecting by name (instead of
# slicing off position 0) keeps 'Survived' out of the feature list even if the
# column order of the CSV changes; the remaining column order is preserved.
FEATURES = [col for col in titanic_df.columns if col != 'Survived']

FEATURES

## Create helper functions

In [None]:
result_dict={}

In [None]:
def summarize_classification(y_test, y_pred):
  """Score a set of predictions against the true labels.

  Args:
    y_test: True labels.
    y_pred: Predicted labels, compared against y_test by each metric.

  Returns:
    dict with the keys 'accuracy' (fraction of correct predictions),
    'precision', 'recall' and 'accuracy_count' (number of correctly
    predicted labels), in that order.
  """
  summary = {}
  summary['accuracy'] = accuracy_score(y_test, y_pred, normalize=True)
  summary['precision'] = precision_score(y_test, y_pred)
  summary['recall'] = recall_score(y_test, y_pred)
  # normalize=False switches accuracy_score from a fraction to a raw count.
  summary['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)
  return summary

In [None]:
def build_model(classifier_fn,
                name_of_y_cols,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
  """Train a classifier on a random split of `dataset` and score it.

  Args:
    classifier_fn: Callable taking (x_train, y_train) and returning a fitted
      model that exposes .predict().
    name_of_y_cols: Column label used to select the target from `dataset`.
    names_of_x_cols: Column labels used to select the features.
    dataset: DataFrame holding both the feature and the target columns.
    test_frac: Fraction of rows held out for testing.
    random_state: Seed forwarded to train_test_split. The default (None)
      keeps the previous behaviour of a different split on every call; pass
      an int to make the split, and therefore the reported scores, repeatable
      and comparable across classifiers.

  Returns:
    dict with 'training' and 'test' (each the dict produced by
    summarize_classification) and 'confusion_matrix' (crosstab of predicted
    versus actual test labels).
  """
  X = dataset[names_of_x_cols]
  Y = dataset[name_of_y_cols]

  x_train, x_test, y_train, y_test = train_test_split(
      X, Y, test_size=test_frac, random_state=random_state)

  model = classifier_fn(x_train, y_train)

  y_pred = model.predict(x_test)

  # Also predict on the training rows so train and test scores can be compared.
  y_pred_train = model.predict(x_train)

  train_summary = summarize_classification(y_train, y_pred_train)
  test_summary = summarize_classification(y_test, y_pred)

  pred_results = pd.DataFrame({'y_test': y_test,
                               'y_pred': y_pred})

  # Rows are predicted labels, columns are actual labels.
  model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

  return {'training': train_summary,
          'test': test_summary,
          'confusion_matrix': model_crosstab}

In [None]:
def compare_results(results=None):
  """Print the training and test scores of every recorded classifier.

  Args:
    results: Mapping of model name -> dict holding a 'training' and a 'test'
      score dict (the shape build_model returns). When omitted, the
      notebook-level `result_dict` is used, so existing `compare_results()`
      calls behave exactly as before.
  """
  if results is None:
    results = result_dict  # resolved at call time, not at definition time

  for key, summary in results.items():
    print('Classification: ', key)

    for heading, split in (('Training Data', 'training'), ('Test Data', 'test')):
      print()
      print(heading)
      for score, value in summary[split].items():
        print(score, value)

    print()

## Build and train using the functions

In [None]:
def logistic_fn(X_train, y_train):
  """Fit a LogisticRegression (liblinear solver) and return the fitted model.

  Args:
    X_train: Training features.
    y_train: Training labels.
  """
  classifier = LogisticRegression(solver='liblinear')
  return classifier.fit(X_train, y_train)

In [None]:
# Train and score logistic regression on all features, then print every
# result recorded so far.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_cols='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

In [None]:
def linear_discriminant_fn(X_train, y_train, solver='svd'):
  """Fit a LinearDiscriminantAnalysis model and return it.

  Args:
    X_train: Training features.
    y_train: Training labels.
    solver: Forwarded to LinearDiscriminantAnalysis(solver=...).
  """
  classifier = LinearDiscriminantAnalysis(solver=solver)
  return classifier.fit(X_train, y_train)

In [None]:
# Train and score LDA on every feature except the last one in FEATURES, then
# print every result recorded so far.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_cols='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

In [None]:
def quadratic_discrimininat_fn(X_train, y_train):
  """Fit a QuadraticDiscriminantAnalysis model with default settings and return it.

  Args:
    X_train: Training features.
    y_train: Training labels.
  """
  classifier = QuadraticDiscriminantAnalysis()
  return classifier.fit(X_train, y_train)

In [None]:
# Drop one column from the very end of FEATURES so that the Embarked
# categorical variable is dummy encoded rather than one-hot encoded.
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discrimininat_fn,
    name_of_y_cols='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

In [None]:
def sgd_fn(X_train, y_train, max_iter=10000, tol=1e-3):
  """Fit an SGDClassifier and return the fitted model.

  Args:
    X_train: Training features.
    y_train: Training labels.
    max_iter: Forwarded to SGDClassifier(max_iter=...).
    tol: Tolerance value, forwarded to SGDClassifier(tol=...).
  """
  classifier = SGDClassifier(max_iter=max_iter, tol=tol)
  return classifier.fit(X_train, y_train)

In [None]:
# Train and score the SGD classifier on all features, then print every result
# recorded so far.
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_cols='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()