<a href="https://colab.research.google.com/github/jonitorta/Ejercicios_Machine_learning./blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Main libraries to analysis and visualizations
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

#--------------------------------------------------------
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
train_path = "/kaggle/input/titanic/train.csv"
test_path = "/kaggle/input/titanic/test.csv"

In [None]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(f"train size :{train_df.shape} \ntest size : {test_df.shape}")

In [None]:
train_df.columns

In [None]:
train_df.info()
print("--------------------------------------------------------------")
test_df.info()

In [None]:
train_df.head()

In [None]:
#38% survival rate overall
#75% of passengers were in pclass = 3 (lowest)
#Most people were between 20 and 40 years old
#A lot of passengers were alone: just 25% had one or more siblings/spouses and less than 25% had
#one or more children and/or parents
#Fare has some high outliers.
train_df.describe()

In [None]:
#All names are unique
#Some tickets are duplicated
#A lot of duplication in cabin
#3 different ports
train_df.describe(include=["O"])

In [None]:
#Lets do some data exploration and visualization
train_df.hist(figsize = (10,10))
plt.show()

In [None]:
#Check survival rate with some parameters
#-> Sex, Pclass, Sibs, Age, 
train_df.columns

In [None]:
def survival_rate_per_attribute(name, df=None):
    """Return the mean of ``Survived`` for each distinct value of a column.

    Parameters
    ----------
    name : str
        Column to group by.
    df : pandas.DataFrame, optional
        Frame holding both ``name`` and ``"Survived"``. When omitted, the
        module-level ``train_df`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Two columns (``name`` and ``"Survived"``), one row per group, sorted
        by survival rate from highest to lowest.
    """
    if df is None:
        # Backward-compatible default: fall back to the notebook's training frame.
        df = train_df
    rates = df[[name, "Survived"]].groupby(name, as_index=False).mean()
    return rates.sort_values(by="Survived", ascending=False)

In [None]:
#High class more survival rate
survival_rate_per_attribute("Pclass")

In [None]:
#Female survival rate is higher than men
survival_rate_per_attribute("Sex")

In [None]:
#Lets create interval for age and see survival rate per interval 
train_df["Age"].hist()
plt.show()

In [None]:
Ages = train_df["Age"]
train_df["Age_cat"] = pd.cut(Ages, 
                 bins = [0.0,15.0,50.0,100],
                labels = [0,1,2])

In [None]:
#People below 15 years had a high survival rate (>55%)
survival_rate_per_attribute("Age_cat")

In [None]:
train_df.drop("Age_cat", axis = 1, inplace = True)

In [None]:
#People with family seems to have higher chance to survive.
# Number of relatives aboard = siblings/spouses + parents/children.
family_num = train_df["SibSp"] + train_df["Parch"]
# Vectorized flag: 1 when the passenger has at least one relative aboard, else 0.
train_df["Family"] = (family_num > 0).astype(int)
survival_rate_per_attribute("Family")

In [None]:
train_df.drop("Family", axis = 1, inplace = True)

In [None]:
#Embarked seems important
survival_rate_per_attribute("Embarked")

In [None]:
train_df["Fare"].hist()
plt.show()

In [None]:
#Create a fare range
# pd.cut intervals are right-closed, so a lowest edge of 0.0 would leave
# passengers with Fare == 0 outside every bin (NaN) and drop them from the
# grouped rate below. Use -inf as the lowest edge, like BasicAdder's
# Fare_interval bins.
train_df["Fare_range"] = pd.cut(
    train_df["Fare"],
    [-np.inf, 100.0, 200.0, np.inf],
    labels=[1, 2, 3],
)
#High fare is more likely to survive.
survival_rate_per_attribute("Fare_range")

In [None]:
train_df.drop("Fare_range", axis = 1, inplace = True)

In [None]:
# Separate the target column from the features.
labels = train_df["Survived"]
train_df = train_df.drop("Survived", axis=1)
# Fill missing ports with the most frequent one. The result is assigned back
# to the column: an in-place fillna on a column selection acts on a temporary
# object under pandas copy-on-write and would leave train_df unchanged.
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])

In [None]:
class BasicAdder(TransformerMixin, BaseEstimator):
    """Stateless feature-engineering step for the passenger table.

    Each constructor flag independently switches one operation on or off.
    ``fit`` learns nothing, so every statistic used (fare mean, per-group age
    means, the set of ``Embarked`` values) is computed from the frame handed
    to ``transform``.

    Parameters
    ----------
    add_family : bool
        Add ``Family`` = ``SibSp`` + ``Parch``.
    fare_interval : bool
        Add ``Fare_interval``: ``Fare`` binned at 100 and 200, labelled 0/1/2.
    age_interval : bool
        Add ``Age_interval``: ``Age`` binned into (0, 15], (15, 50], (50, 100],
        labelled 0/1/2.
    binary_sex : bool
        Replace ``Sex`` with 0 for ``'male'`` and 1 for ``'female'``.
    fill_age : bool
        Fill missing ``Age`` with the mean age of the row's (``Sex``,
        ``Pclass``) group.
    fill_fare : bool
        Fill missing ``Fare`` with the column mean.
    cat_embarked : bool
        Add one 0.0/1.0 indicator column per distinct non-missing
        ``Embarked`` value, named after that value.
    """

    def __init__(self, add_family=False, fare_interval=False, age_interval=False,
                 binary_sex=False, fill_age=False, fill_fare=False,
                 cat_embarked=False):
        self.add_family = add_family
        self.fare_interval = fare_interval
        self.age_interval = age_interval
        self.binary_sex = binary_sex
        self.fill_age = fill_age
        self.fill_fare = fill_fare
        self.cat_embarked = cat_embarked

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, DF, y=None):
        """Return a copy of ``DF`` with the enabled operations applied.

        The input frame is never modified.
        """
        X = DF.copy()

        if self.fill_fare:
            # Assign the result back instead of calling fillna(inplace=True)
            # on a column selection, which acts on a temporary object under
            # pandas copy-on-write and would leave X unchanged.
            X["Fare"] = X["Fare"].fillna(X["Fare"].mean())

        if self.age_interval:
            X["Age_interval"] = pd.cut(X["Age"],
                                       bins=[0.0, 15.0, 50.0, 100],
                                       labels=[0, 1, 2])

        if self.cat_embarked:
            # Skip missing values so no indicator column named "nan" is
            # created; rows with a missing port get 0.0 in every indicator.
            for value in X["Embarked"].dropna().unique():
                X[str(value)] = (X["Embarked"] == value).astype(float)

        if self.fare_interval:
            X["Fare_interval"] = pd.cut(X["Fare"],
                                        bins=[-np.inf, 100.0, 200.0, np.inf],
                                        labels=[0, 1, 2])

        if self.add_family:
            X["Family"] = X["SibSp"] + X["Parch"]

        if self.binary_sex:
            X["Sex"] = X["Sex"].map({"male": 0, "female": 1})

        if self.fill_age:
            # Mean age of each (Sex, Pclass) group, aligned row by row; rows
            # whose group has no known age stay missing.
            group_mean_age = X.groupby(["Sex", "Pclass"])["Age"].transform("mean")
            X["Age"] = X["Age"].fillna(group_mean_age)

        return X

In [None]:
class ColDropper(TransformerMixin, BaseEstimator):
    """Pipeline step that removes a fixed set of columns from a DataFrame."""

    def __init__(self, col_names):
        # Column labels to remove in transform().
        self.col_names = col_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the columns listed in ``col_names``."""
        remaining = X.drop(columns=self.col_names)
        return remaining

In [None]:
# Categorical handling: map Sex to 0/1, then add one indicator column per
# Embarked value.
cat_pipeline = Pipeline(steps=[
    ("Binary", BasicAdder(binary_sex=True)),
    ("Embarked", BasicAdder(cat_embarked=True)),
])

# Derived numeric features: binned fare and age, then the relatives count.
num_pipeline = Pipeline(steps=[
    ("To_interval", BasicAdder(fare_interval=True, age_interval=True)),
    ("Add_family", BasicAdder(add_family=True)),
])

# Missing-value imputation for Age and Fare.
fill_pipeline = Pipeline(steps=[
    ("Fill_age", BasicAdder(fill_age=True)),
    ("Fill_fare", BasicAdder(fill_fare=True)),
])
train_df.columns

In [None]:
# End-to-end preprocessing. Step order matters: imputation runs first, the
# derived columns are built from the raw Age/Fare/SibSp/Parch/Embarked
# columns, and only afterwards are those raw columns dropped.
full_pipeline = Pipeline(
    steps = [
    # Impute Age (per Sex/Pclass group mean) and Fare (column mean).
    ("fill", fill_pipeline),
    # Drop identifier-like / free-text columns that are not used as features.
    ("drop_name", ColDropper(["Name"]) ),
    ("drop_passid", ColDropper(["PassengerId"]) ),
    ("drop_ticket", ColDropper(["Ticket"]) ),
    ("drop_cabin", ColDropper(["Cabin"]) ),
    # Sex -> 0/1 and one indicator column per Embarked value.
    ("cat", cat_pipeline),
    # Adds Fare_interval, Age_interval and Family.
    ("num", num_pipeline),
    # The raw inputs of the derived columns are no longer needed.
    ("drop_relative", ColDropper(["SibSp", "Parch"])),
    ("drop_age", ColDropper(["Age"]) ),
    ("drop_fare", ColDropper(["Fare"])),
    ("drop_embarked", ColDropper(["Embarked"]))
])
prepared_df = full_pipeline.fit_transform(train_df)
# NOTE(review): every custom step's fit() is a no-op, so the fare mean, the
# per-group age means and the set of Embarked values are all computed from the
# frame being transformed -- here the test set is imputed from its own values.
prep_test_df = full_pipeline.fit_transform(test_df)
prepared_df.head()

In [None]:
prepared_df.info()
print("-"*40)
prep_test_df.info()

In [None]:
random_forest = RandomForestClassifier()
predictions = cross_val_predict(random_forest, prepared_df, labels, cv=3)

In [None]:
def plot_conf_mtx(cf_matrix):
    """Show a binary confusion matrix as an annotated seaborn heatmap.

    Each cell is labelled with its role (true/false negative/positive), its
    raw count and its share of all samples. The annotations are reshaped to
    2x2, so ``cf_matrix`` is expected to be a 2x2 array.
    """
    cell_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    counts = cf_matrix.flatten()
    shares = counts / np.sum(cf_matrix)

    # One three-line annotation per cell: name, count, percentage of the total.
    annotations = [
        f"{cell_name}\n{count:0.0f}\n{share:.2%}"
        for cell_name, count, share in zip(cell_names, counts, shares)
    ]
    annotations = np.asarray(annotations).reshape(2, 2)

    ax = sns.heatmap(cf_matrix, annot=annotations, fmt='', cmap='Blues')

    ax.set_title('Seaborn Confusion Matrix with labels\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')

    # Tick labels follow the matrix order: negative class first.
    ax.xaxis.set_ticklabels(['False', 'True'])
    ax.yaxis.set_ticklabels(['False', 'True'])

    # Render the figure.
    plt.show()

In [None]:
cnf_mtx = confusion_matrix(labels, predictions)
plot_conf_mtx(cnf_mtx)

In [None]:
# Candidate hyperparameter values for the random-forest search: evenly spaced
# points over each range, truncated to plain Python integers.
n_estimators = np.linspace(200, 2000, 10).astype(int).tolist()
min_samples_split = np.linspace(2, 10, 4).astype(int).tolist()
min_samples_leaf = np.linspace(1, 5, 4).astype(int).tolist()

In [None]:
random_grid = {'n_estimators': n_estimators,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
random_grid

In [None]:
rf = RandomForestClassifier()
rf_search = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 25,
                               cv = 3)

In [None]:
rf_search.fit(prepared_df, labels)

In [None]:
# Rebuild the forest from every hyperparameter the search tuned. Unpacking the
# whole dict keeps this in sync with random_grid: a key added to the search
# space is picked up here without further edits.
best_params = rf_search.best_params_
prepared_rf = RandomForestClassifier(**best_params)

In [None]:
predictions = cross_val_predict(prepared_rf, prepared_df, labels, cv=3)
tunned_cfm = confusion_matrix(labels, predictions)
plot_conf_mtx(tunned_cfm)

In [None]:
prep_test_df = prep_test_df[ list(prepared_df.columns) ]
prepared_rf.fit(prepared_df, labels)
predictions = prepared_rf.predict(prep_test_df)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")