In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading Data
train_path = '/kaggle/input/titanic/train.csv'
train_data = pd.read_csv(train_path)
# Independent copy of the raw training frame, taken before any columns are dropped.
X_copy = train_data.copy()

# Find Missing Data
# Boolean mask over columns: True where the column holds at least one null.
cols_with_missing = train_data.columns[train_data.isnull().any()].tolist()

print('Missing data in following columns: ',cols_with_missing)

# Create X and y
y = train_data['Survived']

# Features: everything except the target and 'Cabin', removed in a single pass.
X = train_data.drop(columns=['Survived', 'Cabin'])

# Get numerical and categorical
# Columns are routed by dtype: int64/float64 -> numerical, object -> categorical.
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64','float64']]
categorical_cols = [cname for cname in X.columns if X[cname].dtype=='object']


In [None]:
# Per-column count of missing entries; only columns with at least one are printed

missing_val_count_by_col = X.isna().sum()
has_missing = missing_val_count_by_col > 0
print(missing_val_count_by_col[has_missing])
print('Categorical cols: ',categorical_cols)

In [None]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [None]:
# Dimensions of the feature frame, then its column labels, one per line.
print(X.shape, X.columns, sep='\n')

In [None]:
# Creating Preprocess

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical columns: fill gaps with a constant (no fill_value is given, so the
# imputer's default constant applies).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories unseen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Preprocessor: route each column group to its transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [None]:
# Define Model & Pipeline

from sklearn.ensemble import RandomForestRegressor

# Baseline forest: 50 trees, fixed seed so repeated runs give the same model.
model = RandomForestRegressor(random_state=0, n_estimators=50)

# Preprocessing and model bundled so both are fitted together.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

In [None]:
# Cross Validation

from sklearn.model_selection import cross_val_score

# The scorer reports negated MAE (higher is better); flip the sign to get plain MAE per fold.
neg_mae_per_fold = cross_val_score(pipeline, X, y,
                                   scoring='neg_mean_absolute_error',
                                   cv=5)
scores = -1 * neg_mae_per_fold
print('MAE scores: ',scores)
print('Average scores: ',scores.mean())

In [None]:
# Improving Scores: Finding optimal n_estimators to prevent Underfitting/Overfitting

def get_scores(n_estimators):
    """Return the mean 5-fold cross-validated MAE for a forest of the given size.

    Builds a fresh RandomForestRegressor (random_state=0) behind the
    notebook-level `preprocessor` and scores it on the notebook-level
    `X` and `y`.

    Parameters
    ----------
    n_estimators : int
        Number of trees passed to RandomForestRegressor.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds.
    """
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=n_estimators,
                                        random_state=0)),
    ])

    # Scorer yields negated MAE; multiply by -1 to recover positive errors.
    fold_errors = -1 * cross_val_score(candidate, X, y,
                                       cv=5,
                                       scoring='neg_mean_absolute_error')

    return fold_errors.mean()

# Sweep n_estimators over 50, 100, ..., 450 and report the cross-validated MAE of each

estimator_grid = range(50, 451, 50)
for n_estimators in estimator_grid:
    mae = get_scores(n_estimators)
    print(f'Number of estimators: {n_estimators} \t\t Mean Absolute Error: {mae}' )
    

In [None]:
# Visual Representation of Lowest MAE
from matplotlib import pyplot as plt

# One cross-validated MAE per candidate forest size (50, 100, ..., 450).
mae_scores = {x: get_scores(x) for x in range(50,450+1,50)}

plt.title("Number of Estimators v. MAE")
plt.plot(list(mae_scores.keys()),list(mae_scores.values()))
# The x-axis carries only the estimator count; MAE is on the y-axis.
plt.xlabel("Number of Estimators")
plt.ylabel("MAE")
# Render explicitly so the cell output is the figure, not the repr of the last call.
plt.show()

In [None]:
# Final model: refit on the full training set with the chosen forest size

# NOTE(review): 150 is hard-coded; presumably read off the sweep results — confirm.
best_estimators = 150

# Forest with the chosen size and a fixed seed.
my_model = RandomForestRegressor(random_state=0, n_estimators=best_estimators)

# Same preprocessing as in cross-validation, followed by the final model.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', my_model),
])

# Fit on every training row.
my_pipeline.fit(X, y)

# Read Test Data
test_path = '/kaggle/input/titanic/test.csv'
# Drop 'Cabin' to mirror the column removed from the training features.
X_test = pd.read_csv(test_path).drop(columns=['Cabin'])

# Predictions
# The model is a regressor, so predict() returns continuous scores rather than
# class labels. Casting straight to int truncates toward zero (0.99 -> 0), which
# would label almost every row 0; threshold at 0.5 first, then cast.
survival_scores = my_pipeline.predict(X_test)
my_predictions = (survival_scores >= 0.5).astype(int)


# Save Predictions to file
output = pd.DataFrame({'PassengerId': X_test['PassengerId'],
                      'Survived': my_predictions})

output.to_csv('submission.csv',index=False)