<a href="https://colab.research.google.com/github/leandrohbar/Machine_learning/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [401]:
import pandas as pd

# Folder that holds the three Titanic CSV files. The path looks like a Google
# Drive folder as seen from Colab (presumably Drive has to be mounted first --
# confirm); change DATA_DIR alone to run the notebook against another location.
DATA_DIR = '/content/drive/MyDrive/Arquivos CSV/TITANIC'

# Individual file paths, all derived from DATA_DIR
titanicTrain = f'{DATA_DIR}/train.csv'
titanicTest = f'{DATA_DIR}/test.csv'
submission = f'{DATA_DIR}/gender_submission.csv'

# Load the training split, the test split and the submission template
train_data = pd.read_csv(titanicTrain)
test_data = pd.read_csv(titanicTest)
sub_file = pd.read_csv(submission)

In [None]:
# Preview the first rows of the raw training data.
train_data.head()

In [None]:
# Print the column summary of the raw training data (useful for spotting columns with missing values).
train_data.info()

In [402]:
def process_ticket_data(data_frame):
    """
    Clean passenger names and split each ticket into a prefix and a number.

    The input is never modified; all changes are made on a copy.

    Args:
        data_frame (DataFrame): Frame with string columns 'Name' and 'Ticket'.

    Returns:
        DataFrame: A copy of the input in which
            * 'Name' has punctuation stripped from the edges of every
              space-separated word,
            * 'Ticket_number' holds the last space-separated token of 'Ticket',
            * 'Ticket_item' holds everything before that token, or the string
              "None" when the ticket consists of a single token.
    """
    # Characters removed from both ends of each word of a name
    edge_punctuation = ",()[].\"'"

    def _clean_name(raw_name):
        words = raw_name.split(" ")
        return " ".join(word.strip(edge_punctuation) for word in words)

    def _ticket_number(raw_ticket):
        *_, number = raw_ticket.split(" ")
        return number

    def _ticket_item(raw_ticket):
        *prefix_tokens, _ = raw_ticket.split(" ")
        # A ticket with no prefix is labelled with the literal string "None"
        return " ".join(prefix_tokens) if prefix_tokens else "None"

    processed = data_frame.copy()
    processed['Name'] = processed['Name'].apply(_clean_name)
    processed['Ticket_number'] = processed['Ticket'].apply(_ticket_number)
    processed['Ticket_item'] = processed['Ticket'].apply(_ticket_item)
    return processed

# Run the same name/ticket preprocessing over the train and test DataFrames
train_d, test_d = map(process_ticket_data, (train_data, test_data))


In [None]:
# Preview the processed test data, including the derived Ticket_number and Ticket_item columns.
test_d.head()

In [403]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [404]:
# Build the list of model input columns

# Start from every column of the processed training frame ...
inputFeatures = train_d.columns.tolist()

# ... then take out, one by one, the target ('Survived') and the columns that
# are not fed to the model. A missing column raises ValueError here.
for excluded_column in ('Ticket', 'PassengerId', 'Survived', 'Name'):
    inputFeatures.remove(excluded_column)

In [405]:
# Assemble the target (y) and the feature matrices (X, X_test)

# Target to predict: the 'Survived' column of the training frame
y = train_d['Survived']

# Feature matrices: the same 'inputFeatures' columns taken from each split
X = train_d.loc[:, inputFeatures]
X_test = test_d.loc[:, inputFeatures]

In [None]:
# Handling Missing Values and One-Hot Encoding

# Import the required libraries
from sklearn.impute import SimpleImputer

# Columns expanded into one indicator column per distinct value
categorical_columns = ['Sex', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']

# One-hot encode the training and the test features with the same column list
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns)

# Identify columns with some missing values in each split
columns_with_some_nulls = X_encoded.columns[X_encoded.isnull().any()]
columns_with_some_nulls_test = X_test_encoded.columns[X_test_encoded.isnull().any()]

# Impute every column that has a gap in EITHER split, so that a column which is
# complete in training but incomplete in test is still filled. Only columns
# present in both frames qualify; the list keeps the training column order.
null_columns = set(columns_with_some_nulls) | set(columns_with_some_nulls_test)
test_column_names = set(X_test_encoded.columns)
columns_to_impute = [
    column for column in X_encoded.columns
    if column in null_columns and column in test_column_names
]

# Create a SimpleImputer with the 'mean' strategy
imputer = SimpleImputer(strategy='mean')

# Skip imputation entirely when nothing is missing (the imputer cannot be
# fitted on a frame with zero columns).
if columns_to_impute:
    # Learn the column means from the training data only ...
    X_encoded[columns_to_impute] = imputer.fit_transform(X_encoded[columns_to_impute])

    # ... and reuse those training means for the test data. Calling
    # fit_transform here would refit the imputer on the test set.
    X_test_encoded[columns_to_impute] = imputer.transform(X_test_encoded[columns_to_impute])

# Display information about the encoded training and test data
X_encoded.info()
X_test_encoded.info()


In [410]:
# Keep only the columns that the encoded train and test frames have in common

# Column lists of both frames before alignment
colunas_df1 = X_encoded.columns.tolist()
colunas_df2 = X_test_encoded.columns.tolist()

# Sets give constant-time membership checks for the comparisons below
train_column_names = set(colunas_df1)
test_column_names = set(colunas_df2)

# Columns present in the test frame but absent from the training frame
colunas_diferentes_df2 = [coluna for coluna in colunas_df2 if coluna not in train_column_names]

# Columns present in the training frame but absent from the test frame
colunas_diferentes_df1 = [coluna for coluna in colunas_df1 if coluna not in test_column_names]

# Shared columns, in training order. Selecting both frames with this one list
# drops the unmatched columns AND guarantees an identical column order, so the
# model sees the same feature layout at fit time and at predict time.
shared_columns = [coluna for coluna in colunas_df1 if coluna in test_column_names]

X_encoded = X_encoded[shared_columns]
X_test_encoded = X_test_encoded[shared_columns]

assert list(X_encoded.columns) == list(X_test_encoded.columns), "train/test feature columns differ"


In [411]:
# Splitting the Data into Training and Validation Sets

# Split the aligned features (X_encoded) and the target (y) into a training part
# (train_X, train_y) and a validation part (aval_X, aval_y).
# No test_size/train_size is passed, so scikit-learn's default proportion applies.
# The fixed 'random_state' makes the split reproducible across runs.
train_X, aval_X, train_y, aval_y = train_test_split(X_encoded, y, random_state=1)


In [424]:
# Validation MAE of a DecisionTreeRegressor whose output is thresholded into 0/1 labels

def get_mae(max_leaf_nodes, train_X, aval_X, train_y, aval_y, threshold=0.5):
    """
    Fit a size-limited decision tree and score it on the validation split.

    A DecisionTreeRegressor is trained on (train_X, train_y); its continuous
    predictions for aval_X are turned into 0/1 labels with `threshold`, and the
    mean absolute error of those labels against aval_y is returned.

    Args:
        max_leaf_nodes (int): Upper bound on the number of leaves of the tree.
        train_X: Training features.
        aval_X: Validation features.
        train_y: Training target.
        aval_y: Validation target.
        threshold (float): Predictions strictly greater than this value become
            1, all others become 0. Defaults to 0.5.

    Returns:
        float: Mean absolute error between aval_y and the thresholded
            predictions.
    """
    # random_state is fixed so repeated calls with the same data give the same tree
    model_train = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model_train.fit(train_X, train_y)

    # Binarize the continuous regression output
    predicted_labels = [1 if p > threshold else 0 for p in model_train.predict(aval_X)]

    # mean_absolute_error expects (y_true, y_pred); the value itself does not
    # depend on the order, but the call now matches the documented signature.
    return mean_absolute_error(aval_y, predicted_labels)

# Candidate leaf budgets to compare
max_leaf_nodes = [50, 65, 70, 90, 95, 99, 100, 110, 120]

# Validation MAE for each candidate, keyed by the candidate value
scores = dict(
    (leaf_size, get_mae(leaf_size, train_X, aval_X, train_y, aval_y))
    for leaf_size in max_leaf_nodes
)

# Candidate with the lowest validation MAE (the earliest one wins on ties)
best_tree_size = min(scores, key=lambda size: scores[size])


In [None]:
# Final model: a DecisionTreeRegressor sized by the leaf-budget search

# Same fixed seed as in the search; the leaf limit is the selected best_tree_size
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_tree_size)

# Train on all rows of the preprocessed training features (X_encoded) and labels (y)
final_model.fit(X_encoded, y)


In [423]:
# Score the final model on the preprocessed training data (X_encoded, y)
# NOTE: these are the same rows passed to final_model.fit, so this is a
# training-set error, not an estimate of performance on unseen data.

# Cut-off that turns the regressor's continuous output into a 0/1 label
threshold = 0.5

# Predict on the training features and binarize every score in one pass
final_predict = [int(score > threshold) for score in final_model.predict(X_encoded)]

# Mean Absolute Error (MAE) between the binarized predictions and the training labels
mae_ = mean_absolute_error(final_predict, y)


In [422]:
# Predict survival for the preprocessed test data (X_test_encoded)

# Cut-off that turns the regressor's continuous output into a 0/1 label
threshold = 0.5

# Run the final model on the test features and binarize every score in one pass
survived_final = [int(score > threshold) for score in final_model.predict(X_test_encoded)]


In [419]:
# Store the model's 0/1 predictions in the 'Survived' column of the submission frame.
# NOTE(review): survived_final is a plain list, so values are assigned by position;
# this assumes gender_submission.csv lists passengers in the same order as test.csv -- confirm.
sub_file['Survived'] = survived_final

In [None]:
# Preview the first rows of the submission frame after filling in the predictions.
sub_file.head()

In [421]:
# Write the submission frame to 'Submission.csv' (relative to the working directory), leaving out the index column.
sub_file.to_csv('Submission.csv', index=False)