# Solving Kaggle with ChatGPT and CoPilot

All the code here was generated by ChatGPT and Copilot as far as possible, although some human edits were made.

https://www.kaggle.com/competitions/spaceship-titanic/
https://chat.openai.com/chat

In [None]:
# Import libraries for data manipulation and analysis
import numpy as np
import pandas as pd

# Import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set default styling for plots
plt.style.use('fivethirtyeight')

# Import scikit-learn for machine learning
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier



In [None]:
# Locations of the training and test CSV files
train_file = 'train.csv'
test_file = 'test.csv'

# Load both files; the first row of each CSV supplies the column names
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path, header=0) for csv_path in (train_file, test_file)
)

# Preview the first few rows of each dataframe under its own heading
for label, frame in (("Training data:", df_train_raw), ("Test data:", df_test_raw)):
    print(label)
    print(frame.head())

In [None]:
# Print a summary (including the data type of each column) for the
# training dataframe first, then the test dataframe
for frame in (df_train_raw, df_test_raw):
    print(frame.info())

In [None]:
# Preview what a one-hot encoding of the 'HomePlanet' column looks like
homeplanet_values = df_train_raw['HomePlanet']
homeplanet_onehot = pd.get_dummies(homeplanet_values)
print(homeplanet_onehot.head())

In [None]:
# Columns whose distinct values are listed for inspection
inspected_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP", "Name"]

for col_name in inspected_cols:
    unique_values = df_train_raw[col_name].unique()
    print(f"All unique values in {col_name}: {unique_values}")


In [None]:
# Drop every row that contains at least one missing value
df_train_filtered = df_train_raw.dropna()
df_test_filtered = df_test_raw.dropna()

# Row counts that survive the filtering
num_rows = df_train_filtered.shape[0]
num_rows_test = df_test_filtered.shape[0]

# Number of rows the filter discarded from each dataset
removed_train = df_train_raw.shape[0] - num_rows
removed_test = df_test_raw.shape[0] - num_rows_test
print(f"Number of rows removed (TRAIN): {removed_train}")
print(f"Number of rows removed (TEST): {removed_test}")

# The same loss expressed as a percentage of the original row count
print(f"Percentage of rows removed (TRAIN): {removed_train / df_train_raw.shape[0] * 100:.2f}%")
print(f"Percentage of rows removed (TEST): {removed_test / df_test_raw.shape[0] * 100:.2f}%")

In [None]:
# Columns kept for modelling: the input features plus the "Transported"
# label. "Cabin" is deliberately left out.
training_cols_names = [
    "HomePlanet", "CryoSleep", "Destination", "Age", "VIP",
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
    "Transported",
]

# Reduce both dataframes to the listed columns. filter(items=...) keeps only
# the listed labels a frame actually has, so a column absent from one frame
# is simply not selected there.
df_train_filtered_cols, df_test_filtered_cols = (
    frame.filter(items=training_cols_names)
    for frame in (df_train_filtered, df_test_filtered)
)


In [None]:
# Categorical feature columns to expand into one-hot indicator columns
# ("Cabin" is not encoded)
onehot_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

# For the training data the "Transported" label is one-hot encoded as well
train_onehot_cols = onehot_cols + ["Transported"]
df_train_filtered_cols_onehot = pd.get_dummies(
    df_train_filtered_cols,
    columns=train_onehot_cols,
)

# The test data only gets the feature columns encoded
df_test_filtered_cols_onehot = pd.get_dummies(
    df_test_filtered_cols,
    columns=onehot_cols,
)

print(df_train_filtered_cols_onehot.head())


In [None]:
# Scale the numeric feature columns to the [0, 1] range learned from the
# training data.
scaler = MinMaxScaler()

# Numeric columns to normalize
cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Fit the scaler on the training data only, then apply that same fitted
# scaling to the test data. Re-fitting on the test set would map the two
# datasets with different min/max values, so the model would see test
# features on a different scale than the one it was trained on.
df_train_filtered_cols_onehot[cols] = scaler.fit_transform(df_train_filtered_cols_onehot[cols])
df_test_filtered_cols_onehot[cols] = scaler.transform(df_test_filtered_cols_onehot[cols])

print(df_train_filtered_cols_onehot.head())

In [None]:
# Report the (rows, columns) shape of each preprocessed dataframe
for label, frame in (
    ('Train data shape:', df_train_filtered_cols_onehot),
    ('Test data shape:', df_test_filtered_cols_onehot),
):
    print(label)
    print(frame.shape)

In [None]:

# Label columns produced by one-hot encoding "Transported"
target = ["Transported_True", "Transported_False"]

# Everything except the label columns is a model input
train_df = df_train_filtered_cols_onehot.drop(columns=target)
features = train_df.columns

# Build the classifier with hyperparameters chosen by ChatGPT: at most 500
# iterations, mini-batches of 256, and early stopping monitored on a 20%
# validation split. The learning rate is not overridden.
clf = MLPClassifier(
    max_iter=500,
    batch_size=256,
    early_stopping=True,
    validation_fraction=0.2,
    verbose=True,
)

# Announce the start of training
print("Starting training...")

# Train on the feature columns against both label columns
clf.fit(train_df[features], df_train_filtered_cols_onehot[target])

# Announce that training has finished
print("Training complete!")

In [None]:
# Use the trained classifier to make predictions on the test dataset
# Line the test matrix up with the exact feature columns (and order) the
# classifier was trained on. A category that never appears in the filtered
# test rows gets an all-zero indicator column; columns the model never saw
# are dropped. Without this, any such mismatch makes predict() fail.
test_features = df_test_filtered_cols_onehot.reindex(columns=features, fill_value=0)
predictions = clf.predict(test_features)

In [None]:
# Column names of the encoded test dataframe
cols1 = set(df_test_filtered_cols_onehot.columns)

# Column names of the training feature dataframe
cols2 = set(train_df.columns)

# Columns present in exactly one of the two dataframes
diff_cols = cols1 ^ cols2

# Show the mismatching columns (an empty set means the frames line up)
print(diff_cols)

In [None]:
# Show the raw predictions followed by their shape, one per line
print(predictions, predictions.shape, sep="\n")

In [None]:
# Preview the filtered test data, then the raw test data
for frame in (df_test_filtered, df_test_raw):
    print(frame.head())

# Shape of the filtered test data
print(df_test_filtered.shape)

In [None]:
# Pair each passenger id from the filtered test data with the two columns of
# the prediction array (column 0 -> "Transported_True",
# column 1 -> "Transported_False")
submissions_filtered = pd.DataFrame(
    {
        "PassengerId": df_test_filtered["PassengerId"],
        "Transported_True": predictions[:, 0],
        "Transported_False": predictions[:, 1],
    }
)

# Preview the assembled dataframe
print(submissions_filtered.head())

In [None]:
# Derive the boolean "Transported" column from the first prediction column:
# any non-zero value becomes True
transported_flags = predictions[:, 0] != 0
submissions_filtered["Transported"] = transported_flags

# Preview the dataframe with the new column
print(submissions_filtered.head())


In [None]:
# Start the submission from the predicted rows, keeping only the two columns
# written to the final CSV
submission = submissions_filtered[["PassengerId", "Transported"]]

# Rows of the raw test data whose "PassengerId" has no prediction because
# the row was dropped for having missing values
missing_rows = df_test_raw[~df_test_raw["PassengerId"].isin(submissions_filtered["PassengerId"])]

# Give every passenger without a prediction a default "Transported" value of
# False and add them after the predicted rows. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
submission = pd.concat(
    [submission, missing_rows[["PassengerId"]].assign(Transported=False)],
    ignore_index=True,
    sort=False,
)

In [None]:
# Compare the shape of the raw test data with the shape of the submission
for frame in (df_test_raw, submission):
    print(frame.shape)

In [None]:
# Save the submission as CSV: header row included, index column omitted
submission.to_csv(
    'file_with_headers.csv',
    header=True,
    index=False,
)