<a href="https://colab.research.google.com/github/kaggle-challenge-group/machine_learning_model/blob/main/kaggle_machine_lrnmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # load the dataset from the google drive
# from google.colab import drive
# drive.mount('/content/drive')
# import multiprocessing
# img_count = multiprocessing.cpu_count()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Load the test dataset
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')

In [4]:
import pandas as pd

def analyze_group_consistency(df, id_col='PassengerId', columns_to_analyze=None):

  # Split 'id_col' into 'Group' and 'Member'
  df[['Group', '_']] = df[id_col].str.split('_', expand=True)

  # Define a function to check if all values in a column of a group are the same
  def all_same(group, column):
      return group[column].nunique() == 1

  # Analyze each group
  groups = df.groupby('Group')
  total_groups = groups.ngroups

  for column in columns_to_analyze:
      same_value_count = sum(all_same(group, column) for _, group in groups)
      same_value_pct = same_value_count / total_groups * 100
      print(f"Percentage of groups with the same {column}: {same_value_pct:.2f}%")


columns_to_analyze = ['HomePlanet', 'Destination', 'Cabin']
analyze_group_consistency(train, id_col='PassengerId', columns_to_analyze=columns_to_analyze)

analyze_group_consistency(test, id_col='PassengerId', columns_to_analyze=columns_to_analyze)

Percentage of groups with the same HomePlanet: 98.23%
Percentage of groups with the same Destination: 86.81%
Percentage of groups with the same Cabin: 91.64%
Percentage of groups with the same HomePlanet: 98.50%
Percentage of groups with the same Destination: 86.81%
Percentage of groups with the same Cabin: 89.75%


In [5]:
def fill_missing_with_group_data(df, columns):
    """
    Fills missing values in specified columns based on the most common value within each group.

    Parameters:
    - df: DataFrame containing the data.
    - columns: List of column names to fill missing values for.
    """
    # Group by 'Group' and then apply the filling logic
    for column in columns:
        # Use transform to apply a function while keeping the original DataFrame index
        df[column] = df.groupby('Group')[column].transform(lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x)

    return df

# Columns to fill missing values for
columns_to_fill = ['HomePlanet', 'Destination', 'Cabin']

# Apply the function to fill missing values based on group data
test = fill_missing_with_group_data(test, columns_to_fill)
train = fill_missing_with_group_data(train, columns_to_fill)
# Check if there are still missing values
print(train[columns_to_fill].isnull().sum())
print(test[columns_to_fill].isnull().sum())


HomePlanet     111
Destination    103
Cabin           99
dtype: int64
HomePlanet     46
Destination    51
Cabin          63
dtype: int64


In [6]:
test_copy = test.copy()

In [7]:
test_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,_
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,13,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,18,1
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,19,1
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,21,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,23,1


In [8]:
train_copy = train.copy()

In [9]:
train_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,_
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1


In [10]:
test_copy.drop(['Group', ''], axis=1, inplace=True, errors='ignore')

In [11]:
test_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,_
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,1
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,1


In [12]:
train_copy.drop(['Group', ''], axis=1, inplace=True, errors='ignore')

In [13]:
train_copy['DataType'] = 'train'
test_copy['DataType'] = 'test'

# Combine the datasets
combined = pd.concat([train_copy, test_copy], ignore_index=True)

# Preprocess the 'Cabin' column by splitting it and then dropping the 'Num' column
combined[['Deck', 'Num', 'Side']] = combined['Cabin'].str.split('/', expand=True)
combined.drop('Num', axis=1, inplace=True)  # Drop the 'Num' column as it's not needed

# Display the modified combined DataFrame structure to verify changes
combined.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,_,DataType,Deck,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,train,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1,train,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,1,train,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,2,train,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1,train,F,S


In [14]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# Assuming 'combined' is your dataset after the initial preprocessing steps

# Convert 'VIP' and 'CryoSleep' boolean columns to integers
# Handling 'VIP' and 'CryoSleep' boolean columns, keeping NaN values unchanged
combined['VIP'] = combined['VIP'].astype(float)
combined['CryoSleep'] = combined['CryoSleep'].astype(float)


# Encode 'Deck' and 'Side' using a LabelEncoder or custom mapping
# This example uses LabelEncoder for simplicity
deck_encoder = LabelEncoder()
side_encoder = LabelEncoder()

# Fit and transform 'Deck' and 'Side' to integers
combined['Deck'] = deck_encoder.fit_transform(combined['Deck'])
combined['Side'] = side_encoder.fit_transform(combined['Side'])

# Select columns for MICE imputation
columns_to_impute = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side', 'VIP', 'CryoSleep']

# Instantiate the MICE imputer
mice_imputer = IterativeImputer(max_iter=10, random_state=0)

# Perform the imputation
combined[columns_to_impute] = mice_imputer.fit_transform(combined[columns_to_impute])


In [15]:
combined.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,_,DataType,Deck,Side
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,train,1.0,0.0
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1,train,5.0,1.0
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,1,train,0.0,1.0
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,2,train,0.0,1.0
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1,train,5.0,1.0


In [16]:
combined.isnull().sum()

PassengerId        0
HomePlanet       157
CryoSleep          0
Cabin            162
Destination      154
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             294
Transported     4277
_                  0
DataType           0
Deck               0
Side               0
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Function to predict missing values for a given column (e.g., 'HomePlanet' or 'Destination')
def predict_missing_values(df, column, features_columns):
    # Split the data based on known and unknown values
    known_values = df[df[column].notna()]
    unknown_values = df[df[column].isna()]

    # Prepare features (X) and target (y)
    X = known_values[features_columns]
    y = known_values[column]

    # Encode the target variable since it's categorical
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)

    # Splitting the data for model validation
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Training the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predicting on the test set for validation (optional)
    y_pred = model.predict(X_test)
    print(f"Accuracy on test set for {column}:", accuracy_score(y_test, y_pred))

    # Predicting missing values
    X_unknown = unknown_values[features_columns]
    predicted_values_encoded = model.predict(X_unknown)

    # Decoding the predicted labels back to original categories
    predicted_values = encoder.inverse_transform(predicted_values_encoded)

    # Filling in the predicted values into the original dataset
    df.loc[df[column].isna(), column] = predicted_values

# Specify the features to use for prediction
features_columns = ['Age', 'VIP', 'CryoSleep', 'Deck', 'Side']  # Include other relevant columns as needed

# Predict missing 'HomePlanet'
predict_missing_values(combined, 'HomePlanet', features_columns)

# Predict missing 'Destination'
predict_missing_values(combined, 'Destination', features_columns)


Accuracy on test set for HomePlanet: 0.8560280920795942
Accuracy on test set for Destination: 0.6688767550702028


In [18]:
# Separate the combined dataset back into the original train and test datasets
train_processed = combined[combined['DataType'] == 'train'].drop('DataType', axis=1)
test_processed = combined[combined['DataType'] == 'test'].drop('DataType', axis=1)


In [19]:
train_processed.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,_,Deck,Side
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1.0,0.0
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1,5.0,1.0
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,1,0.0,1.0
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,2,0.0,1.0
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1,5.0,1.0


In [20]:
test_processed.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,_,Deck,Side
8693,0013_01,Earth,1.0,G/3/S,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,,1,6.0,1.0
8694,0018_01,Earth,0.0,F/4/S,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,,1,5.0,1.0
8695,0019_01,Europa,1.0,C/0/S,55 Cancri e,31.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,,1,2.0,1.0
8696,0021_01,Europa,0.0,C/1/S,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,,1,2.0,1.0
8697,0023_01,Earth,0.0,F/5/S,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,,1,5.0,1.0


In [21]:
train_processed.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin            99
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
_                 0
Deck              0
Side              0
dtype: int64

In [22]:
# Apply one-hot encoding to 'HomePlanet' and 'Destination' for both datasets
train_final_encoded = pd.get_dummies(train_processed, columns=['HomePlanet', 'Destination'])
test_final_encoded = pd.get_dummies(test_processed, columns=['HomePlanet', 'Destination'])


In [23]:
train_final_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8693 entries, 0 to 8692
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                8693 non-null   object 
 1   CryoSleep                  8693 non-null   float64
 2   Cabin                      8594 non-null   object 
 3   Age                        8693 non-null   float64
 4   VIP                        8693 non-null   float64
 5   RoomService                8693 non-null   float64
 6   FoodCourt                  8693 non-null   float64
 7   ShoppingMall               8693 non-null   float64
 8   Spa                        8693 non-null   float64
 9   VRDeck                     8693 non-null   float64
 10  Name                       8493 non-null   object 
 11  Transported                8693 non-null   object 
 12  _                          8693 non-null   object 
 13  Deck                       8693 non-null   float

In [24]:
train_final_encoded.head()

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Transported,_,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0.0,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,1,1.0,0.0,0,1,0,0,0,1
1,0002_01,0.0,F/0/S,24.0,0.0,109.0,9.0,25.0,549.0,44.0,...,True,1,5.0,1.0,1,0,0,0,0,1
2,0003_01,0.0,A/0/S,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,...,False,1,0.0,1.0,0,1,0,0,0,1
3,0003_02,0.0,A/0/S,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,...,False,2,0.0,1.0,0,1,0,0,0,1
4,0004_01,0.0,F/1/S,16.0,0.0,303.0,70.0,151.0,565.0,2.0,...,True,1,5.0,1.0,1,0,0,0,0,1


In [25]:
def convert_bool(col):
    if col is True:
        return 1
    else:
        return 0

In [26]:
y_train = train_final_encoded['Transported'].apply(convert_bool)
X_train = train_final_encoded.drop(['PassengerId', 'Cabin', 'Name','Transported'], axis=1)
# Example model training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [27]:
X_train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,_,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0,1,0,0,0,1
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,5.0,1.0,1,0,0,0,0,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,1,0.0,1.0,0,1,0,0,0,1
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,2,0.0,1.0,0,1,0,0,0,1
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,5.0,1.0,1,0,0,0,0,1


In [28]:
test_final_encoded.head()

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Transported,_,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
8693,0013_01,1.0,G/3/S,27.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,1,6.0,1.0,1,0,0,0,0,1
8694,0018_01,0.0,F/4/S,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,...,,1,5.0,1.0,1,0,0,0,0,1
8695,0019_01,1.0,C/0/S,31.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,1,2.0,1.0,0,1,0,1,0,0
8696,0021_01,0.0,C/1/S,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,...,,1,2.0,1.0,0,1,0,0,0,1
8697,0023_01,0.0,F/5/S,20.0,0.0,10.0,0.0,635.0,0.0,0.0,...,,1,5.0,1.0,1,0,0,0,0,1


In [29]:
test_data = test_final_encoded.drop(['PassengerId', 'Cabin', 'Name','Transported'], axis=1)

In [30]:
test_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,_,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
8693,1.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6.0,1.0,1,0,0,0,0,1
8694,0.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,1,5.0,1.0,1,0,0,0,0,1
8695,1.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0,1.0,0,1,0,1,0,0
8696,0.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,1,2.0,1.0,0,1,0,0,0,1
8697,0.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,1,5.0,1.0,1,0,0,0,0,1


In [31]:
predicted_transported = model.predict(test_data)

In [32]:
# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': predicted_transported.astype(bool)  # Ensure the boolean type if your model outputs integers
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

In [33]:
sample_submission_df = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')

In [34]:
from sklearn.metrics import accuracy_score

# Merge the sample submission with your predictions to align them
comparison_df = sample_submission_df.merge(submission_df, on='PassengerId', suffixes=('_sample', '_predicted'))

# Calculate accuracy
accuracy = accuracy_score(comparison_df['Transported_sample'], comparison_df['Transported_predicted'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.5209258826280103


In [35]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_val)


In [37]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the model
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Use 'softmax' for multi-class classification and adjust the units accordingly
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use 'categorical_crossentropy' for multi-class classification
              metrics=['accuracy'])

# Summary of the model
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1152      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5377 (21.00 KB)
Trainable params: 5377 (21.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [38]:
history = model.fit(X_train_scaled, y_train, epochs=100, validation_split=0.2, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [39]:
test_data = scaler.fit_transform(test_data)

In [40]:
predicted_transported = model.predict(test_data)



In [41]:
predicted_transported

array([[7.8574193e-01],
       [5.9440773e-04],
       [9.9974126e-01],
       ...,
       [6.0769075e-01],
       [9.4247311e-01],
       [7.5771034e-01]], dtype=float32)

In [42]:
import numpy as np
predictions = np.where(predicted_transported > 0.5, 1, 0)

In [43]:
predictions

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [45]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Assuming 'predictions' is your 2D predictions array and 'test' is your test dataset
# Flatten the predictions array to make it 1-dimensional
predictions_1d = predictions.ravel()

# Now create the DataFrame using the 1D predictions array
submission_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': predictions_1d.astype(bool)  # Ensure the boolean type if your model outputs integers
})

# Assuming 'sample_submission_df' is a sample submission DataFrame you're comparing against
comparison_df = sample_submission_df.merge(submission_df, on='PassengerId', suffixes=('_sample', '_predicted'))

# Calculate the accuracy
accuracy = accuracy_score(comparison_df['Transported_sample'], comparison_df['Transported_predicted'])
print(f"Accuracy: {accuracy}")


# Save the submission file
submission_df.to_csv('/content/drive/MyDrive/submission3.csv', index=False)
print(submission_df)

Accuracy: 0.4828150572831424
     PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01         True
...          ...          ...
4272     9266_02         True
4273     9269_01        False
4274     9271_01         True
4275     9273_01         True
4276     9277_01         True

[4277 rows x 2 columns]
