In [153]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


In [None]:
train = pd.read_csv('space_train.csv')
test = pd.read_csv('space_test.csv')
TARGET = 'Transported'

print(train.info())
num_col = train.select_dtypes(include=['float64']).columns.tolist()
cat_col = train.select_dtypes(include=['object']).columns.tolist()
train[['group_id', 'sub_id']] = train['PassengerId'].str.split('_', expand=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None


In [143]:
x_train = train.drop(columns='Transported')
x_test = test.copy()
y_train = train['Transported']
y_test = pd.read_csv('sample_submission.csv')['Transported']

In [152]:
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

pre = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_col),
    ('cat', cat_pipeline, cat_col)
])


pipe = Pipeline([
  ('Process', pre),
  ('model', RandomForestClassifier(n_estimators=200, random_state=2))
])

In [145]:
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

In [147]:
# pipe.score(x_test, y_test)

In [148]:
print(y_pred.shape)

(4277,)


In [149]:
print(x_test.shape)

(4277, 13)


In [150]:
data = pd.DataFrame({'PassengerId':x_test.PassengerId ,'Transported':y_pred})
data.to_csv('sample_submission.csv', index=False)

In [151]:
pd.read_csv('sample_submission1.csv').shape

(4277, 2)