In [1]:
import pandas as pd

In [2]:
# pathing is relative
df = pd.read_csv('../data/train.csv')
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
# clean
df = df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Sex       891 non-null    object 
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean')),
       ('scaler', StandardScaler()),
       ('ohe', OneHotEncoder(handle_unknown ='ignore')),
])

categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant')),
       ('ohe', OneHotEncoder(handle_unknown ='ignore')),
])

In [6]:
numeric_features = [
    'Pclass', 
    'Age', 
    'SibSp', 
    'Parch', 
    'Fare'
]

categorical_features = [
    'Sex'
]

preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
]) 

In [7]:
# define X, y
from sklearn.ensemble import RandomForestClassifier as rf
dependent_variable = 'Survived'

X = df.drop('Survived', axis=1)
y = df[dependent_variable]

In [8]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex
0,3,22.0,1,0,7.25,male
1,1,38.0,1,0,71.2833,female
2,3,26.0,0,0,7.925,female
3,1,35.0,1,0,53.1,female
4,3,35.0,0,0,8.05,male


In [9]:
# import and instantiate xgboost classifier
from xgboost import XGBClassifier
clf_xgb = XGBClassifier()

In [10]:
pipeline = Pipeline(
    steps = [
                ('preprocessor', preprocessor),
                ('classifier', clf_xgb)
           ]
)

clf = pipeline.fit(X, y)
print (clf)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
  

In [11]:
import joblib
joblib.dump(clf, 'model.pkl')
clf = joblib.load('model.pkl')

In [12]:
json_payload = pd.DataFrame.from_dict([{"Pclass": "3", "Age": "22.0", "SibSp": "1", "Parch": "0", "Fare": "70", "Sex": "m"}])
prediction = clf.predict(json_payload)[0]
prediction

0