In [7]:
import numpy as np
import pandas as pd
from df2numpy import TransformDF2Numpy, one_hot_encode
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Read the Kaggle Titanic CSVs (https://www.kaggle.com/c/titanic/data).
# Both paths are relative, so the files must sit in the current working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

# Last expression of the cell -> rich display of the first training rows.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Build the DataFrame -> numpy transformer; 'Survived' is the objective column
# that fit_transform splits off and returns separately as y.
trans = TransformDF2Numpy(
    objective_col='Survived',
    fillnan=True,
    numerical_scaling=True,
    min_category_count=4,
    copy=True,
)

# Fit on the training frame and receive numpy arrays.
# Column layout of x_train: the factorized categorical variables come first,
# followed by the numerical variables (the two groups need not be equal in size).
x_train, y_train = trans.fit_transform(df_train)

# Re-use the fitted transformer on the test frame so both splits share one encoding.
x_test = trans.transform(df_test)

# Peek at the first transformed sample and its label.
print("x_train[0, :]:", x_train[0])
print("y_train[0]:", y_train[0])

Starting to fit a transformer of TransformDF2Numpy.
Garbage variable Dropped: (column: 'Name')
Numerical NaNs filled with alternative value: (column: 'Age'), (filled rows: 177, value: 29.699118)
Categories thresholded: (column: 'Ticket'), (valid categories: 19, dropped categories: 662)
Categories thresholded: (column: 'Cabin'), (valid categories: 3, dropped categories: 144)
Categorical NaNs filled with alternative value: (column: 'Cabin'), (filled rows: 687, factorized value: 0.000000, category: 'TransformDF2Numpy_NaN_category')
Categorical NaNs filled with alternative value: (column: 'Embarked'), (filled rows: 2, factorized value: 3.000000, category: 'TransformDF2Numpy_NaN_category')

Transformer fitted.
Number of the categorical variables: 3
Number of the numerical variables: 7
---------------------------------------------------
x_train[0, :]: [ 0.          0.          0.         -1.73010795  0.82737625 -0.73769359
 -0.59248055  0.43279297 -0.47367302 -0.50244516]
y_train[0]: 0


In [10]:
# One-hot encode both splits with the fitted transformer.
# one_hot_encode returns the encoded array first and the variable names second;
# only the names from the training call are kept for display.
x_train_one_hot, variable_names = one_hot_encode(trans, x_train, elim_verbose=True)
x_test_one_hot, _test_variable_names = one_hot_encode(trans, x_test, elim_verbose=True)

# Show the first encoded training row next to the column names it maps to.
print("x_train_one_hot[0, :]:", x_train_one_hot[0])
print("variable names:", variable_names)

x_train_one_hot[0, :]: [ 0.34138154 -0.06715343 -0.08898625 -0.07512217 -0.06715343 -0.06715343
 -0.0823387  -0.0823387  -0.0823387  -0.07512217 -0.08898625 -0.06715343
 -0.08898625 -0.06715343 -0.06715343 -0.06715343 -0.06715343 -0.06715343
 -0.06715343  0.54492498 -0.52409743 -0.06715343 -0.06715343  0.61930636
 -0.48204268 -0.30756234 -1.73010795  0.82737625 -0.73769359 -0.59248055
  0.43279297 -0.47367302 -0.50244516]
variable names: ['Ticket_TransformDF2Numpy_dropped_category', 'Ticket_349909', 'Ticket_347082', 'Ticket_382652', 'Ticket_347077', 'Ticket_19950', 'Ticket_3101295', 'Ticket_CA 2144', 'Ticket_347088', 'Ticket_S.O.C. 14879', 'Ticket_1601', 'Ticket_W./C. 6608', 'Ticket_CA. 2343', 'Ticket_4133', 'Ticket_LINE', 'Ticket_113781', 'Ticket_17421', 'Ticket_PC 17757', 'Ticket_113760', 'Cabin_TransformDF2Numpy_NaN_category', 'Cabin_TransformDF2Numpy_dropped_category', 'Cabin_G6', 'Cabin_C23 C25 C27', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'PassengerId', 'Pclass', 'Sex', 'Age', 

In [13]:
# Training Random Forest
# A random forest is stochastic (bootstrap sampling + random feature subsets), so an
# unseeded classifier yields different predictions on every Restart & Run All.
# Fixing random_state makes y_pred, and anything derived from it, reproducible.
# NOTE: outputs or scores recorded from earlier unseeded runs may differ slightly
# from what this seeded run produces.
RANDOM_STATE = 0
classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
classifier.fit(x_train_one_hot, y_train)

# Predict the objective for every row of the one-hot encoded test set.
y_pred = classifier.predict(x_test_one_hot)

print(y_pred)

[0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1
 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 1 1 1 0 0 1 0 0 1]


In [14]:
# Export the prediction to csv
# The frame is indexed by passenger id, so the file has exactly two columns:
# the index (written under the header "PassengerId") and "Survived".
PassengerId = df_test["PassengerId"].values.astype(int)
my_solution = pd.DataFrame({"Survived": y_pred}, index=PassengerId)
my_solution.to_csv("prediction.csv", index_label="PassengerId")  # kaggle score: 0.78708