# Demo

## Tabular data classification

In [32]:
import numpy as np
import pandas as pd
from df2numpy import TransformDF2Numpy, one_hot_encode
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [33]:
# Read the Kaggle Titanic train/test splits (https://www.kaggle.com/c/titanic/data)
# from the notebook's working directory.
df_train, df_test = [pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')]

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
# Configure the DataFrame -> NumPy transformer; 'Survived' is the objective column.
trans = TransformDF2Numpy(
    objective_col='Survived',
    fillnan=True,
    numerical_scaling=True,
    min_category_count=4,
    copy=True,
)

# Fit on the training frame and get the features and the objective as numpy arrays.
# Column order of x_train: the factorized categorical variables come first,
# followed by the numerical variables.
x_train, y_train = trans.fit_transform(df_train)

# Apply the already-fitted transformer to the test frame.
x_test = trans.transform(df_test)

Starting to fit a transformer of TransformDF2Numpy.
Garbage variable Dropped: (column: 'Name')
Numerical NaNs filled with alternative value: (column: 'Age'), (filled rows: 177, value: 29.699118)
Categories thresholded: (column: 'Ticket'), (valid categories: 19, dropped categories: 662)
Categories thresholded: (column: 'Cabin'), (valid categories: 3, dropped categories: 144)
Categorical NaNs filled with alternative value: (column: 'Cabin'), (filled rows: 687, factorized value: 0.000000, category: 'TransformDF2Numpy_NaN_category')
Categorical NaNs filled with alternative value: (column: 'Embarked'), (filled rows: 2, factorized value: 3.000000, category: 'TransformDF2Numpy_NaN_category')

Transformer fitted.
Number of the categorical variables: 3
Number of the numerical variables: 7
---------------------------------------------------


In [35]:
# One-hot encode both splits with the fitted transformer.
# one_hot_encode returns the encoded array together with the expanded variable names;
# the names are identical for both splits, so only the training ones are printed.
x_train_one_hot, variable_names = one_hot_encode(trans, x_train, elim_verbose=True)
x_test_one_hot, _test_variable_names = one_hot_encode(trans, x_test, elim_verbose=True)

print(f"variable names: {variable_names}")

variable names: ['Ticket_TransformDF2Numpy_dropped_category', 'Ticket_349909', 'Ticket_347082', 'Ticket_382652', 'Ticket_347077', 'Ticket_19950', 'Ticket_3101295', 'Ticket_CA 2144', 'Ticket_347088', 'Ticket_S.O.C. 14879', 'Ticket_1601', 'Ticket_W./C. 6608', 'Ticket_CA. 2343', 'Ticket_4133', 'Ticket_LINE', 'Ticket_113781', 'Ticket_17421', 'Ticket_PC 17757', 'Ticket_113760', 'Cabin_TransformDF2Numpy_NaN_category', 'Cabin_TransformDF2Numpy_dropped_category', 'Cabin_G6', 'Cabin_C23 C25 C27', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']


In [36]:
# Train a random forest on the one-hot encoded training data and predict
# survival for the test passengers.
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# forest and therefore yields the same predictions; without it every run differs.
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(x_train_one_hot, y_train)
y_pred = classifier.predict(x_test_one_hot)

print(y_pred)

[0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0 0 1
 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0
 1 1 1 1 1 0 0 1 0 0 1]


In [37]:
# Write the predictions to prediction.csv, indexed by the test passengers' ids
# (index column labelled "PassengerId", value column "Survived").
PassengerId = np.asarray(df_test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(data=y_pred, index=PassengerId, columns=["Survived"])
my_solution.to_csv("prediction.csv", index_label=["PassengerId"])

# kaggle score: 0.78708

## Confirming variable information

In [38]:
# Variable names in column order: all of them, then the categorical
# and the numerical subsets.
print(f"all variables {trans.variables()}")
print(f"categorical variables {trans.categoricals()}")
print(f"numerical variables {trans.numericals()}")

all variables ['Ticket', 'Cabin', 'Embarked', 'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
categorical variables ['Ticket', 'Cabin', 'Embarked']
numerical variables ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']


In [39]:
# Query the type of a single variable by name.
age_is_numerical = trans.is_numerical("Age")
print(age_is_numerical)

True


In [40]:
# Translate between a variable's name and its column index, in both directions.
ticket_index = trans.name_to_index("Ticket")
print(ticket_index)
first_variable_name = trans.index_to_name(0)
print(first_variable_name)

0
Ticket


In [41]:
# Categories of a categorical variable; the variable can be addressed
# either by its name or by its column index.
for ticket_key in ("Ticket", 0):
    print(trans.categories(ticket_key))

Index(['TransformDF2Numpy_dropped_category', '349909', '347082', '382652',
       '347077', '19950', '3101295', 'CA 2144', '347088', 'S.O.C. 14879',
       '1601', 'W./C. 6608', 'CA. 2343', '4133', 'LINE', '113781', '17421',
       'PC 17757', '113760', '2666'],
      dtype='object')
Index(['TransformDF2Numpy_dropped_category', '349909', '347082', '382652',
       '347077', '19950', '3101295', 'CA 2144', '347088', 'S.O.C. 14879',
       '1601', 'W./C. 6608', 'CA. 2343', '4133', 'LINE', '113781', '17421',
       'PC 17757', '113760', '2666'],
      dtype='object')


In [42]:
# Translate between a category and its factorized value, in both directions.
# The variable can be addressed either by its name or by its column index.
for ticket_key in ("Ticket", 0):
    print(trans.category_to_factorized(ticket_key, "LINE"))
for ticket_key in ("Ticket", 0):
    print(trans.factorized_to_category(ticket_key, 14.0))

14.0
14.0
LINE
LINE


In [43]:
# Number of unique categories: first for every categorical variable at once,
# then for a single variable addressed by name and by column index.
print(trans.nuniques())
for ticket_key in ("Ticket", 0):
    print(trans.nunique(ticket_key))

[20, 5, 4]
20
20
