In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Load and inspect data

In [2]:
df = pd.read_csv("data/titanic.csv")

In [3]:
df.shape

(891, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

- `PassengerID` and `Name` are unique columns
- `Ticket` has high cardinality

In [7]:
df["Ticket"].value_counts().head(20)

347082          7
CA. 2343        7
1601            7
CA 2144         6
3101295         6
347088          6
S.O.C. 14879    5
382652          5
LINE            4
113760          4
349909          4
347077          4
4133            4
2666            4
17421           4
W./C. 6608      4
PC 17757        4
19950           4
113781          4
230080          3
Name: Ticket, dtype: int64

# 1. Data Cleaning & Preprocessing

In [8]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
df["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [10]:
df[df["Age"].isna()]["Pclass"].value_counts(normalize=True)

3    0.768362
1    0.169492
2    0.062147
Name: Pclass, dtype: float64

In [11]:
df[~df["Age"].isna()]["Pclass"].value_counts(normalize=True)

3    0.497199
1    0.260504
2    0.242297
Name: Pclass, dtype: float64

In [12]:
df[df["Age"].isna()]["Embarked"].value_counts(normalize=True)

S    0.508475
Q    0.276836
C    0.214689
Name: Embarked, dtype: float64

In [13]:
df[~df["Age"].isna()]["Embarked"].value_counts(normalize=True)

S    0.778090
C    0.182584
Q    0.039326
Name: Embarked, dtype: float64

In [14]:
# impute 2 missing values for Embarked with mode
df.loc[df["Embarked"].isna(), "Embarked"] = "S"

## Impute missing age information using linear regression

age = const + Pclass + female + embarked

In [15]:
import statsmodels.api as sm

# create df
predict_age = df.loc[~df["Age"].isna(), ["Age", "Pclass", "Sex", "Embarked"]].copy()
predict_age = predict_age.assign(const=1)

In [16]:
predict_age = predict_age.assign(Female=(predict_age["Sex"] == "female").astype(int))
predict_age = predict_age.drop(columns="Sex")

In [17]:
predict_age = predict_age.merge(pd.get_dummies(predict_age["Embarked"], drop_first=True, prefix="Embarked"),
                                left_index=True, 
                                right_index=True)
predict_age = predict_age.drop(columns="Embarked")

In [18]:
predict_age

Unnamed: 0,Age,Pclass,const,Female,Embarked_Q,Embarked_S
0,22.0,3,1,0,0,1
1,38.0,1,1,1,0,0
2,26.0,3,1,1,0,1
3,35.0,1,1,1,0,1
4,35.0,3,1,0,0,1
...,...,...,...,...,...,...
885,39.0,3,1,1,1,0
886,27.0,2,1,0,0,1
887,19.0,1,1,1,0,1
889,26.0,1,1,0,0,0


In [19]:
[col for col in predict_age.columns if col != "Age"]

['Pclass', 'const', 'Female', 'Embarked_Q', 'Embarked_S']

In [20]:
lin_reg = sm.OLS(predict_age["Age"],
                 predict_age[[col for col in predict_age.columns if col != "Age"]]).fit()

In [21]:
lin_reg.summary()

0,1,2,3
Dep. Variable:,Age,R-squared:,0.164
Model:,OLS,Adj. R-squared:,0.159
Method:,Least Squares,F-statistic:,34.77
Date:,"Tue, 05 May 2020",Prob (F-statistic):,1.5600000000000002e-26
Time:,14:57:42,Log-Likelihood:,-2859.3
No. Observations:,714,AIC:,5729.0
Df Residuals:,709,BIC:,5751.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Pclass,-7.1644,0.629,-11.389,0.000,-8.399,-5.929
const,45.4789,1.722,26.415,0.000,42.099,48.859
Female,-4.5903,1.051,-4.367,0.000,-6.654,-2.526
Embarked_Q,4.5357,2.850,1.591,0.112,-1.060,10.131
Embarked_S,2.2409,1.349,1.661,0.097,-0.408,4.890

0,1,2,3
Omnibus:,9.039,Durbin-Watson:,1.903
Prob(Omnibus):,0.011,Jarque-Bera (JB):,9.386
Skew:,0.226,Prob(JB):,0.00916
Kurtosis:,3.334,Cond. No.,16.0


In [22]:
predicted_age = df.loc[df["Age"].isna(), ["Age", "Pclass", "Sex", "Embarked"]].copy()

predicted_age = predicted_age.assign(const=1)
predicted_age = predicted_age.assign(Female=(predicted_age["Sex"] == "female").astype(int))
predicted_age = predicted_age.drop(columns="Sex")

predicted_age = predicted_age.merge(pd.get_dummies(predicted_age["Embarked"], drop_first=True, prefix="Embarked"),
                                    left_index=True, 
                                    right_index=True)
predicted_age = predicted_age.drop(columns="Embarked")

In [23]:
df.loc[df["Age"].isna(), "Age"]

5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64

In [24]:
lin_reg.predict(predicted_age[[col for col in predicted_age.columns if col != "Age"]])

5      28.521322
17     33.390871
19     19.395283
26     23.985584
28     23.931021
         ...    
859    23.985584
863    21.636138
868    26.226439
878    26.226439
888    21.636138
Length: 177, dtype: float64

In [25]:
# impute that values
df.loc[df["Age"].isna(), "Age"] = lin_reg.predict(predicted_age[[col for col in predicted_age.columns if col != "Age"]])

In [26]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

---

In [29]:
df["Cabin"].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
E101           3
D              3
              ..
D28            1
F G63          1
C87            1
B82 B84        1
E50            1
Name: Cabin, Length: 147, dtype: int64

In [31]:
df["Cabin"].isna().value_counts(normalize=True)

True     0.771044
False    0.228956
Name: Cabin, dtype: float64

In [33]:
df = df.assign(MissingCabin=df["Cabin"].isna().astype(int))
df = df.drop(columns="Cabin")

In [34]:
df.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
MissingCabin    0
dtype: int64

In [35]:
df.replace("", np.nan).isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
MissingCabin    0
dtype: int64

In [36]:
df.replace(" ", np.nan).isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
MissingCabin    0
dtype: int64

In [37]:
# drop ticket as well
df = df.drop(columns="Ticket")

In [40]:
df = df.assign(Female=(df["Sex"] == "female").astype(int))
df = df.drop(columns="Sex")

df = df.merge(pd.get_dummies(df["Embarked"], drop_first=True, prefix="Embarked"),
              left_index=True, 
              right_index=True)
df = df.drop(columns="Embarked")

In [42]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,1,1,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,0,1,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,1,0,0,1


# 2. Feature engineering & extraction

In [45]:
# calculate family size
df = df.assign(FamilySize=df["SibSp"] + df["Parch"] + 1)

In [46]:
# boolean for single travellers
df = df.assign(SingleTraveller=(df["FamilySize"] == 1).astype(int))

In [47]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S,FamilySize,SingleTraveller
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,1,0,0,1,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,0,1,0,0,2,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,1,1,0,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,0,1,0,1,2,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,1,0,0,1,1,1


In [51]:
# Extract information from 'Name'
df["Name"].head(20),

0                               Braund, Mr. Owen Harris
1     Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Laina
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                              Allen, Mr. William Henry
5                                      Moran, Mr. James
6                               McCarthy, Mr. Timothy J
7                        Palsson, Master. Gosta Leonard
8     Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                   Nasser, Mrs. Nicholas (Adele Achem)
10                      Sandstrom, Miss. Marguerite Rut
11                             Bonnell, Miss. Elizabeth
12                       Saundercock, Mr. William Henry
13                          Andersson, Mr. Anders Johan
14                 Vestrom, Miss. Hulda Amanda Adolfina
15                     Hewlett, Mrs. (Mary D Kingcome) 
16                                 Rice, Master. Eugene
17                         Williams, Mr. Charles

In [58]:
# Extract title from Name using regex
import re

title_pattern = r",\s(.*)\."
titles = df["Name"].str.extract(title_pattern)[0].replace("Ms", "Miss")
titles.value_counts()

Mr                          517
Miss                        183
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Major                         2
Col                           2
Mlle                          2
Sir                           1
Mrs. Martin (Elizabeth L      1
Jonkheer                      1
Don                           1
Capt                          1
Mme                           1
the Countess                  1
Lady                          1
Name: 0, dtype: int64

In [62]:
titles_to_keep = titles.value_counts()[titles.value_counts() > 100].index.to_list()

In [64]:
titles[~titles.isin(titles_to_keep)] = "Other"

In [70]:
df = df.assign(Title=titles)
df = df.drop(columns="Name")

In [82]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S,FamilySize,SingleTraveller,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,1,0,3,22.0,1,0,7.25,1,0,0,1,2,0,0,1,0,0
1,2,1,1,38.0,1,0,71.2833,0,1,0,0,2,0,0,0,1,0
2,3,1,3,26.0,0,0,7.925,1,1,0,1,1,1,1,0,0,0
3,4,1,1,35.0,1,0,53.1,0,1,0,1,2,0,0,0,1,0
4,5,0,3,35.0,0,0,8.05,1,0,0,1,1,1,0,1,0,0


In [77]:
df = df.merge(pd.get_dummies(df["Title"], prefix="Title"),
              left_index=True,
              right_index=True)

In [80]:
df = df.drop(columns="Title")

# 3. Split into train and test

In [81]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=.15, random_state=42)

In [96]:
id_col = ["PasengerId"]
target_col = ["Survived"]
num_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare", "FamilySize"]
bool_cols = ["MissingCabin", "Female", "Embarked_Q", "Embarked_S", "SingleTraveller", "Title_Miss",
             "Title_Mr", "Title_Mrs", "Title_Other"]

In [103]:
df["Survived"].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

In [98]:
train_X = train[num_cols + bool_cols]
train_y = train[target_col]
test_X = test[num_cols + bool_cols]
test_y = test[target_col]

In [100]:
lr = LogisticRegression()
lr.fit(train_X, train_y.values.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [101]:
predictions = lr.predict(test_X)

In [102]:
balanced_accuracy_score(test_y, predictions)

0.7852564102564102

# 4. Cross-validation

In [104]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [106]:
models = {"Logistic Regression": LogisticRegression(),
          "Decision Tree": DecisionTreeClassifier(random_state=42),
          "Random Forest": RandomForestClassifier(n_estimators=100, n_jobs=-1)}

In [118]:
cross_validate(models["Logistic Regression"],
               train_X,
               train_y.values.ravel(),
               cv=5,
               scoring="balanced_accuracy")

{'fit_time': array([0.09962702, 0.04508376, 0.06700802, 0.04654312, 0.03113008]),
 'score_time': array([0.00116396, 0.00440621, 0.0015552 , 0.00377083, 0.00794196]),
 'test_score': array([0.8294204 , 0.78596491, 0.83809257, 0.7641844 , 0.8133632 ])}

In [119]:
cross_validate(models["Logistic Regression"],
               train_X,
               train_y.values.ravel(),
               cv=5,
               scoring="balanced_accuracy")["test_score"].mean()

0.8062050945411953

In [114]:
# wrap into a function that takes a model and computes the avg. balanced accuracy score
def validate_model(model):
    validation_results = cross_validate(model,
                                        train_X,
                                        train_y.values.ravel(),
                                        cv=5,
                                        scoring="balanced_accuracy")
    acc = validation_results["test_score"].mean()
    print(f"Balanced Mean Accuracy Score: {acc}")

In [117]:
validate_model(models["Logistic Regression"])

Balanced Mean Accuracy Score: 0.8062050945411953


In [120]:
for key, value in models.items():
    print(f"Model: {key}")
    validate_model(value)
    print("--------------\n")

Model: Logistic Regression
Balanced Mean Accuracy Score: 0.8062050945411953
--------------

Model: Decision Tree
Balanced Mean Accuracy Score: 0.7277580414719852
--------------

Model: Random Forest
Balanced Mean Accuracy Score: 0.78996061319844
--------------



# 6. Feature Selection

In [122]:
def validate_model(model, X):
    validation_results = cross_validate(model,
                                        X,
                                        train_y.values.ravel(),
                                        cv=5,
                                        scoring="balanced_accuracy")
    acc = validation_results["test_score"].mean()
    print(f"Balanced Mean Accuracy Score: {acc}")

In [123]:
def evaluate_models(models, X):
    for key, value in models.items():
        print(f"Model: {key}")
        validate_model(value, X)
        print("--------------\n")

In [124]:
evaluate_models(models, train_X)

Model: Logistic Regression
Balanced Mean Accuracy Score: 0.8062050945411953
--------------

Model: Decision Tree
Balanced Mean Accuracy Score: 0.7277580414719852
--------------

Model: Random Forest
Balanced Mean Accuracy Score: 0.7984455084887567
--------------



### 6.1. Recursive Feature Elimination - RFE

Using Random Forest

In [125]:
from sklearn.feature_selection import RFECV

In [126]:
selector_rf = RFECV(models["Random Forest"],
                    cv=5,
                    scoring="balanced_accuracy")

In [127]:
selector_rf.fit(train_X, train_y)

RFECV(cv=5,
      estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None, max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=-1,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring='balanced_accuracy',
      step=1, verbose=0)

In [151]:
selector_rf.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [152]:
train_X.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'MissingCabin',
       'Female', 'Embarked_Q', 'Embarked_S', 'SingleTraveller', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')

In [153]:
train_X2 = train_X[train_X.columns[selector_rf.get_support()]]

In [154]:
evaluate_models(models, train_X2)

Model: Logistic Regression
Balanced Mean Accuracy Score: 0.8062050945411953
--------------

Model: Decision Tree
Balanced Mean Accuracy Score: 0.7277580414719852
--------------

Model: Random Forest
Balanced Mean Accuracy Score: 0.8051457697802835
--------------



# 7. Hyperparameter Tuning

In [135]:
models

{'Logistic Regression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'Decision Tree': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=42, splitter='best'),
 'Random Forest': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        ma

In [136]:
models["Random Forest"] = RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [137]:
selector_rf2 = RFECV(models["Random Forest"],
                     cv=5,
                     scoring="balanced_accuracy")
selector_rf.fit(train_X, train_y)
train_X3 = train_X[train_X.columns[selector_rf.get_support()]]

In [139]:
evaluate_models(models, train_X3)

Model: Logistic Regression
Balanced Mean Accuracy Score: 0.8062050945411953
--------------

Model: Decision Tree
Balanced Mean Accuracy Score: 0.7277580414719852
--------------

Model: Random Forest
Balanced Mean Accuracy Score: 0.8001920428363645
--------------



**Identify optimal hyperparameters using GridSearch (+CV)**

In [140]:
from sklearn.model_selection import GridSearchCV

In [156]:
param_grid_rf = {"n_estimators": [500, 1000],
                 "criterion": ["gini", "entropy"],
                 "bootstrap": [True, False],
                 "max_depth": [3, 10, None],
                 "min_samples_leaf": [10, 30, 50, None]}

tune_rf = GridSearchCV(RandomForestClassifier(),
                       param_grid=param_grid_rf,
                       cv=3,
                       scoring="balanced_accuracy",
                       n_jobs=-1)

tune_rf.fit(train_X2, train_y)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [157]:
tune_rf.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'min_samples_leaf': 10,
 'n_estimators': 500}

In [158]:
validate_model(RandomForestClassifier(**tune_rf.best_params_), train_X2)

Balanced Mean Accuracy Score: 0.8053854371806258


# 8. Final validation on test set

In [159]:
rf_model = RandomForestClassifier(**tune_rf.best_params_)
rf_model.fit(train_X2, train_y)

test_X2 = test_X[test_X.columns[selector_rf.get_support()]]
rf_pred = rf_model.predict(test_X2)

balanced_accuracy_score(test_y, rf_pred)

0.8083791208791209

In [160]:
rf_model

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)