To temporarily work around an "IOPub data rate exceeded" error, you can restart the notebook server with a higher data-rate limit using this command:

`jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10`

In [7]:
import re

import numpy as np
import pandas as pd

# modeling imports
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier

# parameter optimization imports
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV  # grid search
from sklearn.model_selection import RandomizedSearchCV  # randomized search
from sklearn.model_selection import train_test_split  # split up data frames
from sklearn.model_selection import cross_val_score  # cross-validation

# plotting and eda imports
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Load in an example dataset to play around with

In [10]:
from sklearn.datasets import load_iris

# Fetch the bundled iris dataset and assemble one frame: the measurements as
# feature columns plus the integer-coded target appended as 'Species'.
iris_data = load_iris()
iris_df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
iris_df = iris_df.assign(Species=iris_data.target)  # Species encoded as 0, 1 or 2

In [11]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
# Work on a copy so that later in-place edits to `df` never alter `iris_df`.
df = iris_df.copy()

In [13]:
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
Species                int64
dtype: object

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
Species              150 non-null int64
dtypes: float64(4), int64(1)
memory usage: 7.0 KB


In [14]:
# Build a profiling report for df; the rendered report (overview, per-column
# statistics and value counts, sample rows) is this cell's output.
pandas_profiling.ProfileReport( df )

0,1
Number of variables,5
Number of observations,150
Total Missing (%),0.0%
Total size in memory,5.9 KiB
Average record size in memory,40.5 B

0,1
Numeric,3
Categorical,0
Date,0
Text (Unique),0
Rejected,2

0,1
Correlation,0.95646

0,1
Distinct count,43
Unique (%),28.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.7587
Minimum,1
Maximum,6.9
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,1.3
Q1,1.6
Median,4.35
Q3,5.1
95-th percentile,6.1
Maximum,6.9
Range,5.9
Interquartile range,3.5

0,1
Standard deviation,1.7644
Coef of variation,0.46943
Kurtosis,-1.4019
Mean,3.7587
MAD,1.5619
Skewness,-0.27446
Sum,563.8
Variance,3.1132
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1.5,14,9.3%,
1.4,12,8.0%,
5.1,8,5.3%,
4.5,8,5.3%,
1.3,7,4.7%,
1.6,7,4.7%,
5.6,6,4.0%,
4.0,5,3.3%,
4.9,5,3.3%,
4.7,5,3.3%,

Value,Count,Frequency (%),Unnamed: 3
1.0,1,0.7%,
1.1,1,0.7%,
1.2,2,1.3%,
1.3,7,4.7%,
1.4,12,8.0%,

Value,Count,Frequency (%),Unnamed: 3
6.3,1,0.7%,
6.4,1,0.7%,
6.6,1,0.7%,
6.7,2,1.3%,
6.9,1,0.7%,

0,1
Correlation,0.96276

0,1
Distinct count,35
Unique (%),23.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.8433
Minimum,4.3
Maximum,7.9
Zeros (%),0.0%

0,1
Minimum,4.3
5-th percentile,4.6
Q1,5.1
Median,5.8
Q3,6.4
95-th percentile,7.255
Maximum,7.9
Range,3.6
Interquartile range,1.3

0,1
Standard deviation,0.82807
Coef of variation,0.14171
Kurtosis,-0.55206
Mean,5.8433
MAD,0.68756
Skewness,0.31491
Sum,876.5
Variance,0.68569
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
5.0,10,6.7%,
6.3,9,6.0%,
5.1,9,6.0%,
6.7,8,5.3%,
5.7,8,5.3%,
5.5,7,4.7%,
5.8,7,4.7%,
6.4,7,4.7%,
6.0,6,4.0%,
4.9,6,4.0%,

Value,Count,Frequency (%),Unnamed: 3
4.3,1,0.7%,
4.4,3,2.0%,
4.5,1,0.7%,
4.6,4,2.7%,
4.7,2,1.3%,

Value,Count,Frequency (%),Unnamed: 3
7.3,1,0.7%,
7.4,1,0.7%,
7.6,1,0.7%,
7.7,4,2.7%,
7.9,1,0.7%,

0,1
Distinct count,23
Unique (%),15.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.054
Minimum,2
Maximum,4.4
Zeros (%),0.0%

0,1
Minimum,2.0
5-th percentile,2.345
Q1,2.8
Median,3.0
Q3,3.3
95-th percentile,3.8
Maximum,4.4
Range,2.4
Interquartile range,0.5

0,1
Standard deviation,0.43359
Coef of variation,0.14198
Kurtosis,0.29078
Mean,3.054
MAD,0.33309
Skewness,0.33405
Sum,458.1
Variance,0.188
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
3.0,26,17.3%,
2.8,14,9.3%,
3.2,13,8.7%,
3.4,12,8.0%,
3.1,12,8.0%,
2.9,10,6.7%,
2.7,9,6.0%,
2.5,8,5.3%,
3.5,6,4.0%,
3.8,6,4.0%,

Value,Count,Frequency (%),Unnamed: 3
2.0,1,0.7%,
2.2,3,2.0%,
2.3,4,2.7%,
2.4,3,2.0%,
2.5,8,5.3%,

Value,Count,Frequency (%),Unnamed: 3
3.9,2,1.3%,
4.0,1,0.7%,
4.1,1,0.7%,
4.2,1,0.7%,
4.4,1,0.7%,

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Get rid of all rows with NA in them

In [15]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

Drop all rows with an NA in a specific column

In [None]:
# Keep only the rows whose 'colName' value is present ('colName' is a placeholder).
has_value = df['colName'].notna()
df = df[has_value]

Drop certain columns

In [None]:
# Rebind instead of using inplace=True: the cell can then be re-run safely and
# any other name that refers to the same frame is left untouched.
# ('col1' and 'col2' are placeholders for the columns to remove.)
df = df.drop(columns=['col1', 'col2'])

Check the percentage of Null Values

In [16]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean()

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
Species              0.0
dtype: float64

Fill in null values in each column with the mean of the column

In [18]:
# Assign the result back: fillna returns a new frame, so without the
# assignment nothing is actually filled. Column means are computed over
# numeric columns only; non-numeric columns are left as they are.
df = df.fillna(df.mean(numeric_only=True))

### Feature Engineering

In [19]:
def add_datepart(df, fldname, drop=True):
    """Expand a date column of ``df`` into several numeric date-part columns.

    The frame is modified in place and nothing is returned.

    Parameters:
    -----------
    df: A pandas data frame. It gains the new columns listed below.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not already a datetime column, it is converted with
        pd.to_datetime (and the converted values are stored back in df).
    drop: If true then the original date column will be removed.

    New columns:
    ------------
    Each new column is named ``<prefix><part>`` where ``<prefix>`` is
    ``fldname`` with a trailing 'date'/'Date' removed, and ``<part>`` is one
    of: Year, Month, Week (ISO week number), Day, Dayofyear, Elapsed
    (whole seconds since 1970-01-01 00:00:00 UTC).

    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofyear AElapsed
    0   2000  3      10    11   71         952732800
    1   2000  3      10    12   72         952819200
    2   2000  3      11    13   73         952905600
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also accepts tz-aware columns, which
    # np.issubdtype(dtype, np.datetime64) rejects with a TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofyear'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed from newer pandas; isocalendar()
            # yields the same ISO week number.
            df[targ_pre + n] = fld.dt.isocalendar().week
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Compute elapsed seconds from a timedelta so the result does not depend
    # on the column's storage resolution (ns, us, s, ...). tz-aware values
    # are first expressed as naive UTC so the epoch subtraction is valid.
    if fld.dt.tz is None:
        naive_utc = fld
    else:
        naive_utc = fld.dt.tz_convert('UTC').dt.tz_localize(None)
    df[targ_pre + 'Elapsed'] = (naive_utc - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)

Convert categorical variables to numerical codes 

Then apply these numerical codes to the test set...

In [None]:
# train_test_split returns (X_train, X_test, y_train, y_test) -- in that order.
# Stratify on the label so every species is represented in both splits, and
# fix the seed so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(['Species'], axis=1),
    df['Species'],
    stratify=df['Species'],
    random_state=42,
)

### Hyper Parameter tuning

Run grid search on your hyperparameters on the x_train, y_train. It will use cross validation and give you the mean cross validation score. Pick the model giving the best score, and then use this model to make predictions on your held out test set of data (predict on x_test and compare results with y_test). In this case, because you are using cross validation, you do not need a separate validation set.

Different Scoring metrics to use inside the search CV functions : http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

Grid Search CV - with an example of using roc_auc as the scoring metric

In [None]:
# Hyperparameter grid: every combination is evaluated with 5-fold CV.
search_params = {
    'n_estimators': [5, 10, 20, 30, 40, 50],
    'min_samples_leaf': [2, 5, 7, 10],
    'max_depth': [3, 5, 7],
}

# Species is a 3-class label, so use the (already imported) classifier.
# Plain 'roc_auc' only supports binary targets; 'roc_auc_ovr' is the
# one-vs-rest variant for multiclass problems.
# The name `rfr` is kept so other cells that refer to it keep working.
rfr = RandomForestClassifier(random_state=42)
gdcv = GridSearchCV(cv=5, estimator=rfr, param_grid=search_params, scoring='roc_auc_ovr')
gdcv.fit(x_train, y_train)

In [None]:
gdcv.best_params_

In [None]:
# Rebuild the model from the hyperparameters the grid search selected (rather
# than hard-coded values), fit it on the full training split, then predict on
# the held-out test features.
rfr_best = RandomForestClassifier(n_jobs=-1, random_state=42, **gdcv.best_params_)
rfr_best.fit(x_train, y_train)
rfr_best.predict(x_test)

Randomized Search CV

In [None]:
# Candidate values; RandomizedSearchCV samples a subset of the combinations
# instead of trying all of them.
search_params = {
    'n_estimators': [5, 10, 20, 30, 40, 50],
    'min_samples_leaf': [2, 5, 7, 10],
    'max_depth': [3, 5, 7],
}

# Species is a class label, so use the classifier. Scorer names are
# case-sensitive: the valid name is 'accuracy', not 'Accuracy'.
# random_state makes the sampled parameter combinations reproducible.
rfr = RandomForestClassifier(random_state=42)
rscv = RandomizedSearchCV(
    cv=5,
    estimator=rfr,
    param_distributions=search_params,
    scoring='accuracy',
    random_state=42,
)
rscv.fit(x_train, y_train)

In [None]:
rscv.best_params_

Take the best_params_ returned by the search and use them to build a final model that you fit on the entire x_train and y_train data. You can then predict on your test data.

In [None]:
# Rebuild the model from the hyperparameters the randomized search selected
# (rather than hard-coded values) and fit it on the full training split.
rfr_best = RandomForestClassifier(n_jobs=-1, random_state=42, **rscv.best_params_)
rfr_best.fit(x_train, y_train)

You can compare the score on the train and test sets to ensure that you are not overfitting.

In [None]:
# A cell only displays its last expression, so report both scores together.
# A train score far above the test score suggests overfitting.
{
    'train': rfr_best.score(x_train, y_train),
    'test': rfr_best.score(x_test, y_test),
}