In [1]:
import time
start_time = time.time()

### Library version

In [2]:
import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The nltk version is 3.4.5.
The scikit-learn version is 0.23.1.


## Get data

In [3]:
# imports
import numpy as np
import pandas as pd

# set data to df
email_data = pd.read_csv('./data/enron/03_mini_processing.csv')

In [4]:
full_df = email_data[['m_body', 'gender']].copy()

In [5]:
full_df.groupby(['gender']).size()

gender
boy     90037
girl    68731
dtype: int64

## Exploratory data analysis

In [6]:
## 

## Visual EDA

In [7]:
## 

## Reclassify data

In [8]:
# bool target variables: boy = 0, girl = 1
full_df.gender = full_df.gender.replace('boy', 0)
full_df.gender = full_df.gender.replace('girl', 1)

In [9]:
# reclassify data into arrays
full_X = full_df.m_body.values # features
full_y = full_df.gender.values # targets (gender labels)

## Create sample subset for initial tests

In [10]:
# import numpy
import numpy as np

# set number of samples
n_samples = 1000

# random choice collect index
sample_idx = np.random.choice(np.arange(len(full_X)), size=n_samples, replace=True)

# create sample subsets
Xs = full_X[sample_idx]
ys = full_y[sample_idx]

## Feature extraction

### countvectorizer

In [11]:
# import feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# instantiate vectorizer
count_vect = CountVectorizer()
# fit to data
%timeit count_vect.fit_transform(Xs)
Xs_vect = count_vect.fit_transform(Xs)

107 ms ± 2.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# countvector review
print('Input: {}'.format(Xs[3]))
print('Type: {}'.format(type(Xs_vect)))
print('Shape: {}'.format(Xs_vect.shape))
print('Vector data: {}'.format(Xs_vect[3].data))

Input: The ENW Staff Mtg. has been CANCELED for this week(11/16) and next week(11/23 ). The meeting will resume on Thursday, November 30th. Happy Thanksgiving! Tammie
Type: <class 'scipy.sparse.csr.csr_matrix'>
Shape: (1000, 12190)
Vector data: [1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


### term frequency (tf)

In [13]:
# import feature extraction
from sklearn.feature_extraction.text import TfidfTransformer

# fit estimator to vectored data
tf_transformer = TfidfTransformer(use_idf=False).fit(Xs_vect)
# tranform count-matrix to tf-idf representation
%timeit tf_transformer.transform(Xs_vect)
Xs_tf = tf_transformer.transform(Xs_vect)

392 µs ± 7.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# tf review
print('Input: {}'.format(Xs[3]))
print('Type: {}'.format(type(Xs_tf)))
print('Shape: {}'.format(Xs_tf.shape))
print('Vector data: {}'.format(Xs_tf[3].data))

Input: The ENW Staff Mtg. has been CANCELED for this week(11/16) and next week(11/23 ). The meeting will resume on Thursday, November 30th. Happy Thanksgiving! Tammie
Type: <class 'scipy.sparse.csr.csr_matrix'>
Shape: (1000, 12190)
Vector data: [0.34299717 0.17149859 0.17149859 0.17149859 0.17149859 0.17149859
 0.17149859 0.17149859 0.17149859 0.17149859 0.17149859 0.17149859
 0.17149859 0.17149859 0.17149859 0.17149859 0.17149859 0.17149859
 0.17149859 0.17149859 0.34299717 0.17149859 0.17149859 0.34299717
 0.17149859]


### term frequency inverse document frequency (tf-idf)

In [15]:
# import feature extraction
from sklearn.feature_extraction.text import TfidfTransformer

# instantiate model
tfidf_transformer = TfidfTransformer()
# fit, transform
%timeit tfidf_transformer.fit_transform(Xs_vect)
Xs_tfidf = tfidf_transformer.fit_transform(Xs_vect)

2.17 ms ± 41.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
# tf review
print('Input: {}'.format(Xs[3]))
print('Type: {}'.format(type(Xs_tfidf)))
print('Shape: {}'.format(Xs_tfidf.shape))
print('Vector data: {}'.format(Xs_tfidf[3].data))

Input: The ENW Staff Mtg. has been CANCELED for this week(11/16) and next week(11/23 ). The meeting will resume on Thursday, November 30th. Happy Thanksgiving! Tammie
Type: <class 'scipy.sparse.csr.csr_matrix'>
Shape: (1000, 12190)
Vector data: [0.09328117 0.28143476 0.17419165 0.08180764 0.11706943 0.25250261
 0.28111494 0.19359287 0.22389028 0.08136966 0.19886969 0.14715706
 0.29785208 0.14405969 0.11827755 0.19359287 0.07328471 0.28111494
 0.26002864 0.13132321 0.06903416 0.26923975 0.19359287 0.18606684
 0.28912009]


## Multinomial naive bayes (MNB): single classifier

### Simulate 20 'new samples' or unlabeled data

In [17]:
# import model selection
from sklearn.model_selection import train_test_split

# simulating 20 'new samples' or unlabeled data
%timeit train_test_split(Xs_tfidf, ys, test_size=0.02, random_state=42)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs_tfidf, ys, test_size=0.02, random_state=42)

718 µs ± 17.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
# import model
from sklearn.naive_bayes import MultinomialNB

# create model object, fit model
mnb = MultinomialNB().fit(Xs_train, ys_train)

### Predict on unlabeled data

In [19]:
# prediction
ys_pred = mnb.predict(Xs_test)

# checking the prediction
#print('Prediction: {}'.format(ys_pred))

### Single predict: metrics

In [20]:
# import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# calculate model accuracy
print('accuracy: {}'.format(np.mean(ys_pred == ys_test)))

# confusion matrix
print('Confusion matrix:')
print(confusion_matrix(ys_test, ys_pred))

# classification report
print('Classification report:')
print(classification_report(ys_test, ys_pred))

accuracy: 0.65
Confusion matrix:
[[11  0]
 [ 7  2]]
Classification report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        11
           1       1.00      0.22      0.36         9

    accuracy                           0.65        20
   macro avg       0.81      0.61      0.56        20
weighted avg       0.79      0.65      0.58        20



## MNB 60/40 Train, test

In [21]:
# model selection
from sklearn.model_selection import train_test_split

# import model
from sklearn.naive_bayes import MultinomialNB

# import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# 60% train, 40% test
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs_tfidf, ys, test_size=0.4, random_state=42)

# create model object, fit model
mnb = MultinomialNB().fit(Xs_train, ys_train)

# prediction
ys_pred = mnb.predict(Xs_test)

# print the predictions
#print('Prediction: {}'.format(ys_pred))

# calculate model accuracy
print('accuracy: {}'.format(np.mean(ys_pred == ys_test)))

# confusion matrix
print('Confusion matrix:')
print(confusion_matrix(ys_test, ys_pred))

# classification report
print('Classification report:')
print(classification_report(ys_test, ys_pred))

accuracy: 0.575
Confusion matrix:
[[210   2]
 [168  20]]
Classification report:
              precision    recall  f1-score   support

           0       0.56      0.99      0.71       212
           1       0.91      0.11      0.19       188

    accuracy                           0.57       400
   macro avg       0.73      0.55      0.45       400
weighted avg       0.72      0.57      0.47       400



## MNB: 10-fold validation

In [22]:
# import model
from sklearn.naive_bayes import MultinomialNB

# import model selection
from sklearn.model_selection import cross_val_score


# create model object
mnb = MultinomialNB()

# compute 10-fold cross-validation
scores_list = ['accuracy', 'precision', 'recall']

# print scores
print('10-Fold CV Scores')
for s in scores_list:
    cv_scores = cross_val_score(mnb, Xs_tfidf, ys, cv=10, scoring=s, n_jobs=-1)
    print('Mean {}: {:1.2g}'.format(s, np.mean(cv_scores)))
    print('Std {}: {:1.2g}'.format(s, np.std(cv_scores)))
    print('{} scores: {}'.format(s, cv_scores))

10-Fold CV Scores
Mean accuracy: 0.65
Std accuracy: 0.042
accuracy scores: [0.63 0.68 0.69 0.68 0.74 0.64 0.62 0.63 0.58 0.65]
Mean precision: 0.83
Std precision: 0.13
precision scores: [0.66666667 0.86666667 0.875      0.86666667 1.         0.83333333
 0.75       1.         0.58333333 0.84615385]
Mean recall: 0.26
Std recall: 0.072
recall scores: [0.27906977 0.30232558 0.3255814  0.30232558 0.39534884 0.22727273
 0.20454545 0.15909091 0.15909091 0.25      ]


### Cross-validation time trials

In [23]:
# cross-validation times
#cv_iterations = [3, 5, 10]
#for i in cv_iterations:
#    print('{}-Fold:'.format(i))
#    %timeit cross_val_score(mnb, Xs_tfidf, ys, cv=i)
#    print('{}-Fold, all processors:'.format(i))
#    %timeit cross_val_score(mnb, Xs_tfidf, ys, cv=i, n_jobs=-1)

3-Fold:
6.66 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3-Fold, all processors:
10.5 ms ± 173 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5-Fold:
10.7 ms ± 141 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5-Fold, all processors:
13.1 ms ± 209 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
10-Fold:
20.8 ms ± 341 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
10-Fold, all processors:
22 ms ± 937 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## MNB: pipeline

In [24]:
%%time
# import feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# import model
from sklearn.naive_bayes import MultinomialNB

# import pipeline
from sklearn.pipeline import Pipeline

# import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# set steps for pipeline
steps = [('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('mnb', MultinomialNB()),]

# create pipeline object
pipeline = Pipeline(steps)

# Create train and test sets
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, test_size=0.4, random_state=42)

# fit model to data
pipeline.fit(Xs_train, ys_train)

# predict
ys_pred = pipeline.predict(Xs_test)

# generate confusion matrix, classification report
print(confusion_matrix(ys_test, ys_pred))
print(classification_report(ys_test, ys_pred))

[[210   2]
 [166  22]]
              precision    recall  f1-score   support

           0       0.56      0.99      0.71       212
           1       0.92      0.12      0.21       188

    accuracy                           0.58       400
   macro avg       0.74      0.55      0.46       400
weighted avg       0.73      0.58      0.48       400



## Multinomial naive bayes: GridSearchCV

In [25]:
%%time
def mnb_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """Pass features, target datasets and return MNB parameters, related accuracy"""
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.naive_bayes import MultinomialNB

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    
    # set steps for pipeline
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('mnb', MultinomialNB()),
    ]

    # create pipeline object
    pipeline = Pipeline(steps)

    # specify the parameters
    parameters = {
        'mnb__alpha':(1.0, 0, 1e-1, 1e-2, 1e-3),
    }

    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Instantiate GridSearchCV
    mnb_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)

    # fit model to data
    mnb_cv.fit(X_train, y_train)

    # predict test labels
    y_pred = mnb_cv.predict(X_test)

    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned MNB Parameters: {}'.format(mnb_cv.best_params_))
    print('Tuned MNB Accuracy: {}'.format(mnb_cv.best_score_))

mnb_gridsearchcv(Xs, ys)

[[199  13]
 [132  56]]
              precision    recall  f1-score   support

           0       0.60      0.94      0.73       212
           1       0.81      0.30      0.44       188

    accuracy                           0.64       400
   macro avg       0.71      0.62      0.58       400
weighted avg       0.70      0.64      0.59       400

Tuned MNB Parameters: {'mnb__alpha': 0.1, 'mnb__fit_prior': True, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Tuned MNB Accuracy: 0.6716666666666666
Wall time: 13.3 s


## Logistic regression (logreg): GridSearchCV
- Logistic Regression (aka logit, MaxEnt) classifier

In [26]:
%%time
def logreg_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """"""
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.linear_model import LogisticRegression

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report

    
    # set steps for pipeline
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('logreg', LogisticRegression()),
    ]

    # create pipeline object
    pipeline = Pipeline(steps)
    
    # Create the hyperparameter grid
    c_space = np.logspace(-5, 8, 15)
    parameters = {
        'logreg__C': c_space, 
        'logreg__penalty': ['l1', 'l2'],
        'logreg__n_jobs': [-1],
    }
    
#    parameters = {
#        'vect__ngram_range':[(1, 1), (1, 2)],
#        'tfidf__use_idf':(True, False),
#        'logreg__C':[1, 10, 100],
#        'logreg__n_jobs': [-1],
#    }

    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Instantiate GridSearchCV
    logreg_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)

    # fit model to data
    logreg_cv.fit(X_train, y_train)

    # predict test labels
    y_pred = logreg_cv.predict(X_test)

    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned Logreg Parameters: {}'.format(logreg_cv.best_params_))
    print('Tuned Logreg Accuracy: {}'.format(logreg_cv.best_score_))

logreg_gridsearchcv(Xs, ys)

[[156  56]
 [ 75 113]]
              precision    recall  f1-score   support

           0       0.68      0.74      0.70       212
           1       0.67      0.60      0.63       188

    accuracy                           0.67       400
   macro avg       0.67      0.67      0.67       400
weighted avg       0.67      0.67      0.67       400

Tuned Logreg Parameters: {'logreg__C': 19306.977288832535, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Tuned Logreg Accuracy: 0.6733333333333335
Wall time: 29.2 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Stochastic gradient descent classifier (SGDC): GridSearchCV

In [27]:
%%time
def sgdc_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """Pass features, target datasets and return MNB parameters, related accuracy"""
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.linear_model import SGDClassifier

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    
    
    # pipeline steps
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('sgdc', SGDClassifier())
    ]
    
    # pipeline object
    pipeline = Pipeline(steps)

    # specify the parameters
    parameters = {
        'sgdc__alpha':(1e-4, 1e-5, 1e-6),
        'sgdc__random_state':[random_state],
    }
    
    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # grid search object
    sgdc_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)
    
    # fit model on training set
    sgdc_cv.fit(X_train, y_train)
    
    # predict test labels
    y_pred = sgdc_cv.predict(X_test)
    
    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned SGDC Parameters: {}'.format(sgdc_cv.best_params_))
    print('Tuned SGDC Accuracy: {}'.format(sgdc_cv.best_score_))
    
sgdc_gridsearchcv(Xs, ys)

[[147  65]
 [ 65 123]]
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       212
           1       0.65      0.65      0.65       188

    accuracy                           0.68       400
   macro avg       0.67      0.67      0.67       400
weighted avg       0.68      0.68      0.68       400

Tuned SGDC Parameters: {'sgdc__alpha': 1e-05, 'sgdc__random_state': 42, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Tuned SGDC Accuracy: 0.6833333333333333
Wall time: 4.02 s


## C-Support Vector Classification (CSVC): GridSearchCV
- Impractical beyond tens of thousands of samples

In [28]:
%%time
def csvc_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """"""
    # 
    if len(X)>10000:
        return "CSVC skipped: Over 10000 samples received."
    
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.svm import SVC

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    
    
    # pipeline steps
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('csvc', SVC())
    ]
    
    # pipeline object
    pipeline = Pipeline(steps)

    parameters = {
        'csvc__C':[1, 10, 100],
        'csvc__gamma':['scale', 'auto', 0.1, 0.01],
        'csvc__random_state':[random_state]
    }
    
    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # grid search object
    csvc_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)
    
    # fit model on training set
    csvc_cv.fit(X_train, y_train)
    
    # predict test labels
    y_pred = csvc_cv.predict(X_test)
    
    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned CSVC Parameters: {}'.format(csvc_cv.best_params_))
    print('Tuned CSVC Accuracy: {}'.format(csvc_cv.best_score_))
    
csvc_gridsearchcv(Xs, ys)

[[172  40]
 [ 77 111]]
              precision    recall  f1-score   support

           0       0.69      0.81      0.75       212
           1       0.74      0.59      0.65       188

    accuracy                           0.71       400
   macro avg       0.71      0.70      0.70       400
weighted avg       0.71      0.71      0.70       400

Tuned CSVC Parameters: {'csvc__C': 100, 'csvc__gamma': 0.1, 'csvc__random_state': 42, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Tuned CSVC Accuracy: 0.6666666666666666
Wall time: 18.5 s


## Linear support vector classification (LSVC): GridSearchCV

In [29]:
%%time
def lsvc_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """"""
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.svm import LinearSVC

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    
    
    # pipeline steps
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('lsvc', LinearSVC())
    ]
    
    # pipeline object
    pipeline = Pipeline(steps)

    parameters = {
        'lsvc__penalty':['l1', 'l2'],
        'lsvc__loss':['hinge', 'squared_hinge'],
        'lsvc__C':[1, 10, 100],
        'lsvc__random_state':[random_state],
    }
    
    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # grid search object
    lsvc_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)
    
    # fit model on training set
    lsvc_cv.fit(X_train, y_train)
    
    # predict test labels
    y_pred = lsvc_cv.predict(X_test)
    
    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned LSVC Parameters: {}'.format(lsvc_cv.best_params_))
    print('Tuned LSVC Accuracy: {}'.format(lsvc_cv.best_score_))
    
lsvc_gridsearchcv(Xs, ys)

[[158  54]
 [ 74 114]]
              precision    recall  f1-score   support

           0       0.68      0.75      0.71       212
           1       0.68      0.61      0.64       188

    accuracy                           0.68       400
   macro avg       0.68      0.68      0.68       400
weighted avg       0.68      0.68      0.68       400

Tuned LSVC Parameters: {'lsvc__C': 10, 'lsvc__loss': 'squared_hinge', 'lsvc__penalty': 'l2', 'lsvc__random_state': 42, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Tuned LSVC Accuracy: 0.6683333333333334
Wall time: 11.8 s


## Random forest classifier (RFC): GridSearchCV
- sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree

In [30]:
%%time
def rfc_gridsearchcv(X, y, cv_n=10, test_size=0.4, random_state=42):
    """"""
    # import feature extraction
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    # import model
    from sklearn.ensemble import RandomForestClassifier

    # import pipeline
    from sklearn.pipeline import Pipeline

    # import model selection
    from sklearn.model_selection import GridSearchCV

    # import metrics
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    
    
    # pipeline steps
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('rfc', RandomForestClassifier())
    ]
    
    # pipeline object
    pipeline = Pipeline(steps)

    parameters = {
        'rfc__n_estimators':[50, 100, 150],
        'rfc__max_features':[None, 'auto', 'log2'],
        'rfc__n_jobs':[-1],
        'rfc__min_samples_split':[2],
    }
    
    # Create train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # grid search object
    rfc_cv = GridSearchCV(pipeline, param_grid=parameters, cv=cv_n, n_jobs=-1)
    
    # fit model on training set
    rfc_cv.fit(X_train, y_train)
    
    # predict test labels
    y_pred = rfc_cv.predict(X_test)
    
    # generate confusion matrix, classification report
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # print params
    print('Tuned RFC Parameters: {}'.format(rfc_cv.best_params_))
    print('Tuned RFC Accuracy: {}'.format(rfc_cv.best_score_))
    
rfc_gridsearchcv(Xs, ys)

[[195  17]
 [121  67]]
              precision    recall  f1-score   support

           0       0.62      0.92      0.74       212
           1       0.80      0.36      0.49       188

    accuracy                           0.66       400
   macro avg       0.71      0.64      0.62       400
weighted avg       0.70      0.66      0.62       400

Tuned RFC Parameters: {'rfc__max_features': 'auto', 'rfc__min_samples_split': 2, 'rfc__n_estimators': 50, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Tuned RFC Accuracy: 0.6449999999999999
Wall time: 2min 30s


## Ending Stats

In [33]:
# execution time
print("--- {:1.2f} seconds ---".format(time.time() - start_time))

--- 358.99 seconds ---


In [32]:
stop here

SyntaxError: invalid syntax (<ipython-input-32-a96ba3aab008>, line 1)