# NLP With RandomForestClassifier

## 1. Imports

In [1]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import joblib

# Global settings
RANDOM_STATE = 42

## 2. Load Data

In [4]:
# Load data
df = pd.read_csv("../data/news1.csv")

In [5]:
df.columns = ['Category', 'Title']

## 3. Quick Checks

In [6]:
# Shape of data
print(f'Data shape: {df.shape}')
print(f'\n')

# Columns of data
print(f'Data columns: {df.columns.tolist()}\n')
print(f'\n')

# Missing values of data (percent)
print(f'Data missing values % \n{df.isna().sum()[df.isna().sum() != 0].sort_values(ascending=False) * 100 / len(df)}\n')
print(f'\n')

# Unique values of data (normal and percent)
print(f'Data unique values (normal): \n{df.nunique().sort_values(ascending=False)}\n')
print(f'Data unique values (%): \n{df.nunique().sort_values(ascending=False) * 100 / len(df)}\n')
print(f'\n')

# Duplicates of data
print(f'Data duplicates: {df.duplicated().sum()}')

Data shape: (302, 2)


Data columns: ['Category', 'Title']



Data missing values % 
Series([], dtype: float64)



Data unique values (normal): 
Title       251
Category      2
dtype: int64

Data unique values (%): 
Title       83.112583
Category     0.662252
dtype: float64



Data duplicates: 50


## 4. Fix Data

In [7]:
# Drop duplicates
df = df.drop_duplicates(keep=False)

## 5. Split Data

In [8]:
# Split data
X = df['Title']
y = df['Category']

In [9]:
# Extract 70% of data for Train and 30% for Test and Validation
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30, # 30 % of total
    stratify=y,
    random_state=RANDOM_STATE
)

In [10]:
# Create 25% Test and 5% Validation
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1667,  # 5% of total
    stratify=y_temp,
    random_state=42
)

In [11]:
# Verify category balance (need to be around the same)
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
print(y_val.value_counts(normalize=True))

Category
Politics    0.90099
Rights      0.09901
Name: proportion, dtype: float64
Category
Politics    0.900709
Rights      0.099291
Name: proportion, dtype: float64
Category
Politics    0.9
Rights      0.1
Name: proportion, dtype: float64
Category
Politics    0.909091
Rights      0.090909
Name: proportion, dtype: float64


## 6. Model Settings

In [12]:
# Pipeline settings
pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("rf", RandomForestClassifier(
        random_state=RANDOM_STATE
    ))
])

In [13]:
# Tf-idf settings
tfidf_params = {
    # "tfidf__max_features": [1000, 2000],
    # "tfidf__ngram_range": [(1,1)],
    # "tfidf__min_df": [2, 3],
    # "tfidf__max_df": [0.8, 0.9],
    # "tfidf__sublinear_tf": [True],
    "tfidf__max_features": [1000],
    "tfidf__min_df": [2],
    "tfidf__max_df": [0.9],
    "tfidf__sublinear_tf": [True]
}

In [14]:
# RandomForest settings
rf_params = {
    # "rf__n_estimators": [200, 300],
    # "rf__max_depth": [20],
    # "rf__min_samples_split": [3, 5],
    # "rf__min_samples_leaf": [1, 2],
    # "rf__max_features": ["sqrt", "log2"],
    "rf__n_estimators": [200],
    "rf__min_samples_split": [3],
    "rf__min_samples_leaf": [1, 2],
    "rf__max_features": ["sqrt"]
}

In [15]:
param_grid = {**tfidf_params, **rf_params}

In [16]:
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=3,
    return_train_score=True
)

## 7. Final Model

In [17]:
# Fit model
grid.fit(X_train, y_train)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__max_features': ['sqrt'], 'rf__min_samples_leaf': [1, 2], 'rf__min_samples_split': [3], 'rf__n_estimators': [200], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,2
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
# Predict test
y_pred = grid.predict(X_test)

## 8. Final Checks

In [19]:
# Evaluate the prediction
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Check for overfit or underfit
results = pd.DataFrame(grid.cv_results_)
print(results[[
    "params",
    "mean_train_score",
    "mean_test_score"
]])

Accuracy: 92.00%
                                              params  mean_train_score  \
0  {'rf__max_features': 'sqrt', 'rf__min_samples_...          0.957646   
1  {'rf__max_features': 'sqrt', 'rf__min_samples_...          0.907847   

   mean_test_score  
0         0.914789  
1         0.900704  


In [20]:
# Model predicting results
print("Train accuracy:", grid.score(X_train, y_train))
print("Validation accuracy:", grid.score(X_val, y_val))

Train accuracy: 0.9858156028368794
Validation accuracy: 0.9090909090909091


## 9. Save model to PKL

In [None]:
# Save model
joblib.dump(grid.best_estimator_, "../model/nlp_model.pkl")

['../model/nlp_model1.pkl']

# Done!