In [21]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [12]:
ds = pd.read_csv("Cases\SPAM\SPAM text message 20170820 - Data.csv")

In [13]:
ds.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
 stops = stopwords.words("english")

In [25]:
ps = PorterStemmer()

In [26]:
corpus = []

for i in np.arange(0, ds.shape[0]):
    review = ds["Category"][i]
    review = re.sub("[^a-zA-Z]", " ", review)
    review = review.lower()
    review = review.split()
    
    # Below line 
    review = [ps.stem(word) for word in review if not word in set(stops)]
    review = " ".join(review)
    corpus.append(review)

## Using 

In [27]:
tf_idf = TfidfVectorizer(max_features = 800)
X = tf_idf.fit_transform(corpus).toarray()

print( tf_idf.get_feature_names_out() )

y = ds["Category"].map({"spam": 1, "ham": 0})

['ham' 'spam']


In [28]:
rf = RandomForestClassifier(random_state = 2022)

In [29]:
kfold = StratifiedKFold(n_splits = 5,
                       shuffle = True,
                       random_state = 2022)

In [30]:
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2022, 'verbose': 0, 'warm_start': False}


In [31]:
params = {"max_features": [10, 50, 100, 200]}

In [32]:
gcv = GridSearchCV(rf,
                  param_grid = params,
                  verbose = 3,
                  scoring = "roc_auc",
                  cv = kfold)
gcv.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ...................max_features=10;, score=1.000 total time=   0.1s
[CV 2/5] END ...................max_features=10;, score=1.000 total time=   0.1s
[CV 3/5] END ...................max_features=10;, score=1.000 total time=   0.1s
[CV 4/5] END ...................max_features=10;, score=1.000 total time=   0.1s
[CV 5/5] END ...................max_features=10;, score=1.000 total time=   0.1s
[CV 1/5] END ...................max_features=50;, score=1.000 total time=   0.1s
[CV 2/5] END ...................max_features=50;, score=1.000 total time=   0.1s
[CV 3/5] END ...................max_features=50;, score=1.000 total time=   0.1s
[CV 4/5] END ...................max_features=50;, score=1.000 total time=   0.1s
[CV 5/5] END ...................max_features=50;, score=1.000 total time=   0.1s
[CV 1/5] END ..................max_features=100;, score=1.000 total time=   0.2s
[CV 2/5] END ..................max_features=100;,

In [34]:
print(gcv.best_params_)

{'max_features': 10}


In [35]:
print(gcv.best_score_)

1.0
