In [23]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [24]:
ds = pd.read_json(r"Cases\News Sarcasm\Sarcasm_Headlines_Dataset_v2.json",
                 lines = True)

In [25]:
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [26]:
ds = ds[["is_sarcastic", "headline"]]

In [27]:
stops = stopwords.words("English")

In [28]:
ps = PorterStemmer()

In [29]:
corpus = []

for i in np.arange(0, ds.shape[0]):
    review = ds["headline"][i]
    review = re.sub("[^a-zA-Z]", " ", review)
    review = review.lower()
    review = review.split()
    
    # Below line 
    review = [ps.stem(word) for word in review if not word in set(stops)]
    review = " ".join(review)
    corpus.append(review)

# Count Vectorization

In [30]:
vtz = CountVectorizer(max_features = 800)
X = vtz.fit_transform(corpus).toarray()

print(vtz.get_feature_names_out())

y = ds["is_sarcastic"]

['abort' 'abus' 'accept' 'accus' 'across' 'act' 'activist' 'actor'
 'actual' 'ad' 'add' 'address' 'administr' 'admit' 'ador' 'age' 'aid'
 'air' 'al' 'alleg' 'allow' 'alreadi' 'alway' 'amazon' 'america'
 'american' 'anim' 'announc' 'anoth' 'anti' 'anyth' 'apart' 'apolog'
 'appear' 'appl' 'approv' 'area' 'around' 'arrest' 'art' 'artist' 'ask'
 'assault' 'assur' 'attack' 'attempt' 'attend' 'author' 'avoid' 'award'
 'away' 'babi' 'back' 'bad' 'ban' 'band' 'bank' 'bar' 'battl' 'bear'
 'beat' 'beauti' 'becom' 'begin' 'behind' 'believ' 'berni' 'best' 'better'
 'biden' 'big' 'bill' 'billion' 'birth' 'birthday' 'black' 'blame' 'blast'
 'blood' 'board' 'bodi' 'bomb' 'book' 'box' 'boy' 'boyfriend' 'break'
 'bring' 'brother' 'build' 'burn' 'bush' 'busi' 'buy' 'california' 'call'
 'campaign' 'cancer' 'candid' 'car' 'card' 'care' 'career' 'case' 'cat'
 'caus' 'celebr' 'center' 'ceo' 'challeng' 'chanc' 'chang' 'charact'
 'charg' 'check' 'chief' 'child' 'children' 'china' 'chines' 'chri'
 'christian' 

In [31]:
rf = RandomForestClassifier(random_state = 2022)

In [32]:
kfold = StratifiedKFold(n_splits = 5,
                       shuffle = True,
                       random_state = 2022)

In [33]:
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 2022, 'verbose': 0, 'warm_start': False}


In [34]:
params = {"max_features": [10, 50, 100, 200]}               

In [None]:
gcv = GridSearchCV(rf,
                  param_grid = params,
                  verbose = 3,
                  scoring = "roc_auc",
                  cv = kfold)
gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ...................max_features=10;, score=0.772 total time=  41.4s
[CV 2/5] END ...................max_features=10;, score=0.785 total time=  41.8s
[CV 3/5] END ...................max_features=10;, score=0.781 total time=  40.7s
[CV 4/5] END ...................max_features=10;, score=0.781 total time=  40.9s
[CV 5/5] END ...................max_features=10;, score=0.789 total time=  41.0s
[CV 1/5] END ...................max_features=50;, score=0.766 total time= 2.4min
[CV 2/5] END ...................max_features=50;, score=0.782 total time= 2.4min
[CV 3/5] END ...................max_features=50;, score=0.775 total time= 2.5min
[CV 4/5] END ...................max_features=50;, score=0.772 total time= 2.4min
[CV 5/5] END ...................max_features=50;, score=0.783 total time= 2.5min
[CV 1/5] END ..................max_features=100;, score=0.764 total time= 4.6min
[CV 2/5] END ..................max_features=100;,