In [1]:
from pathlib import Path

import pandas as pd

# Location of the Fake/Real news CSV. The absolute path is the original author's
# local copy; when it is not present (e.g. on another machine), fall back to a
# file with the same name in the current working directory so the notebook
# can still run without editing this cell.
DATA_PATH = Path('D:/Professional/NLP/source/codeBasics/11_bag_of_n_grams/Fake_Real_Data.csv')
if not DATA_PATH.exists():
    DATA_PATH = Path(DATA_PATH.name)

df = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first rows of the loaded frame to check its columns (Text, label).
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
# Class balance check: number of rows per label value.
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [4]:
# Encode the string labels as integers for the classifiers: Fake -> 0, Real -> 1.
LABEL_TO_NUM = {
    'Fake': 0,
    'Real': 1,
}
df['category_num'] = df['label'].map(LABEL_TO_NUM)

# Series.map yields NaN for any value missing from the mapping. Fail loudly here
# rather than letting NaN targets flow into the train/test split and the models.
unmapped_labels = df.loc[df['category_num'].isna(), 'label'].unique()
assert len(unmapped_labels) == 0, f'Unmapped labels found: {unmapped_labels!r}'

In [5]:
# Display the head again to confirm the category_num column sits next to label.
df.head()

Unnamed: 0,Text,label,category_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the numeric label keeps
# the Fake/Real proportions the same in both splits; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['category_num'],
    stratify=df['category_num'],
    test_size=0.2,
    random_state=2022,
)

In [11]:
# Show the train and test split sizes, one shape per line.
print(X_train.shape, X_test.shape, sep='\n')

(7920,)
(1980,)


In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier  


In [15]:
# Multinomial Naive Bayes on bag-of-n-grams counts, one pipeline per n-gram ceiling.
# ngram_range=(1, n) keeps every n-gram of length 1 up to n, so the "bigram" run
# uses unigrams + bigrams and the "trigram" run uses unigrams through trigrams.
nb_runs = (
    ('pipline of unigram', 1),
    ('pipline of bigram', 2),
    ('pipline of trigram', 3),
)
nb_models = []
for banner, max_n in nb_runs:
    print(banner)
    model = Pipeline([
        ('vector_bow', CountVectorizer(ngram_range=(1, max_n))),
        ('multi_naive', MultinomialNB()),
    ])
    model.fit(X_train, y_train)

    # y_pred is overwritten on every pass, so it ends up holding the last
    # (trigram) run's predictions, as in the unrolled version.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    nb_models.append(model)

# Keep the fitted pipelines available under their original notebook names.
clf_uni, clf_bi, clf_tri = nb_models

pipline of unigram
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1000
           1       0.97      0.98      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

pipline of bigram
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1000
           1       0.99      0.98      0.99       980

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

pipline of trigram
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.99      0.99       980

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg      

In [16]:
# 10-nearest-neighbours with Euclidean distance on bag-of-n-grams counts.
# ngram_range=(1, n) keeps every n-gram of length 1 up to n, so the "bigram" run
# uses unigrams + bigrams and the "trigram" run uses unigrams through trigrams.
knn_euclid_runs = (
    ('pipline of unigram', 1),
    ('pipline of bigram', 2),
    ('pipline of trigram', 3),
)
knn_euclid_models = []
for banner, max_n in knn_euclid_runs:
    print(banner)
    model = Pipeline([
        ('vector_bow', CountVectorizer(ngram_range=(1, max_n))),
        ('knn_cls', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
    ])
    model.fit(X_train, y_train)

    # y_pred is overwritten on every pass, so it ends up holding the last
    # (trigram) run's predictions, as in the unrolled version.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    knn_euclid_models.append(model)

# Keep the fitted pipelines available under their original notebook names.
clf_uni, clf_bi, clf_tri = knn_euclid_models

pipline of unigram
              precision    recall  f1-score   support

           0       0.96      0.93      0.95      1000
           1       0.93      0.96      0.95       980

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980

pipline of bigram
              precision    recall  f1-score   support

           0       0.97      0.77      0.86      1000
           1       0.81      0.98      0.88       980

    accuracy                           0.87      1980
   macro avg       0.89      0.88      0.87      1980
weighted avg       0.89      0.87      0.87      1980

pipline of trigram
              precision    recall  f1-score   support

           0       0.96      0.49      0.65      1000
           1       0.65      0.98      0.78       980

    accuracy                           0.73      1980
   macro avg       0.81      0.74      0.72      1980
weighted avg      

In [17]:
# 10-nearest-neighbours with cosine distance on bag-of-n-grams counts.
# ngram_range=(1, n) keeps every n-gram of length 1 up to n, so the "bigram" run
# uses unigrams + bigrams and the "trigram" run uses unigrams through trigrams.
knn_cosine_runs = (
    ('pipline of unigram', 1),
    ('pipline of bigram', 2),
    ('pipline of trigram', 3),
)
knn_cosine_models = []
for banner, max_n in knn_cosine_runs:
    print(banner)
    model = Pipeline([
        ('vector_bow', CountVectorizer(ngram_range=(1, max_n))),
        ('knn_cls', KNeighborsClassifier(n_neighbors=10, metric='cosine')),
    ])
    model.fit(X_train, y_train)

    # y_pred is overwritten on every pass, so it ends up holding the last
    # (trigram) run's predictions, as in the unrolled version.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    knn_cosine_models.append(model)

# Keep the fitted pipelines available under their original notebook names.
clf_uni, clf_bi, clf_tri = knn_cosine_models

pipline of unigram
              precision    recall  f1-score   support

           0       0.98      0.85      0.91      1000
           1       0.86      0.99      0.92       980

    accuracy                           0.92      1980
   macro avg       0.92      0.92      0.92      1980
weighted avg       0.92      0.92      0.92      1980

pipline of bigram
              precision    recall  f1-score   support

           0       0.99      0.71      0.83      1000
           1       0.77      1.00      0.87       980

    accuracy                           0.85      1980
   macro avg       0.88      0.85      0.85      1980
weighted avg       0.88      0.85      0.85      1980

pipline of trigram
              precision    recall  f1-score   support

           0       0.99      0.55      0.71      1000
           1       0.69      1.00      0.81       980

    accuracy                           0.77      1980
   macro avg       0.84      0.77      0.76      1980
weighted avg      

In [19]:
# Random forest (10 trees, entropy criterion) on bag-of-n-grams counts.
# ngram_range=(1, 3) keeps unigrams, bigrams and trigrams together.
print('pipline of trigram')
clf_tri = Pipeline([
    ('vector_bow', CountVectorizer(ngram_range=(1, 3))),
    # A random forest is stochastic (bootstrap samples, feature sub-sampling).
    # Pin the seed so a fresh run of this cell reproduces the same report; the
    # value mirrors the seed used for the train/test split. Metrics may differ
    # slightly from reports produced by earlier unseeded runs.
    ('rdf_cls', RandomForestClassifier(
        n_estimators=10,
        criterion="entropy",
        random_state=2022,
    )),
])
clf_tri.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf_tri.predict(X_test)
print(classification_report(y_test, y_pred))

pipline of trigram
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1000
           1       0.99      0.96      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



In [None]:
-