In [1]:
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow import  keras
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import LabelEncoder
import spacy

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Nunoo Justice
[nltk_data]     Samuel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('./datasets/news.csv')

In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
3060,10514,Texas Republican Calls Hillary The WORST Word ...,\nWith Hillary Clinton making history this e...,FAKE
116,2596,Netanyahu: I Won't Allow Israel to Be 'Submerg...,Prime Minister Benjamin Netanyahu on Sunday sa...,REAL
3246,2242,Clinton campaign denies access to pool reporter,The Hillary Clinton campaign denied access to ...,REAL
4644,3449,How a shortage of lethal injection drugs put t...,The Supreme Court on Monday decided that Oklah...,REAL
1005,7363,A brutal spoof advert for the new Macbook that...,Next Swipe left/right A brutal spoof advert fo...,FAKE


In [5]:
df.drop(['Unnamed: 0'], inplace=True, axis = 1)

In [44]:
df[['text', 'label']].sample(5)

Unnamed: 0,text,label
5476,chairwoman supposedly nonpartisan Federal Elec...,1
3093,""" you hold attorney general nominee hostage is...",1
4267,Facebook Lets Advertisers exclude Users Race J...,0
1743,mutually beneficial campaign detente Donald Tr...,1
4073,teen walk free gang - rape conviction Judge sa...,0


In [7]:
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

In [8]:
df.sample(5)

Unnamed: 0,title,text,label
54,New Comment Features have been Added,First Comment! Leave Reply Click get info form...,FAKE
1372,The quiet global crisis that scares the State ...,big new State Department assessment identified...,REAL
851,"Head Of Medicare, Who Oversaw Obamacare Rollou...","Head Medicare, Oversaw Obamacare Rollout, Step...",REAL
5354,"Sanders, Republican governors eye comeback in ...",Residents three small New Hampshire towns cast...,REAL
5796,Senator Mark Kirk Mocks Disabled Veteran Tammy...,debate Rep. Tammy Duckworth Sen. Mark Kirk (R-...,FAKE


In [9]:
nlp = spacy.load('en_core_web_sm')

In [22]:
def lemma(x):
    doc = nlp(x)
    return ' '.join([token.lemma_ for token in doc])

In [25]:
df['text'] = df['text'].apply(lemma)

In [26]:
df['label'] = df['label'].apply(lambda x: 1 if x=='REAL' else 0)
df.label

0       0
1       0
2       1
3       0
4       1
       ..
6330    1
6331    0
6332    0
6333    1
6334    1
Name: label, Length: 6335, dtype: int64

In [27]:
df['label'].value_counts()

1    3171
0    3164
Name: label, dtype: int64

In [28]:
x = df['text']
y = df['label']

In [29]:
x.sample(5)

1418    Trump Tower surround Dump Trucks Anticipation ...
2355    language obscure much reveal . it ’ important ...
1318    ( CNN ) republican candidate president gather ...
5361    Clinton elect , Obama hand power dictator . up...
4148    Yemen ’s Hudaydah suffer dire humanitarian sit...
Name: text, dtype: object

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=50, )
x_train.shape

(4751,)

In [35]:
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train.values)

In [36]:
x_test_vec = vectorizer.transform(x_test)

In [37]:
x_train_vec.shape

(4751, 51390)

In [38]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [39]:
rfc = RandomForestClassifier()
rfc.fit(x_train_vec, y_train)

In [40]:
rfc.score(x_test_vec, y_test)

0.8996212121212122

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


models = {
    'Random Forest Classifier': RandomForestClassifier(),
    'Support Vector Classifier': SVC(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Logistic Regression' : LogisticRegression(),
    
#     'Gaussian Naive Bayes Classifier': GaussianNB(),
#     'Multinomial Naive Bayes Classifier': MultinomialNB()
}

for name, model in models.items():
    clf = Pipeline([
        ('vectorizer', CountVectorizer()),  # Assuming your data is text
        (name, model)
    ])
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    
    print(f'{name} : {accuracy*100}')


Random Forest Classifier : 90.27777777777779
Support Vector Classifier : 87.12121212121212
Decision Tree Classifier : 81.81818181818183


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression : 91.72979797979798


In [45]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(x_train_vec, y_train,)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
lr_model.score(x_test_vec, y_test)

0.9172979797979798

In [47]:
y_pred = lr_model.predict(x_test_vec)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       794
           1       0.92      0.92      0.92       790

    accuracy                           0.92      1584
   macro avg       0.92      0.92      0.92      1584
weighted avg       0.92      0.92      0.92      1584



In [48]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])
clf.fit(x_train, y_train)

In [49]:
clf.score(x_test,y_test)

0.8983585858585859

In [132]:
# clf.fit(x_train, y_train)
x_train_vec.shape[1]

59808

In [50]:
from nltk.corpus import wordnet

In [69]:
syn = wordnet.synsets('alter')

In [70]:
print("Synset name: ", syn[0].name())
print("\nSynset meaning: ", syn[0].definition())
print("\nSynset example: ", syn[0].examples())

Synset name:  change.v.01

Synset meaning:  cause to change; make different; cause a transformation

Synset example:  ['The advent of the automobile may have altered the growth pattern of the city', 'The discussion has changed my thinking about the issue']
