In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv('evaluationFakenews.csv', encoding='latin1')
df.head()

Unnamed: 0,title,text,label
0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1
1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1
2,Oregon Cop Convicted Of Shattering Bikers Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0


In [41]:
def clean_text(texts):
    c_text = []
    for text in texts:
        # Replace special characters
        text = text.replace('\x92', "'")
        text = text.replace('\x93', '"')
        text = text.replace('\x94', '"')

        sent_list = sent_tokenize(text)

        cleaned_sentences = []
        for t_sent in sent_list:
            c_sent = re.sub(r'[^\w.]', ' ', t_sent)
            cleaned_sentences.append(c_sent)

        # Join cleaned sentences into a single text
        cleaned_text = ' '.join(cleaned_sentences)
        c_text.append(cleaned_text)

    return c_text

In [43]:
clean_text_list = clean_text(list(df['text']))
clean_title_list = clean_text(list(df['title']))


ori = df['text'][0]
clean = clean_text_list[0]

print(f'Orignal Text: {ori}\n\n Clean Text: {clean}')

Orignal Text: WASHINGTON (Reuters) - Democratic U.S. presidential hopeful Bernie Sanders brought his firebrand rhetoric back to the floor of the Senate on Tuesday to condemn a White House-backed bill on Puerto Ricos financial crisis as colonialism at its worst. Sanders, a self-described democratic socialist who turned an unlikely presidential bid into a political movement to combat inequality, warned that legislation due for a crucial Senate vote on Wednesday would subject Puerto Rico to Republican trickle-down economics and favor vulture capitalists at the expense of the islands increasingly impoverished population. An aide said it was the first time Sanders has spoken in the Senate since December. Does that sound like the kind of morality that should be passed here in the United States Senate? Sanders fumed during an eight-minute appearance to support an hours-long speech by Democratic Senator Robert Menendez of New Jersey, who opposes the bill. The Vermont senator used his a

In [45]:
def combine_text(text_list1,text_list2):
  news_list = []

  for title , text in zip(text_list1, text_list2):
    texts = title+' '+text
    news_list.append(texts)
  return news_list

In [47]:
news = combine_text(clean_title_list,clean_text_list)
news[0]

'Sanders back in U.S. Senate  blasts  colonialism  in Puerto Rico WASHINGTON  Reuters    Democratic U.S. presidential hopeful Bernie Sanders brought his firebrand rhetoric back to the floor of the Senate on Tuesday to condemn a White House backed bill on Puerto Rico s financial crisis as  colonialism at its worst.  Sanders  a self described democratic socialist who turned an unlikely presidential bid into a political movement to combat inequality  warned that legislation due for a crucial Senate vote on Wednesday would subject Puerto Rico to Republican trickle down economics and favor  vulture capitalists  at the expense of the island s increasingly impoverished population. An aide said it was the first time Sanders has spoken in the Senate since December.  Does that sound like the kind of morality that should be passed here in the United States Senate   Sanders fumed during an eight minute appearance to support an hours long speech by Democratic Senator Robert Menendez of New Jersey  

In [49]:
df['news'] = news
df.head()

Unnamed: 0,title,text,label,news
0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1,Sanders back in U.S. Senate blasts coloniali...
1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1,Kremlin Syria peoples congress being active...
2,Oregon Cop Convicted Of Shattering Bikers Co...,"In a baffling fit of rage, an Oregon State Pol...",0,Oregon Cop Convicted Of Shattering Biker s Co...
3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0,Twitter Erupts With Glee Over CruzSexScandal...
4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0,MUST WATCH VIDEO Obama Tries To Trash Trump B...


In [51]:
def custom_word_tokenize(texts):
    stop_words = set(stopwords.words('english'))
    word_list = []
    tokenized_texts = []

    for text in texts:
        tokens = word_tokenize(text)
        for token in tokens:
          if token not in stop_words:
            word_list.append(token)
        tokenized_texts.append(word_list)
        word_list = []
    return tokenized_texts

In [55]:
tokens_list = custom_word_tokenize(df['news'])
tokens_list[0][0:11]

['Sanders',
 'back',
 'U.S.',
 'Senate',
 'blasts',
 'colonialism',
 'Puerto',
 'Rico',
 'WASHINGTON',
 'Reuters',
 'Democratic']

In [59]:
texts = [' '.join(token) for token in tokens_list]

In [63]:
tf_vectorizer = TfidfVectorizer()
X = tf_vectorizer.fit_transform(texts)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [85]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

classes = ['label 0 :fake', 'label 1 :real']
confus_matrix = pd.DataFrame(conf_matrix,
                            index = classes,
                             columns = classes
                            )
confus_matrix
print("Accuray:", accuracy)
print("Confusion Matrix:\n", confus_matrix)
print("Classification Report:\n", class_report)



Accuray: 0.958128078817734
Confusion Matrix:
                label 0 :fake  label 1 :real
label 0 :fake            731             33
label 1 :real             35            825
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96       764
           1       0.96      0.96      0.96       860

    accuracy                           0.96      1624
   macro avg       0.96      0.96      0.96      1624
weighted avg       0.96      0.96      0.96      1624



In [87]:
!pwd

/c/Users/ADMIN/Desktop/Pythonfolder
