<a href="https://colab.research.google.com/github/hellen2021/Lux-Dev-Python-functions/blob/main/Fake_News_Detector_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1) Specifying the analytic question
Build a model that detects fake news.
## 2) Defining the metric of success
This project will be a success if the model achieves an accuracy of at least 80%.
## 3) Recording the experimental design
*   Load the dataset
*   Preview and Explore the dataset
*   Data Cleaning
*   Perform Exploratory Data Analysis(EDA)
*   Modelling
*   Evaluation of the model





In [None]:
# import libraries
import pandas as pd
import numpy as np


## Load the Dataset

In [None]:
# Read the CSV and replace the file's own header row with our column names.
# header=0 marks the first line as a header to be discarded, so it is never
# parsed as a data row (which would have to be dropped afterwards and would
# force every column to be read as strings). The resulting index is 0-based.
# NOTE: the third column is named 'test' because later cells refer to it by
# that name.
news = pd.read_csv(
    '/content/news.csv',
    delimiter=",",
    header=0,
    names=['ID', 'title', 'test', 'label'],
)

## Preview and Explore the data

In [None]:
# Preview the first five rows of the dataset
news.head(n=5)

Unnamed: 0,ID,title,test,label
1,8476.0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
2,10294.0,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3,3608.0,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
4,10142.0,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
5,875.0,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# Preview the last five rows of the dataset
news.tail(n=5)

Unnamed: 0,ID,title,test,label
6331,4490.0,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6332,8062.0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6333,8622.0,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6334,4021.0,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6335,4330.0,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


## Data Cleaning

In [None]:
# Distinct class labels present in the 'label' column
news['label'].unique()

array(['FAKE', 'REAL'], dtype=object)

In [None]:
# Import NLTK and download its 'punkt' resource.
# nltk.word_tokenize is applied in a later cell to break each text into
# individual linguistic units, i.e. words.
# NOTE(review): presumably 'punkt' is the tokenizer data word_tokenize needs — confirm for the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Pre-processing
# We first remove variance that is useless for the task at hand
# (letter case and punctuation) and encode the target as numbers.

# Converting the labels from strings to binary values for our classifier
news['label'] = news.label.map({'FAKE': 0, 'REAL': 1})

# Lower-case the title and the article text, then strip every character that
# is neither a word character nor whitespace (i.e. punctuation).
# regex=True is passed explicitly: without it, recent pandas versions treat
# the pattern as a literal string and no punctuation would be removed.
# The pattern is a raw string so that \w and \s are not read as (invalid)
# Python string escapes.
for column in ('title', 'test'):
    news[column] = (
        news[column]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
    )

  app.launch_new_instance()


In [None]:
# Pre-processing: tokenization
# Break each title and each article text into a list of single-word tokens
# using nltk.
for column in ('title', 'test'):
    news[column] = news[column].apply(nltk.word_tokenize)

In [None]:
# Stemming: normalize the text so that variations of a word are reduced to
# the same stem, regardless of tense.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token, preserving order."""
    return [stemmer.stem(token) for token in tokens]


news['title'] = news['title'].apply(stem_tokens)
news['test'] = news['test'].apply(stem_tokens)

In [None]:
# Turn the text into token-occurrence counts; these are the features that
# will be fed into the model.
from sklearn.feature_extraction.text import CountVectorizer

# Re-join each list of tokens into a single space-separated string
news['test'] = news['test'].map(' '.join)
news['title'] = news['title'].map(' '.join)

# Target labels, kept under the name `test` that later cells rely on
test = news['label']

# The count matrix is built from the article text column only
count_vect = CountVectorizer()
counts = count_vect.fit_transform(news['test'])


In [None]:
# Re-weight the raw occurrence counts as TF-IDF features.
from sklearn.feature_extraction.text import TfidfTransformer

# Fit and apply in one step; the fitted transformer stays in `transformer`.
transformer = TfidfTransformer()
counts = transformer.fit_transform(counts)

In [None]:
# Shape of the label Series that an earlier cell assigned to `test`
test.shape

(6335,)

In [None]:
# Training the Model
# Feature extraction is done, so the data is now divided into a training
# set and a held-out test set before the model is built.
# NOTE(review): `counts` was produced by a vectorizer and TF-IDF transformer
# fitted on the whole corpus in earlier cells, so the features already carry
# statistics from the rows held out here. Consider splitting before fitting.
from sklearn.model_selection import train_test_split

# 10% of the rows are held out; the seed is fixed so the split is repeatable
split = train_test_split(counts, test, test_size=0.1, random_state=6)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
model = classifier.fit(X_train, y_train)

In [None]:
# Evaluation: accuracy on the held-out split, computed two ways
from sklearn.metrics import accuracy_score

predicted = model.predict(X_test)

# Fraction of predictions that match the true labels
is_correct = predicted == y_test
print(np.mean(is_correct))

# The same figure from scikit-learn's helper
print(accuracy_score(y_test, predicted))

0.8091482649842271
0.8091482649842271
