In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("FakeNews.csv")

In [3]:
df.shape

(4048, 6)

In [4]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label,Unnamed: 4,Unnamed: 5
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,,
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,,
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,,
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,,
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4048 entries, 0 to 4047
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   URLs        4048 non-null   object
 1   Headline    4027 non-null   object
 2   Body        3994 non-null   object
 3   Label       4009 non-null   object
 4   Unnamed: 4  4 non-null      object
 5   Unnamed: 5  2 non-null      object
dtypes: object(6)
memory usage: 189.9+ KB


In [6]:
df.columns.values[4:]

array(['Unnamed: 4', 'Unnamed: 5'], dtype=object)

In [7]:
df.isnull().sum()

URLs             0
Headline        21
Body            54
Label           39
Unnamed: 4    4044
Unnamed: 5    4046
dtype: int64

In [8]:
df = df.drop(df.columns.values[4:],axis=1)
df

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
...,...,...,...,...
4043,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4044,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4045,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",,0
4046,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1


In [9]:
#Deleting any row with incomplete data

data=df.dropna()

In [10]:
#Checking if any incomplete row-entries remain

data.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [11]:
data['News'] = data['Headline'] + " " + data['Body']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['News'] = data['Headline'] + " " + data['Body']


In [12]:
data.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald Trump Ima...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [13]:
#Drop the rest of the columns apart from Label and News
data = data.drop(columns=['URLs','Headline','Body'])

In [14]:
data.head()

Unnamed: 0,Label,News
0,1,Four ways Bob Corker skewered Donald Trump Ima...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...


## Data Preprocessing

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


In [16]:
sw=stopwords.words('english')
ss=SnowballStemmer('english')

In [17]:
def data_cleaning(document):
    document=document.lower()
    
    #tokenizing
    words=word_tokenize(document)
    
    #stopwords removal and stemming
    new_words=[ss.stem(word) for word in words if word not in sw and len(word)>1]
    
    #convert back to sentence
    cleaned_sentence=" ".join(new_words)
    return cleaned_sentence 

In [18]:
data['Cleaned News'] = data['News'].apply(data_cleaning)

In [19]:
data.head()

Unnamed: 0,Label,News,Cleaned News
0,1,Four ways Bob Corker skewered Donald Trump Ima...,four way bob corker skewer donald trump imag c...
1,1,Linklater's war veteran comedy speaks to moder...,linklat 's war veteran comedi speak modern ame...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...,trump fight corker jeopard legisl agenda feud ...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...,egypt 's cheiron win tie-up pemex mexican onsh...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...,jason aldean open snl vega tribut countri sing...


In [20]:
X = data['Cleaned News'].values
y = data['Label'].values

## Splitting Training and Testing data

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
train_test_split?

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X_train.shape

(3190,)

In [25]:
X_test.shape

(798,)

## Vectorization

In [26]:
from sklearn.feature_extraction.text import CountVectorizer


In [27]:
cv=CountVectorizer(max_features=7000)

In [28]:
X_train = cv.fit_transform(X_train).toarray()

In [29]:
X_train.shape

(3190, 7000)

In [30]:
X_test = cv.transform(X_test).toarray()

In [31]:
X_test.shape

(798, 7000)

In [32]:
cv.get_feature_names()

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '101',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '118',
 '11th',
 '12',
 '120',
 '125',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '140',
 '15',
 '150',
 '155',
 '16',
 '16th',
 '17',
 '170',
 '17th',
 '18',
 '180',
 '19',
 '1923',
 '1945',
 '1950',
 '1950s',
 '1953',
 '1958',
 '1960s',
 '1961',
 '1965',
 '1966',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1972',
 '1973',
 '1974',
 '1975',
 '1977',
 '1978',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1985',
 '1986',
 '1987',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '1b',
 '1st',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023',
 '2025',
 '2030',
 '2049',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '23rd',


In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [34]:
# model = MultinomialNB()
model = LogisticRegression()

In [35]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## Prediction

In [36]:
y_pred = model.predict(X_test)

In [37]:
y_pred[:5]

array(['1', '0', '1', '1', '1'], dtype=object)

In [38]:
y_test[:5]

array(['1', '0', '1', '1', '1'], dtype=object)

## Evaluation

In [39]:
model.score(X_test, y_test)

0.974937343358396

In [41]:
## Max_features=10000 : Model Accuracy=94.61
## Max_features=7000 : Model Accuracy=94.99
## Logistic Regression, Max_features=7000 : Model Accuracy=97.49