# Fake news detector project.

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Opening and analyzing data

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Since there are null values, I have decided to remove them all for greater convenience.

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [7]:
df.shape

(18285, 5)

In order to find keywords within the texts, it is necessary to eliminate stopwords that do not add value. For this reason, we will download stopwords and then eliminate them to have a clean text full of relevant words for our model.

In [8]:
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#Download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
df = df.drop('id', axis=1)

In [12]:
df.head()

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


I will make a new column called 'content' that will contain the title of the news and its author, this column will be the one to which we will apply the elimination of worthless words, and then submit them to the model.

In [14]:
df['content'] = df['title'] + df['author']
df.head()

Unnamed: 0,title,author,text,label,content
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You FiredConsortiumnew...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [15]:
ps = PorterStemmer()

In [16]:
#Function to quit irrelevant text
def stemmer(content):
    stemmed_cont = re.sub('[^a-zA-Z]',' ', content)
    stemmed_cont = stemmed_cont.lower()
    stemmed_cont = stemmed_cont.split()
    stemmed_cont = [ps.stem(word) for word in stemmed_cont if not word in stopwords.words('english')]
    stemmed_cont = ' '.join(stemmed_cont)
    
    return stemmed_cont

In [17]:
df['content'] = df['content'].apply(stemmer)

In [18]:
df['content'].head()

0    hous dem aid even see comey letter jason chaff...
1    flynn hillari clinton big woman campu breitbar...
2               truth might get firedconsortiumnew com
3    civilian kill singl us airstrik identifiedjess...
4    iranian woman jail fiction unpublish stori wom...
Name: content, dtype: object

Now I will asign the x and y value for the model

In [19]:
x = df['content']
y = df['label'].values

In [20]:
df.head()

Unnamed: 0,title,author,text,label,content
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,hous dem aid even see comey letter jason chaff...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,flynn hillari clinton big woman campu breitbar...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,truth might get firedconsortiumnew com
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,civilian kill singl us airstrik identifiedjess...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jail fiction unpublish stori wom...


Now I'll vetorize the words to train the model.

In [21]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x)

In [22]:
x = vectorizer.transform(x)

In [23]:
print(x)

  (0, 20457)	0.2741046376369148
  (0, 17069)	0.2426153836578785
  (0, 11356)	0.3415830641618049
  (0, 11016)	0.279324877683043
  (0, 9976)	0.3173184375207729
  (0, 9865)	0.41077429174205615
  (0, 9010)	0.20727075804308753
  (0, 6454)	0.21801393050458376
  (0, 4923)	0.2540774581871974
  (0, 3726)	0.2355388396545462
  (0, 3116)	0.354508260960043
  (0, 361)	0.2562719040578664
  (1, 21929)	0.29963745600134867
  (1, 8724)	0.19058754221527793
  (1, 7163)	0.6958439148234276
  (1, 3516)	0.19096016583224892
  (1, 2797)	0.3732854017723977
  (1, 2349)	0.36040018720713196
  (1, 1849)	0.2898257387924506
  (2, 20363)	0.4177972820765128
  (2, 12168)	0.462906698285606
  (2, 7715)	0.32674837002578083
  (2, 7025)	0.6454408470021312
  (2, 3705)	0.296318132213445
  (3, 20855)	0.24265162713858263
  :	:
  (18282, 22234)	0.07836533196928752
  (18282, 19725)	0.19529960615266756
  (18282, 18923)	0.31801459318439085
  (18282, 16649)	0.2503095139646597
  (18282, 15651)	0.27217819865902854
  (18282, 15395)	0.2600

Split the data.

In [25]:
#Split the dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

### Model creation

In [26]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

### Model Accuracy

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [28]:
y_train_pred = classifier.predict(x_train) #Predict train results 
confm_test = confusion_matrix(y_train_pred, y_train)
accs_test = accuracy_score(y_train_pred, y_train)

print('Accuracy score of Train data: ', accs_test)
print('Confusion matrix: \n', confm_test)

Accuracy score of Train data:  0.9866694011484823
Confusion matrix: 
 [[8157   67]
 [ 128 6276]]


In [29]:
y_pred = classifier.predict(x_test) #Predict test results
confm_test = confusion_matrix(y_pred, y_test)
accs_test = accuracy_score(y_pred, y_test)

print('Accuracy score of test data: ', accs_test)
print('Confusion matrix: \n', confm_test)

Accuracy score of test data:  0.9734755263877495
Confusion matrix: 
 [[2003   24]
 [  73 1557]]


### Conclusions

The model had a decent accuracy score, looking at the confusion matrix we can see that there were problems regarding false negatives, so we should work more on it.