In [1]:
import pandas as pd 
import numpy as np 
import re 

# stopwords: are commonly used words in english, that are unnecessary (are, is, the, etc..)
from nltk.corpus import stopwords 

# PorterStemmer: to reduce words or stem a word to its base or root form. e.g eating -> eat, praying->pray
from nltk.stem.porter import PorterStemmer 

# TfidfVectorizer: TF-IDF (Term frequency - Inverse document frequency) Vectorizer
# TF (term frequency): How frequent a word(term) appears in a document
# IDF (Inverse document frequency): How important a word(term) is in the document

# TfidfVectorizer: Returns a matrix of TF-IDF values
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


### This exercise is complex because it has columns that are texts which computers do not understand. We treat as a natural language processing

### Downloading the stopwords

In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # Download stopwords data

# Get a list of English stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 60]
[nltk_data]     Operation timed out>


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/ihon/nltk_data'
    - '/Users/ihon/anaconda3/nltk_data'
    - '/Users/ihon/anaconda3/share/nltk_data'
    - '/Users/ihon/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [3]:
news_data = pd.read_csv('train.csv')

In [4]:
news_data.head()

# label 0: real news 
# label 1: fake news 

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
news_data.shape

(20800, 5)

### Check for missing values in dataframe & fill with empty strings

In [6]:
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Replacing the Null values with empty strings 
news_data = news_data.fillna('')

In [8]:
news_data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

#### We will make use of the title & author column to make predictions, to  save time, cause the text column are large and might require longer processing time 

In [None]:
# Combining the title & author column

In [9]:
news_data['content'] = news_data['author'] + " " + news_data['title']

In [10]:
news_data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


# Stemming

In [None]:
# Reducing a word to its root word, e.g: actor, actress, acting -> act

In [13]:
port_stem = PorterStemmer()

In [17]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
#     stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = [port_stem.stem(word) for word in stemmed_content]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [18]:
# Apply the stemming function to the content column 

news_data['content'] = news_data['content'].apply(stemming)

In [19]:
news_data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,darrel lucu hous dem aid we didn t even see co...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,daniel j flynn flynn hillari clinton big woman...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,consortiumnew com whi the truth might get you ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,jessica purkiss civilian kill in singl us airs...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,howard portnoy iranian woman jail for fiction ...


In [20]:
# Seperating the data from the target/label

X = news_data['content'].values
y = news_data['label'].values

In [22]:
print(X)

['darrel lucu hous dem aid we didn t even see comey s letter until jason chaffetz tweet it'
 'daniel j flynn flynn hillari clinton big woman on campu breitbart'
 'consortiumnew com whi the truth might get you fire' ...
 'michael j de la merc and rachel abram maci s is said to receiv takeov approach by hudson s bay the new york time'
 'alex ansari nato russia to hold parallel exercis in balkan'
 'david swanson what keep the f aliv']


### Converting textual data to numerical data 

In [23]:
vectorizer = TfidfVectorizer()

In [24]:
vectorizer.fit(X)

In [25]:
X = vectorizer.transform(X)

In [27]:
print(X)

  (0, 16591)	0.19185657544008947
  (0, 16060)	0.28379135534554073
  (0, 15767)	0.2530073032987084
  (0, 13533)	0.22790561725543848
  (0, 8953)	0.3229500947064574
  (0, 8673)	0.25946859467111033
  (0, 7734)	0.22014490291665442
  (0, 7642)	0.15585882712957794
  (0, 7041)	0.1942886495938918
  (0, 4997)	0.20723456165570228
  (0, 4028)	0.26352190272448234
  (0, 3811)	0.2402904504663692
  (0, 3619)	0.31966152954272303
  (0, 2977)	0.21925031305348408
  (0, 2501)	0.32655231027754433
  (0, 272)	0.2399067450673633
  (1, 16897)	0.29690287265706966
  (1, 10754)	0.15877335700396866
  (1, 6850)	0.1880499691597717
  (1, 5528)	0.7052687008120976
  (1, 3587)	0.2603921904305678
  (1, 2831)	0.1885236047225887
  (1, 2241)	0.3778771044604833
  (1, 1910)	0.15325078862748648
  (1, 1512)	0.29025992042009807
  :	:
  (20797, 8407)	0.21643347101798308
  (20797, 7614)	0.12723270154056254
  (20797, 7079)	0.21135740479289314
  (20797, 3662)	0.20511774287709164
  (20797, 2149)	0.14695085567302218
  (20797, 1298)	0.3

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

### Training the model (Logistic Regression)

In [29]:
model = LogisticRegression()

In [30]:
model.fit(X_train, Y_train)

In [31]:
# Check model prediction accuracy for training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print('Accuracy score:', training_data_accuracy)

Accuracy score: 0.9865985576923076


In [33]:
# Check model prediction accuracy for test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Accuracy score:', test_data_accuracy)

Accuracy score: 0.9776442307692308


# Making a predictive system

In [39]:
X_new = X_test[0]

prediction = model.predict(X_new)

print(prediction) # Compare with Y_test[0]

[1]
