In [23]:
# import pandas module
import pandas as pd
# Load the cleaned dataset: one row per article, with its `text` and a
# `fake_news` label column.
df = pd.read_csv("cleaned_news.csv")
# Print a truncated preview (pandas elides the middle rows) plus the shape.
print(df)

text  fake_news
0      Donald Trump just couldn t wish all Americans ...          1
1      House Intelligence Committee Chairman Devin Nu...          1
2      On Friday, it was revealed that former Milwauk...          1
3      On Christmas day, Donald Trump announced that ...          1
4      Pope Francis used his annual Christmas Day mes...          1
...                                                  ...        ...
44864  NATO allies on Tuesday welcomed President Dona...          0
44865  LexisNexis, a provider of legal, regulatory an...          0
44866  In the shadow of disused Soviet-era factories ...          0
44867  Vatican Secretary of State Cardinal Pietro Par...          0
44868  Indonesia will buy 11 Sukhoi fighter jets wort...          0

[44869 rows x 2 columns]


In [None]:
# Split the data into features and labels.
DV = "fake_news"  # dependent variable (label column); `text` is the only feature
X = df.drop(columns=[DV])  # everything except the label, i.e. the article text
y = df[DV]  # 0 is real news, 1 is fake news

# Hold out 25% of the articles for testing and train on the remaining 75%.
from sklearn.model_selection import train_test_split

# A fixed seed makes the split (and every score computed from it) reproducible
# across kernel restarts; stratifying on the label keeps the real/fake ratio
# the same in the train and test partitions.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)



In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words vectorizer whose vocabulary is capped at the 5000 most
# frequent terms (max_features is a knob worth experimenting with).
count_vect = CountVectorizer(max_features = 5000)
# Learn the vocabulary from the training text and encode it as term counts.
X_train_counts = count_vect.fit_transform(X_train["text"])
# print(count_vect.vocabulary_)  # uncomment to inspect the learned bag of words
# Encode the test text with the already-fitted vocabulary: transform only, never
# re-fit, so the test set cannot influence the features.
# NOTE(review): this rebinds X_test from the text DataFrame to the count matrix,
# so re-running this cell without re-running the split cell first will fail on
# X_test["text"].
X_test = count_vect.transform(X_test["text"])

# print(X_train_counts)  # uncomment to see (document, term) pairs and their counts
# print(X_test)

In [None]:

# Multinomial Naive Bayes: from the training counts the model estimates how
# likely each vocabulary word is to appear in a fake article and in a real one,
# i.e. P(word | class). To classify an article it combines those per-word
# likelihoods with the class prior via Bayes' rule, treating words as
# independent given the class (the "naive" assumption), and predicts whichever
# class ends up with the higher posterior probability.
from sklearn.naive_bayes import MultinomialNB

# Fit the classifier on the training count matrix.
Naive = MultinomialNB()
Naive.fit(X_train_counts, y_train)

# Score the model on the held-out test portion with sklearn's accuracy_score.
from sklearn.metrics import accuracy_score

# Predict labels for the vectorized test articles (X_test holds the test count
# matrix at this point, not the raw text).
predictions_NB = Naive.predict(X_test)

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Naive Bayes tends to do well on bag-of-words text even though the
# word-independence assumption is a simplification.
print("Accuracy Score:", accuracy_score(y_test, predictions_NB) * 100)  # percent of test articles labelled correctly


In [None]:
# Real-life article prediction: classify a story taken from The Onion.
# link: https://entertainment.theonion.com/drake-fans-accuse-kenny-chesney-of-manipulating-billboa-1843484082
onion = ["""Calling the country singer’s place at the top of Top 200 completely illegitimate, fans of the rapper–singer Drake took to social media Friday to accuse Kenny Chesney of manipulating Billboard’s algorithm by putting effort into his album. “It’s just unfair that this guy could keep Drake from his rightful place on the charts by putting out quality music that he actually cares about,” said Aiden Howard, 14, who echoed the sentiments of Drake fans worldwide in his assertion that the artist’s mediocre B-sides deserved more acclaim and recognition. “He clearly gamed the streaming numbers when he decided to put time and energy into his craft. It’s such horseshit that Billboard rewards that behavior and punishes Drizzy for making a half-assed mixtape full of songs he’d already dropped on SoundCloud. How the hell is ‘Toosie Slide’ going to compare to a song that the artist thought about for more than 15 minutes?” At press time, Drake released a statement asking fans to ignore Kenny Chesney and focus on the horseshit that he just released."""]

# Encode the article with the vocabulary fitted on the training text
# (transform only, no re-fitting).
onion_vec = count_vect.transform(onion)
# Predict the label for the single article in the list.
predict_onion = Naive.predict(onion_vec)
print(predict_onion) # label array: 1 means fake news, 0 means real news


In [None]:
# Real-life article prediction: classify a story taken from The New York Times.
# link: https://www.nytimes.com/2020/05/16/us/politics/linick-investigation-pompeo.html?action=click&module=Top%20Stories&pgtype=Homepage
nyt = ["""Two top congressional Democrats opened an investigation on Saturday into President Trump’s removal of 
          Steve A. Linick, who led the office of the inspector general at the State Department, citing a pattern 
          of “politically-motivated firing of inspectors general.” Mr. Trump told Speaker Nancy Pelosi late 
          Friday night that he was ousting Mr. Linick, who was named by President Barack Obama to the State 
          Department post, and replacing him with an ambassador with close ties to Vice President Mike Pence in 
          the latest purge of inspectors general whom Mr. Trump has deemed insufficiently loyal to his 
          administration. In letters to the White House, State Department, and Mr. Linick, Representative Eliot 
          L. Engel of New York, the chairman of the House Foreign Affairs Committee, and Senator Bob Menendez of 
          New Jersey, the top Democrat on the Senate Foreign Relations Committee, requested that the administration
          turn over records and information related to the firing of Mr. Linick as well as “records of all I.G. 
          investigations involving the Office of the Secretary that were open, pending, or incomplete at the 
          time of Mr. Linick’s firing.” Mr. Engel and Mr. Menendez said in their letters that they believe 
          Secretary of State Mike Pompeo recommended Mr. Linick’s ouster because he had opened an investigation 
          into Mr. Pompeo’s conduct. The lawmakers did not provide any more details, but a Democratic aide said 
          that Mr. Linick had been looking into whether Mr. Pompeo had misused a political appointee at the State 
          Department to perform personal tasks for himself and his wife. “Such an action, transparently designed to
          protect Secretary Pompeo from personal accountability, would undermine the foundation of our democratic 
          institutions and may be an illegal act of retaliation,” the lawmakers wrote. Under law, the administration
          must notify Congress 30 days before formally terminating an inspector general. Mr. Linick is expected to 
          leave his post then. Mr. Trump’s decision to remove Mr. Linick is the latest in a series of ousters aimed
          at inspectors general who the president and his allies believe are opposed to his agenda. In May, Mr. 
          Trump moved to oust Christi A. Grimm, the principal deputy inspector general for the Department of Health
          and Human Services, whose office had issued a report revealing the dire state of the nation’s response to
          the pathogen. He has also taken steps to remove two other inspectors general, for the intelligence
          community and for the Defense Department. Mr. Linick was spotlighted during the impeachment inquiry when 
          he requested an urgent meeting with congressional staff members to give them copies of documents related 
          to the State Department and Ukraine, signaling they could be relevant to the House investigation into 
          whether President Trump pressured Ukraine to investigate former Vice President Joseph R. Biden Jr. and 
          his son Hunter Biden. The documents — a record of contacts between Rudolph W. Giuliani, the president’s 
          personal lawyer, and Ukrainian prosecutors, as well as accounts of Ukrainian law enforcement proceedings 
          — turned out to be largely inconsequential."""]

# Encode the article with the vocabulary fitted on the training text
# (transform only, no re-fitting).
nyt_vec = count_vect.transform(nyt)
# Predict the label for the single article in the list.
predict_nyt = Naive.predict(nyt_vec)
# Label array: 1 means fake news, 0 means real news.
print(predict_nyt)

In [45]:
# Helper summarizing the modeling steps: vectorize the story, predict, label.
def classifier(text, model=None, vectorizer=None):
    """Label a news story as fake or real.

    Parameters
    ----------
    text : str or iterable of str
        The story to classify. A bare string is treated as a single document;
        for an iterable of documents only the first one determines the result.
    model : fitted classifier, optional
        Object with a ``predict`` method. Defaults to the notebook's
        already-fitted ``Naive`` model, so nothing is retrained per call.
    vectorizer : fitted vectorizer, optional
        Object with a ``transform`` method. Defaults to the notebook's fitted
        ``count_vect``.

    Returns
    -------
    str
        ``"Fake News Story"`` if the predicted label is truthy (1), otherwise
        ``"Real News Story"``.

    Raises
    ------
    ValueError
        If ``text`` contains no documents.
    """
    # A vectorizer's transform expects an iterable of documents and rejects a
    # bare string, so wrap one here instead of making every caller do it.
    documents = [text] if isinstance(text, str) else list(text)
    if not documents:
        raise ValueError("classifier() needs at least one document to classify")

    # Resolve the defaults at call time so the function picks up whichever
    # fitted objects the notebook currently holds.
    if vectorizer is None:
        vectorizer = count_vect
    if model is None:
        model = Naive

    word_vec = vectorizer.transform(documents)
    predict = model.predict(word_vec)
    return "Fake News Story" if predict[0] else "Real News Story"


In [47]:
# Run both example articles through the helper and show the verdict for each.
print(f"NYT: {classifier(nyt)}")
print(f"Onion: {classifier(onion)}")

NYT: Real News Story
Onion: Fake News Story
