In [1]:
import numpy as np
import pandas as pd
from ngram_modeling import MarkovChain

In [2]:
# Load the cleaned news articles (no header row; cells equal to '.' are read as missing)
articles_df = pd.read_csv('news_cleaned.csv', header=None, na_values=['.'], encoding='latin-1')

# Flatten the table into a 1-D array of cell values
article_cells = np.array(articles_df).flatten()

# Concatenate the articles into a single corpus string.
# str.join raises TypeError on non-str items, so skip cells that were parsed as
# missing (NaN) and stringify anything read_csv inferred as a number.
data = ' '.join(str(cell) for cell in article_cells if pd.notnull(cell))

# Set the grams and designated text size
n_grams = 5
text_size = 200

# Instantiate the Markov chain with the corpus, n-gram size and text size
markov = MarkovChain(data, n_grams, text_size)

In [122]:
# Sample text from the chain and keep its first 10 space-separated tokens as a headline
output = markov.generateText()
output_parts = output.split(' ')[0:10]
headline = ' '.join(output_parts)
print(headline)

bailey circus is shutting down  after dwindling attendance and


In [118]:
%matplotlib inline
import matplotlib.pyplot as plt
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import json
import numpy as np
import pandas as pd

In [119]:
# Endpoint that get_json_str() POSTs headline form data to
api_url = 'http://localhost:8080/fakebox/check' # Set destination URL here

In [120]:
def get_json_str(title, content=None, url=None):
    """POST a headline, plus optional article body and URL, to ``api_url``.

    Args:
        title: Headline text; always sent as the ``title`` form field.
        content: Optional article body; sent as ``content`` when truthy.
        url: Optional article URL; sent as ``url`` when truthy.

    Returns:
        The response body decoded to ``str``.

    Raises:
        urllib.error.URLError: If the request to ``api_url`` fails.
    """
    # Send every optional field that was supplied. The earlier if/elif chain
    # silently dropped `url` whenever `content` was missing.
    post_fields = {'title': title}
    if content:
        post_fields['content'] = content
    if url:
        post_fields['url'] = url
    request = Request(api_url, urlencode(post_fields).encode())
    # Close the HTTP response deterministically instead of leaking the connection.
    with urlopen(request) as response:
        return response.read().decode()

In [121]:
def parse_json_str(json_str, content=None, url=None):
    """Unpack a checker JSON response into a flat 5-tuple.

    Args:
        json_str: JSON text with a ``title`` object holding ``score`` and
            ``decision``; ``content`` and ``domain`` objects are read only
            when the matching flag below is truthy.
        content: Truthy if the ``content`` section should be read.
        url: Truthy if the ``domain`` section should be read.

    Returns:
        ``(title_score, title_decision, content_score, content_decision,
        domain_category)``. The content pair is ``(0, 0)`` when ``content``
        is falsy, and the domain category is ``None`` when ``url`` is falsy.

    Raises:
        KeyError: If a section that is read is absent from the response.
    """
    payload = json.loads(json_str)

    title_section = payload['title']
    title_score, title_decision = title_section['score'], title_section['decision']

    # Placeholders are used when no article body was submitted.
    content_score, content_decision = 0, 0
    if content:
        content_section = payload['content']
        content_score, content_decision = content_section['score'], content_section['decision']

    domain_category = payload['domain']['category'] if url else None

    return title_score, title_decision, content_score, content_decision, domain_category

In [250]:
# Rejection-sample: keep drawing 15-token headlines from the chain until the
# checker's title score is no longer below 0.94.
title_fake_score = 0
while True:
    output = markov.generateText()
    output_parts = output.split(' ')[0:15]
    headline = ' '.join(output_parts)
    title = headline
    json_str = get_json_str(title)
    (title_fake_score, title_pred_type, content_fake_score,
     content_pred_type, domain_type) = parse_json_str(json_str)
    if not title_fake_score < 0.94:
        break

print(title_fake_score)
print(title)

0.9542144536972046
former turkish minister for european union affairs in one email in 2013  mr alptekin


In [124]:
# Display the current value of the title score
title_fake_score

0.48547863960266113

In [251]:
import pickle

# Persist the chain, wrapped in a one-element list, to disk
filename = 'headline_generator.pkl'
with open(filename, 'wb') as f:
    f.write(pickle.dumps([markov]))

In [257]:
# Reload the pickled one-element list and take the chain out of it.
# NOTE(review): unpickling can execute arbitrary code; only load files you created.
filename = 'headline_generator.pkl'
with open(filename, 'rb') as f:
    markov_test = pickle.loads(f.read())[0]

In [262]:
def generate_fake_headlines(markov, n, threshold=0.94, max_words=15,
                            api_url='http://localhost:8080/fakebox/check',
                            max_attempts=None):
    """Generate ``n`` Markov-chain headlines that the checker scores at or above ``threshold``.

    Each candidate is the first ``max_words`` space-separated tokens of
    ``markov.generateText()``. It is POSTed to ``api_url`` as the ``title``
    form field and kept when ``title.score`` in the JSON reply is not below
    ``threshold``; otherwise another candidate is drawn.

    Args:
        markov: Object exposing ``generateText()`` that returns a string.
        n: Number of headlines to return.
        threshold: Minimum accepted title score.
        max_words: Number of leading tokens kept for each headline.
        api_url: Endpoint the candidate titles are POSTed to.
        max_attempts: Optional cap on candidates tried per headline. ``None``
            (the default) keeps retrying until a candidate is accepted.

    Returns:
        A list of ``n`` accepted headline strings.

    Raises:
        RuntimeError: If ``max_attempts`` candidates in a row are rejected.
        KeyError: If a reply has no ``title.score``.
        urllib.error.URLError: If a request to ``api_url`` fails.
    """
    def score_title(title):
        """POST one title and return the title score from the JSON reply."""
        request = Request(api_url, urlencode({'title': title}).encode())
        # Close each response explicitly; this runs once per candidate.
        with urlopen(request) as response:
            reply = json.loads(response.read().decode())
        return reply['title']['score']

    def generate_one_headline():
        """Draw candidates until one reaches the threshold, then return it."""
        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            title = ' '.join(markov.generateText().split(' ')[:max_words])
            if not score_title(title) < threshold:
                return title
        raise RuntimeError(
            'no headline reached score {} after {} attempts'.format(threshold, max_attempts)
        )

    return [generate_one_headline() for _ in range(n)]

In [263]:
# Generate 10 accepted headlines from the unpickled chain (each candidate is scored over HTTP)
headlines = generate_fake_headlines(markov_test, n=10)

In [264]:
# Display the generated headlines
headlines

['nearly 500  000 in political contributions from gov terry mcauliffe  a key ally',
 'wednesday  seven people were killed in northwestern pakistan in two suicide bombings  one',
 'unemployment rate  at 4 8 percent in january  is in a range fed',
 'valeant reported a 2 4 billion loss last monday  mr ackman had had enough',
 'bronx  42 percent of those interacting queens and 31 percent of those in brooklyn',
 'owns more than 20  000 kinds and approximately 14 million square feet of office',
 'firebombed r o t c centers a small number  like the weathermen  took',
 'in the provinces of ontario  quebec  alberta and british columbia while their targets',
 'plant in the countrys remote north  near the russian border a russian nationalist parliamentarian',
 'markets  in afghanistan  dozens of people were killed in more than 80 wounded']

In [294]:
# Read the labelled news dataset, forcing the title and text columns to str
test_df = pd.read_csv(
    'real_or_fake.csv',
    dtype={'title': str, 'text': str},
)

In [295]:
# Keep only the rows whose label is REAL
test_df = test_df[test_df['label'].eq('REAL')]

In [310]:
# Take the first 5 REAL rows whose title has more than 10 space-separated tokens.
# The filter is computed here from test_df so this cell does not rely on a
# `filtered_df` left in the kernel by a cell that appears further down.
long_title_mask = test_df['title'].str.split(' ').str.len() > 10
filtered_df = test_df.loc[long_title_mask].head(5)
real_headlines = filtered_df['title'].tolist()

In [311]:
# Lower-case every real headline
real_headlines = [headline_text.lower() for headline_text in real_headlines]

In [312]:
# Display the lower-cased real headlines
real_headlines

['iran reportedly makes new push for uranium concessions in nuclear talks',
 'with all three clintons in iowa, a glimpse at the fire that has eluded hillary clinton’s campaign',
 'the 1 chart that explains everything you need to know about partisanship in america',
 'new senate majority leader’s main goal for gop: don’t be scary',
 'sanders trounces clinton in w. va. -- but will it make a difference?']

In [313]:
# Generated headlines first, then the real ones
all_headlines = [*headlines, *real_headlines]

In [314]:
# Upper-case every headline so both groups share the same casing
all_headlines = [headline_text.upper() for headline_text in all_headlines]

In [315]:
# Display the combined, upper-cased headlines
all_headlines

['NEARLY 500  000 IN POLITICAL CONTRIBUTIONS FROM GOV TERRY MCAULIFFE  A KEY ALLY',
 'WEDNESDAY  SEVEN PEOPLE WERE KILLED IN NORTHWESTERN PAKISTAN IN TWO SUICIDE BOMBINGS  ONE',
 'UNEMPLOYMENT RATE  AT 4 8 PERCENT IN JANUARY  IS IN A RANGE FED',
 'VALEANT REPORTED A 2 4 BILLION LOSS LAST MONDAY  MR ACKMAN HAD HAD ENOUGH',
 'BRONX  42 PERCENT OF THOSE INTERACTING QUEENS AND 31 PERCENT OF THOSE IN BROOKLYN',
 'OWNS MORE THAN 20  000 KINDS AND APPROXIMATELY 14 MILLION SQUARE FEET OF OFFICE',
 'FIREBOMBED R O T C CENTERS A SMALL NUMBER  LIKE THE WEATHERMEN  TOOK',
 'IN THE PROVINCES OF ONTARIO  QUEBEC  ALBERTA AND BRITISH COLUMBIA WHILE THEIR TARGETS',
 'PLANT IN THE COUNTRYS REMOTE NORTH  NEAR THE RUSSIAN BORDER A RUSSIAN NATIONALIST PARLIAMENTARIAN',
 'MARKETS  IN AFGHANISTAN  DOZENS OF PEOPLE WERE KILLED IN MORE THAN 80 WOUNDED',
 'IRAN REPORTEDLY MAKES NEW PUSH FOR URANIUM CONCESSIONS IN NUCLEAR TALKS',
 'WITH ALL THREE CLINTONS IN IOWA, A GLIMPSE AT THE FIRE THAT HAS ELUDED HILLARY CL

In [279]:
# One 'f' per generated headline followed by one 'r' per real headline.
# Counts come from the lists themselves so the labels cannot drift out of step
# with all_headlines (headlines + real_headlines) if either size changes.
labels = ['f'] * len(headlines) + ['r'] * len(real_headlines)

In [280]:
# Display the label list
labels

['f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'r', 'r', 'r', 'r', 'r']

In [316]:
# Column mapping for the output table: headline text and its f/r label
data = dict(headlines=all_headlines, labels=labels)

In [317]:
# Build the headlines/labels table from the column mapping
foo = pd.DataFrame(data=data)

In [318]:
# Write the table with the index column named 'id': the cells that read this
# file back use index_col='id', which fails if the index header is left blank.
foo.to_csv('headlines.csv', index_label='id')

In [307]:
# Collect the index labels of rows whose title has more than 10 space-separated tokens.
# Vectorised string ops replace the row-by-row iterrows() loop and treat a missing
# title as "not long" instead of raising AttributeError on a NaN.
title_token_counts = test_df['title'].str.split(' ').str.len()
filtered_indices = test_df.index[(title_token_counts > 10).values].tolist()

In [308]:
# Select the rows of test_df whose index labels were collected in filtered_indices
filtered_df = test_df.loc[filtered_indices]

In [309]:
# Number of rows that passed the title-length filter
len(filtered_df)

1234

In [321]:
# Reload the saved headlines with the 'id' column as the index. The dtype keys
# now name the columns actually read below ('headlines' and 'labels').
df = pd.read_csv('headlines.csv', index_col='id', dtype={'headlines': str, 'labels': str})
# randint's upper bound is exclusive, so len(df) - 1 could never select the last row.
random_id = np.random.randint(0, len(df))

In [323]:
# Look up the sampled row by its id label and pull out the headline text and its label
chosen_row = df.loc[random_id]
headline, label = chosen_row['headlines'], chosen_row['labels']

In [324]:
# Display the sampled headline
headline

'OWNS MORE THAN 20,000 KINDS AND APPROXIMATELY 14 MILLION SQUARE FEET OF OFFICE'

In [325]:
def get_random_headline(filename):
    """Return one uniformly sampled ``(headline, label)`` pair from a headlines CSV.

    Args:
        filename: Path to a CSV with an ``id`` column (used as the index) and
            ``headlines`` and ``labels`` columns.

    Returns:
        Tuple ``(headline, label)`` taken from the sampled row.

    Raises:
        ValueError: If the file contains no data rows.
    """
    df = pd.read_csv(filename, index_col='id', dtype={'headlines': str, 'labels': str})
    if df.empty:
        raise ValueError('no headlines found in {!r}'.format(filename))
    # randint's upper bound is exclusive: len(df) - 1 could never pick the last
    # row and raised for a single-row file.
    position = np.random.randint(0, len(df))
    # Select by position so the draw does not depend on ids being exactly 0..n-1.
    chosen_row = df.iloc[position]
    return chosen_row['headlines'], chosen_row['labels']

In [326]:
# Draw one random (headline, label) pair from the saved CSV
headline, label = get_random_headline('headlines.csv')

In [327]:
# Print the sampled headline followed by its label
print(headline, label)

PLANT IN THE COUNTRYS REMOTE NORTH  NEAR THE RUSSIAN BORDER A RUSSIAN NATIONALIST PARLIAMENTARIAN f
