In [88]:
%matplotlib inline
from urllib.parse import urlencode
from urllib.request import Request, urlopen
import json
import numpy as np

In [6]:
# Endpoint that get_json_str() sends its POST requests to; edit to target another host/port.
url = "http://localhost:8080/fakebox/check"

In [None]:
def get_json_str(title, content=None):
    """POST a title (and optionally article content) to the module-level ``url``.

    Parameters
    ----------
    title : str
        Headline to score; always sent as the ``title`` form field.
    content : str, optional
        Article body. Sent as the ``content`` form field only when truthy,
        so ``None`` and ``''`` both produce a title-only request.

    Returns
    -------
    str
        The response body, decoded with ``bytes.decode()``'s default codec.

    Notes
    -----
    Any exception raised by ``urlopen`` (e.g. the server being unreachable)
    propagates to the caller unchanged.
    """
    post_fields = {'title': title}
    if content:
        post_fields['content'] = content
    # Passing a request body makes urllib send a POST rather than a GET.
    request = Request(url, urlencode(post_fields).encode())
    # This helper is called once per row in long loops, so release each
    # HTTP response explicitly instead of leaving it to garbage collection.
    with urlopen(request) as response:
        return response.read().decode()

In [48]:
def parse_json_str(json_str, content=None):
    """Pull the score/decision pairs out of a JSON response string.

    Parameters
    ----------
    json_str : str
        JSON text with a ``title`` object (and, when content was submitted,
        a ``content`` object), each holding ``score`` and ``decision`` keys.
    content : str, optional
        The content that was submitted with the request. When falsy the
        ``content`` object is not read and both content values are ``0``.

    Returns
    -------
    tuple
        ``(title score, title decision, content score, content decision)``.
    """
    payload = json.loads(json_str)
    title_section = payload['title']
    title_result = (title_section['score'], title_section['decision'])
    if not content:
        # Nothing was submitted for the body: report zero placeholders.
        return title_result + (0, 0)
    content_section = payload['content']
    return title_result + (content_section['score'], content_section['decision'])

In [44]:
import pandas as pd

In [45]:
# Load the fake-news articles, indexed by uuid, reading the listed columns as strings.
df = pd.read_csv(
    filepath_or_buffer='data/filtered_fake.csv',
    index_col='uuid',
    dtype=dict.fromkeys(('title', 'text', 'main_img_url'), str),
)

In [50]:
# Score every row of `df` through the service and collect the four result
# fields in parallel lists (one entry per row, in iteration order).
all_title_fake_score = []
all_title_pred_type = []
all_content_fake_score = []
all_content_pred_type = []
i = 0  # keeps `i` defined (as 0) even if `df` has no rows
for i, (_, row) in enumerate(df.iterrows(), start=1):
    text = row['text']
    # pandas represents a missing text cell as float NaN even with dtype=str,
    # and slicing a float raises TypeError. Fall back to a title-only request
    # (content=None), which both helper functions already support.
    # Only the first 50 characters of the body are submitted.
    content = text[:50] if isinstance(text, str) else None
    json_str = get_json_str(row['title'], content)
    title_fake_score, title_pred_type, content_fake_score, content_pred_type = parse_json_str(json_str, content)
    all_title_fake_score.append(title_fake_score)
    all_title_pred_type.append(title_pred_type)
    all_content_fake_score.append(content_fake_score)
    all_content_pred_type.append(content_pred_type)
    # Progress report every 50 rows.
    if i % 50 == 0:
        print("Iteration = ", i)

Iteration =  50
Iteration =  100
Iteration =  150
Iteration =  200
Iteration =  250
Iteration =  300
Iteration =  350
Iteration =  400
Iteration =  450


In [51]:
# Attach the per-row results gathered by the scoring loop as new columns of `df`.
for column_name, column_values in (
    ('title_fake_score', all_title_fake_score),
    ('title_pred_type', all_title_pred_type),
    ('content_fake_score', all_content_fake_score),
    ('content_pred_type', all_content_pred_type),
):
    df[column_name] = column_values

In [52]:
# Persist the frame, including the newly added score columns, next to the input data.
df.to_csv(path_or_buf='data/filtered_fake_labelled.csv')

## Validation

In [54]:
# Load the labelled validation set, reading title and text as strings.
test_df = pd.read_csv(
    filepath_or_buffer='data/real_or_fake.csv',
    dtype=dict.fromkeys(('title', 'text'), str),
)

In [56]:
# Number of rows in the validation set.
test_df.shape[0]

6335

In [55]:
# Preview the first five rows (head() defaults to n=5).
test_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [114]:
# Keep only the first 3000 rows for validation. `.copy()` makes the result an
# independent frame: later cells add score columns to `test_df`, and doing that
# on a bare slice of the original triggers pandas' SettingWithCopyWarning.
test_df = test_df[:3000].copy()

In [None]:
# Score every row of `test_df` through the service and collect the four result
# fields in parallel lists (one entry per row, in iteration order).
all_title_fake_score = []
all_title_pred_type = []
all_content_fake_score = []
all_content_pred_type = []
i = 0  # keeps `i` defined (as 0) even if `test_df` has no rows
for i, (_, row) in enumerate(test_df.iterrows(), start=1):
    text = row['text']
    # pandas represents a missing text cell as float NaN even with dtype=str,
    # and slicing a float raises TypeError. Fall back to a title-only request
    # (content=None), which both helper functions already support.
    # At most the first 10000 characters of the body are submitted.
    content = text[:10000] if isinstance(text, str) else None
    json_str = get_json_str(row['title'], content)
    title_fake_score, title_pred_type, content_fake_score, content_pred_type = parse_json_str(json_str, content)
    all_title_fake_score.append(title_fake_score)
    all_title_pred_type.append(title_pred_type)
    all_content_fake_score.append(content_fake_score)
    all_content_pred_type.append(content_pred_type)
    # Progress report every 1000 rows.
    if i % 1000 == 0:
        print("Iteration = ", i)

In [None]:
# Attach the per-row results gathered by the scoring loop as new columns of `test_df`.
for column_name, column_values in (
    ('title_fake_score', all_title_fake_score),
    ('title_pred_type', all_title_pred_type),
    ('content_fake_score', all_content_fake_score),
    ('content_pred_type', all_content_pred_type),
):
    test_df[column_name] = column_values

In [None]:
# Persist the validation frame, including the newly added score columns.
test_df.to_csv(path_or_buf='data/real_or_fake_labelled.csv')

In [None]:
# Display the validation frame as this cell's output (now including the score columns).
test_df

In [None]:
# Bar chart of how many rows of `df` fall under each value of its 'type' column.
df['type'].value_counts().plot.bar()

In [None]:
# Feature matrix: one row per article, columns = (title score, content score).
feature_columns = ('title_fake_score', 'content_fake_score')
X = np.zeros((len(test_df), len(feature_columns)))
for col_idx, col_name in enumerate(feature_columns):
    X[:, col_idx] = test_df[col_name].values

In [None]:
# Target vector: +1 for rows labelled 'FAKE', -1 for every other label.
# Vectorized comparison replaces the per-element Python loop; the result is
# still a 1-D float array with one entry per row of `test_df`.
y = np.where((test_df['label'] == 'FAKE').values, 1.0, -1.0)

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Fit an SVC with default settings and report its accuracy on the held-out rows, in percent.
clf = SVC().fit(X_train, y_train)
y_predict = clf.predict(X_test)
n_correct = (y_predict == y_test).sum()
n_correct / len(y_test) * 100

In [None]:
# Fit a LogisticRegression with default settings and report its accuracy on the held-out rows, in percent.
clf = LogisticRegression().fit(X_train, y_train)
y_predict = clf.predict(X_test)
n_correct = (y_predict == y_test).sum()
n_correct / len(y_test) * 100

In [None]:
# Show the (rows, columns) shape of the training feature matrix.
X_train.shape