In [136]:
import math
import re
from collections import Counter
from operator import itemgetter

import numpy as np
import pandas as pd
import polyglot
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import LinearSVC

In [137]:
# Read the training spreadsheet into a DataFrame; the path is relative to the
# notebook's working directory.
training_set_path = "data/FN_Training_Set.xlsx"
df = pd.read_excel(training_set_path)

In [138]:
# Display the column index of the loaded frame so the column names used by
# the following cells can be checked against the spreadsheet.
df.columns


Index(['fake_news_score', 'click_bait_score', 'Content Title', 'Content Url',
       'Content Published Time', 'Content'],
      dtype='object')

In [139]:
# Row selectors per label. A score of 3 feeds the "click bait" / "fake" lists
# and a score of 1 feeds the "non_" lists; rows holding any other score value
# end up in none of the lists below.
is_click_bait = df['click_bait_score'] == 3
is_not_click_bait = df['click_bait_score'] == 1
is_fake = df['fake_news_score'] == 3
is_not_fake = df['fake_news_score'] == 1

# Article bodies per label.
click_bait_content = df.loc[is_click_bait, 'Content'].tolist()
non_click_bait_content = df.loc[is_not_click_bait, 'Content'].tolist()
fake_content = df.loc[is_fake, 'Content'].tolist()
non_fake_content = df.loc[is_not_fake, 'Content'].tolist()

# Article titles per label.
click_bait_header = df.loc[is_click_bait, 'Content Title'].tolist()
non_click_bait_header = df.loc[is_not_click_bait, 'Content Title'].tolist()
fake_header = df.loc[is_fake, 'Content Title'].tolist()
non_fake_header = df.loc[is_not_fake, 'Content Title'].tolist()
# Character class of everything stripped from text before matching/counting:
# newline, the quotes „ " “, the punctuation . , ! ? : - / _ ( ) …, and the
# non-breaking space. Written as a raw string so the backslashes reach the
# regex engine unchanged instead of relying on invalid string escapes such as
# '\-' or '\(' (a SyntaxWarning on recent Python versions). The set of matched
# characters is the same as before.
regex_clean = r'[\n„".,!?“:\-/_\xa0()…]'


In [140]:
def calculate_pmi(word, all_positive, all_negative):
    """Score how strongly ``word`` is tied to the positive vs. negative documents.

    ``word`` and every document are normalised the same way: characters
    matching the module-level ``regex_clean`` pattern are removed and the
    result is lower-cased. A document counts as containing the word when the
    normalised word is a *substring* of the normalised document, so ``'foo'``
    also matches a document containing ``'food'``.

    Despite the name, the returned scores are not log-based pointwise mutual
    information; they are the shares of matching documents that fall in each
    class.

    Args:
        word: Term to score (a string).
        all_positive: Iterable of positive-class documents; each item is
            passed through ``str`` before cleaning.
        all_negative: Iterable of negative-class documents; each item is
            passed through ``str`` before cleaning.

    Returns:
        ``[normalised_word, positive_share, negative_share]``. The two shares
        sum to 1.0 when the word occurs in at least one document. When it
        occurs in none, both shares are ``0.0`` (previously this case raised
        ``ZeroDivisionError``).
    """
    def _normalise(text):
        # Same cleaning for the query term and for the documents.
        return re.sub(regex_clean, '', text).lower()

    word = _normalise(word)
    positive_hits = sum(word in _normalise(str(doc)) for doc in all_positive)
    negative_hits = sum(word in _normalise(str(doc)) for doc in all_negative)

    total_hits = positive_hits + negative_hits
    if total_hits == 0:
        # The word appears in neither class: there is nothing to apportion.
        return [word, 0.0, 0.0]
    return [word, positive_hits / total_hits, negative_hits / total_hits]
    

In [141]:
# Spot-check a single term. This cell previously passed `click_bait` and
# `non_click_bait`, names that no cell in this notebook defines, so it failed
# with NameError on a fresh kernel.
# NOTE(review): the title lists are assumed here; switch to
# click_bait_content / non_click_bait_content if the article bodies were meant.
p = calculate_pmi('борисов', click_bait_header, non_click_bait_header)
print(p)

['борисов', 0.7407407407407407, 0.25925925925925924]


In [142]:
# Every title and every body, regardless of label. The name `all_conents`
# (sic) is kept as-is because a later cell reads it.
all_headers, all_conents = (
    df[column].tolist() for column in ('Content Title', 'Content')
)

In [143]:
def build_vocabulary(input_text, min_count=20):
    """Count the tokens in ``input_text`` and keep only the frequent ones.

    Each entry is converted with ``str``, stripped of the characters matched
    by the module-level ``regex_clean`` pattern, lower-cased and split on
    whitespace.

    Args:
        input_text: Iterable of documents. Non-string entries are
            stringified, so a missing value such as ``float('nan')``
            contributes the token ``'nan'``.
        min_count: Minimum number of occurrences a token needs in order to
            stay in the vocabulary. Defaults to 20, the previously
            hard-coded threshold.

    Returns:
        A ``collections.Counter`` mapping each retained token to its count.
    """
    counts = Counter()
    for line in input_text:
        cleaned = re.sub(regex_clean, '', str(line)).lower()
        # split() without an argument collapses runs of whitespace, so repeated
        # spaces no longer produce an empty-string token and tabs no longer
        # end up inside a token.
        counts.update(cleaned.split())
    return Counter({token: n for token, n in counts.items() if n >= min_count})
    

In [146]:
# One vocabulary for the titles and one for the article bodies.
header_vocab, content_vocab = [
    build_vocabulary(texts) for texts in (all_headers, all_conents)
]

In [147]:
def _score_vocabulary(vocab, positive_docs, negative_docs):
    """Call calculate_pmi once per term in ``vocab`` for one pair of document lists."""
    return [calculate_pmi(term, positive_docs, negative_docs) for term in vocab]


# Title vocabulary scored against the click-bait and the fake-news labels.
pmi_headers_clickbait = _score_vocabulary(header_vocab, click_bait_header, non_click_bait_header)
pmi_headers_fact = _score_vocabulary(header_vocab, fake_header, non_fake_header)

# Body vocabulary scored against the same two labels.
pmi_content_clickbait = _score_vocabulary(content_vocab, click_bait_content, non_click_bait_content)
pmi_content_fact = _score_vocabulary(content_vocab, fake_content, non_fake_content)


In [161]:
# Dump the body/fake-news scores as tab-separated text: one row per term,
# every field rendered with str().
with open('pmi_content_fact', 'w', encoding='utf8') as out_file:
    out_file.writelines(
        '\t'.join(map(str, row)) + '\n' for row in pmi_content_fact
    )