In [1]:
# Colab-only Google Drive mount, disabled by wrapping it in a string literal.
# The cell therefore only evaluates to that string (which is what gets echoed
# as the cell output); nothing is imported or mounted.
"""
from google.colab import drive
drive.mount('/content/drive')
"""

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [2]:
# Kill the warnings:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Main Dependencies:
import numpy as np # linear algebra
import pandas as pd # data processing

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

import config

import nltk
from textblob import TextBlob, Word

# Statistics imports:
from statistics import mean
import scipy.stats
from scipy.stats import pearsonr,spearmanr
#!python -m spacy download en_core_web_sm
#!python -m textblob.download_corpora

In [4]:
# Load the two corpora and tag each row with its source:
# rows read from config.true_url get class 1, rows from config.false_url get class 0.
true_df = pd.read_csv(config.true_url).assign(**{'class': 1})
false_df = pd.read_csv(config.false_url).assign(**{'class': 0})

# Stack both labelled frames into one dataset (original row indices are kept).
fake_news_df = pd.concat([true_df, false_df])

In [5]:
# The data is already highly structured; preview the first five rows.
fake_news_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,class
0,0,The head of a conservative Republican faction ...,1
1,1,Transgender people will be allowed for the fir...,1
2,2,The special counsel investigation of links bet...,1
3,3,Trump campaign adviser George Papadopoulos tol...,1
4,4,President Donald Trump called on the U.S. Post...,1


In [6]:
# Work on a reproducible random subset of 10k texts (fixed seed),
# renumbered from zero and reduced to the text and its label.
fake_news_df = (
    fake_news_df
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
    .loc[:, ['text', 'class']]
)

In [7]:
# Emotion lexicon: a vocabulary that maps words to the emotions they span.
# Note: some entries are phrases fused into a single token, so matching them
# properly would require bigrams and trigrams.
emData = pd.read_csv(
    config.emData_url,
    sep='\t',
    lineterminator='\n',
)
# Word table, indexed by the first column of the spreadsheet.
wordsData = pd.read_excel(config.wordsData_url, index_col=0)

List of emotions:

In [8]:
# Distinct emotion labels in order of first appearance, title-cased.
Emotions = [label.title() for label in emData['emotion'].drop_duplicates().tolist()]

In [9]:
# Restrict the word table to the 'English Word' column (when present) and the
# emotion columns; presumably the emotion columns are one-hot flags -- confirm.
wordsData = wordsData[wordsData.columns.intersection(['English Word', *Emotions])]

In [10]:
# One-off corpus downloads, disabled by wrapping them in a string literal.
# The cell only evaluates to that string; nothing is downloaded when it runs.
# The `!python ...` line is IPython shell syntax, so it only works in a notebook cell.
"""
# This shall be considered on production server
!python -m textblob.download_corpora
nltk.download('omw-1.4')
"""

"\n# This shall be considered on production server\n!python -m textblob.download_corpora\nnltk.download('omw-1.4')\n"

--------------------------------------- Code Phase -------------------------------------------

In [11]:
# Connectivity probe. A power-down is treated the same as a connection loss.
from urllib.request import urlopen
def internet_is_on(url="http://www.google.com/", timeout=1):
    """Return True when ``url`` can be opened within ``timeout`` seconds.

    Args:
        url: Address to probe; defaults to the Google home page.
        timeout: Seconds to wait before giving up.

    Returns:
        True if the request succeeded, False if it failed with an ``OSError``.
        ``urllib.error.URLError`` (DNS failure, refused connection, HTTP error
        status) and socket timeouts are all ``OSError`` subclasses, so they
        all count as "offline".
    """
    try:
        # The context manager closes the response so the socket is not leaked.
        with urlopen(url, timeout=timeout):
            return True
    except OSError:
        # The previous handler named ``urllib.URLError``, but ``urllib`` was never
        # imported, so every failed probe raised NameError instead of returning False.
        return False

In [12]:
# Timing decorator used by the feature builders below
import functools
import time

def timer(func):
    """Wrap ``func`` so that each successful call prints its wall-clock duration.

    The wrapped function's return value is passed through unchanged, and its
    metadata (name, docstring) is preserved via ``functools.wraps``.
    """
    @functools.wraps(func)
    def timed_call(*args, **kwargs):
        started = time.perf_counter()
        value = func(*args, **kwargs)
        run_time = time.perf_counter() - started
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value
    return timed_call

In [13]:
@timer
def feature_wordsCount(df_row, Sentence, df):
    """Store the ratio of distinct words to total words of ``Sentence``.

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing a ``words`` sequence
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place; the value is written to the
            ``'uniqe_words_ratio'`` column (spelling kept, it is the column key).
    """
    words = Sentence.words
    totalWords = len(words)
    # A text without any words used to raise ZeroDivisionError; record 0 instead.
    ratio = len(set(words)) / totalWords if totalWords else 0
    df.at[df_row, 'uniqe_words_ratio'] = ratio

In [14]:
@timer
def feature_nounPolarity(df_row, Sentence, df):
    """Store the mean sentiment polarity of the noun phrases of ``Sentence``.

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``noun_phrases``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place (column ``'nounPolarity'``).
    """
    polarities = [TextBlob(nounS).sentiment.polarity for nounS in Sentence.noun_phrases]
    # ``mean`` rejects an empty list, so a text without noun phrases gets 0.
    # Testing for emptiness explicitly (rather than a bare ``except``) lets
    # genuine errors and Ctrl-C propagate instead of being recorded as 0.
    df.at[df_row, 'nounPolarity'] = mean(polarities) if polarities else 0

In [15]:
@timer
def feature_nounSubjectivity(df_row, Sentence, df):
    """Store the mean sentiment subjectivity of the noun phrases of ``Sentence``.

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``noun_phrases``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place (column ``'nounSubjectivity'``).
    """
    subjectivities = [TextBlob(nounS).sentiment.subjectivity for nounS in Sentence.noun_phrases]
    # ``mean`` rejects an empty list, so a text without noun phrases gets 0.
    # Testing for emptiness explicitly (rather than a bare ``except``) lets
    # genuine errors and Ctrl-C propagate instead of being recorded as 0.
    df.at[df_row, 'nounSubjectivity'] = mean(subjectivities) if subjectivities else 0

In [16]:
@timer
def feature_sentenceSentiment(df_row, Sentence, df):
    """Store whole-text and per-sentence sentiment scores of ``Sentence``.

    Writes four columns for row ``df_row``: ``'sentencePolarity'`` and
    ``'sentenceSubjectivity'`` (sentiment of the whole text), plus
    ``'meanPolarity_per_sentence'`` and ``'meanSubjetivity_per_sentence'``
    (averages over the individual sentences; spelling kept, it is the column key).

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``sentiment`` and ``sentences``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place.
    """
    polarity, subjectivity = Sentence.sentiment
    df.at[df_row, 'sentencePolarity'] = polarity
    df.at[df_row, 'sentenceSubjectivity'] = subjectivity

    per_sentence_polarity = [sentence.polarity for sentence in Sentence.sentences]
    per_sentence_subjectivity = [sentence.subjectivity for sentence in Sentence.sentences]
    # ``mean`` raises StatisticsError on an empty list; a text with no sentences gets 0.
    df.at[df_row, 'meanPolarity_per_sentence'] = (
        mean(per_sentence_polarity) if per_sentence_polarity else 0
    )
    df.at[df_row, 'meanSubjetivity_per_sentence'] = (
        mean(per_sentence_subjectivity) if per_sentence_subjectivity else 0
    )

In [17]:
@timer
def feature_Emotions(df_row, Sentence, df):
    """Count, per emotion, the words of ``Sentence`` flagged with that emotion.

    Every column named in the module-level ``Emotions`` list is reset to 0 for
    ``df_row`` and then incremented once for each matching word occurrence.

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``words``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place.
    """
    # Reset the emotion counters of the selected row.
    for emotion in Emotions:
        df.at[df_row, emotion] = 0

    # NOTE(review): membership is tested on the raw token, but the lookup below
    # uses its singularized/lemmatized form -- confirm this order is intended.
    lemmas = [Word(word).singularize().lemmatize()
              for word in Sentence.words if word in wordsData.index]

    for lemma in lemmas:
        matches = wordsData[wordsData.index == lemma]
        if matches.empty:
            # The normalized form is not in the lexicon: no emotional load.
            # (This case used to be signalled by an IndexError caught by a bare
            # ``except``, which also hid every other error and Ctrl-C.)
            continue
        # Columns whose value is 1 in the first matching lexicon row.
        flagged = wordsData.columns[matches.values[0] == 1]
        for emotion in set(flagged.tolist()):
            df.at[df_row, emotion] += 1

In [18]:
def _dominant_frequency(signal, sample_spacing):
    """Return the non-zero frequency with the highest spectral power, rounded to 2 dp.

    Returns 0.0 when ``signal`` has fewer than two samples or carries no power
    outside the zero-frequency (mean) term.
    """
    if signal.size < 2:
        return 0.0
    power = np.abs(np.fft.rfft(signal)) ** 2
    freqs = np.fft.rfftfreq(signal.size, sample_spacing)
    if not power[1:].any():
        return 0.0  # constant signal: nothing oscillates
    # Skip bin 0 (the mean of the signal); it would otherwise dominate.
    peak = 1 + int(np.argmax(power[1:]))
    return round(float(freqs[peak]), 2)


@timer
def frequency_Analysis(df_row, Sentence, df):
    """Store the dominant frequency of per-sentence polarity and subjectivity.

    The per-sentence polarity and subjectivity series are Fourier-transformed
    and the frequency carrying the most power is written to
    ``'MaxPolarityFrequency'`` / ``'MaxSubjectivityFrequency'``.

    Behavior change: the previous code computed both power spectra but never
    used them and stored ``max(fftfreq(...))`` instead. That value depends only
    on the number of sentences and their average length, so it was identical
    for both columns and independent of the sentiment values.

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``sentences``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place.
    """
    sentences = Sentence.sentences
    if not sentences:
        # Nothing to transform; avoids averaging an empty list below.
        df.at[df_row, 'MaxPolarityFrequency'] = 0.0
        df.at[df_row, 'MaxSubjectivityFrequency'] = 0.0
        return

    polarity = np.array([sentence.polarity for sentence in sentences])
    subjectivity = np.array([sentence.subjectivity for sentence in sentences])
    sentence_timing = [len(sentence.words) for sentence in sentences]

    # Sample spacing kept from the original: the reciprocal of the average sentence length.
    time_step = 1 / np.average(sentence_timing)

    df.at[df_row, 'MaxPolarityFrequency'] = _dominant_frequency(polarity, time_step)
    df.at[df_row, 'MaxSubjectivityFrequency'] = _dominant_frequency(subjectivity, time_step)

In [19]:
@timer
def correlation_and_entropy(df_row,Sentence,df):
    """Store polarity/subjectivity correlations and the word entropy of ``Sentence``.

    Writes three columns for row ``df_row``:
      * ``'corrP'``   -- Pearson correlation between per-sentence polarity and subjectivity
      * ``'corrS'``   -- Spearman correlation between the same two series
      * ``'entropy'`` -- ``scipy.stats.entropy`` of the word-occurrence counts

    Each value falls back to 0 when its computation raises (for example when
    there are too few sentences to correlate).

    Args:
        df_row: Index label of the row in ``df`` to update.
        Sentence: Text object exposing ``sentences`` and ``words``
            (``construct_Features`` passes a ``TextBlob``).
        df: DataFrame updated in place.
    """
    data1 = np.array([sentence.polarity for sentence in Sentence.sentences]) # Sentence polarity
    data2 = np.array([sentence.subjectivity for sentence in Sentence.sentences]) # Sentence subjectivity

    # ``except Exception`` keeps the best-effort fallback while letting
    # KeyboardInterrupt/SystemExit through, which the former bare ``except:`` swallowed.
    # NOTE(review): a correlation that comes back as NaN rather than raising is
    # stored as NaN, not 0.
    try:
        corrP, _ = pearsonr(data1, data2)
    except Exception:
        corrP = 0 # less than 2 elements for correlation
    try:
        corrS, _ = spearmanr(data1, data2)
    except Exception:
        corrS = 0 # less than 2 elements for correlation

    # Entropy of the word distribution. The counting now sits inside the guard
    # so a text without words falls back to 0 like the other features.
    try:
        p_data = pd.Series(list(Sentence.words), dtype=object).value_counts()
        entropy = scipy.stats.entropy(p_data)
    except Exception:
        entropy = 0 # No data for entropy calculation

    df.at[df_row,'corrP'] = corrP
    df.at[df_row,'corrS'] = corrS
    df.at[df_row,'entropy'] = entropy

In [20]:
# Export to csv:
def save_current_DF(path = 'Data/',row=0, df=None):
    """Write the rows without missing values to a CSV checkpoint file.

    The file is named ``<path>fakeNews_corrected_features_<row>.csv`` and is
    written as UTF-8 with a byte-order mark, including the index.

    Args:
        path: Prefix prepended verbatim to the file name (include the trailing
            separator when it is a directory).
        row: Row number embedded in the file name.
        df: DataFrame to save; defaults to the module-level ``fake_news_full_df``.
    """
    import os  # local import keeps this cell self-contained

    if df is None:
        df = fake_news_full_df

    filename = path+'fakeNews_corrected_features_'+str(row)+'.csv'

    # Create the target directory when it is missing, so an emergency save
    # cannot fail just because the folder was never created.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # The previous code opened ``path`` (the directory) instead of ``filename``,
    # so the save always failed and ``filename`` was never used.
    df.dropna().to_csv(filename, index=True, encoding='utf-8-sig')

In [21]:
def construct_Features(indexRange,df,correct=True):
    """Compute every text feature for the requested rows of ``df`` in place.

    For each row label in ``indexRange`` the ``'text'`` cell is wrapped in a
    ``TextBlob`` (spell-corrected first when ``correct`` is true) and passed to
    each feature builder. A row that raises is reported and skipped. On rows
    whose number is a multiple of 1000 the connection is probed; if it is down,
    the current state is saved via ``save_current_DF`` and processing stops.

    Args:
        indexRange: Iterable of row labels of ``df`` to process.
        df: DataFrame with a ``'text'`` column; feature columns are added to it.
        correct: Whether to run ``TextBlob.correct()`` on the text first.
    """
    for row in indexRange:
        print(f'Constructing features for row #{row} out of {len(df)}:')
        try:
            # Single label-based lookup instead of chained ``df['text'][row]``.
            text = df.at[row, 'text']
            Sentence = TextBlob(text).correct() if correct else TextBlob(text)

            feature_wordsCount(row, Sentence, df)
            feature_nounPolarity(row, Sentence, df)
            feature_nounSubjectivity(row, Sentence, df)
            feature_sentenceSentiment(row, Sentence, df)
            feature_Emotions(row, Sentence, df)
            frequency_Analysis(row, Sentence, df)
            correlation_and_entropy(row, Sentence, df)

            # Probe the connection only every 1000th row to limit traffic.
            if row % 1000 == 0 and not internet_is_on():
                save_current_DF(path='Data/', row=row)
                break
        except Exception as exc:
            # Include the error itself; the bare message made failures impossible to diagnose.
            print(f'row #{row} contains some bugs, skipping ({exc!r})')

In [22]:
# Create the united dataset.
# Incomplete rows are dropped *before* renumbering: resetting the index first
# left gaps after dropna(), so range(len(df)) hit missing labels and never
# reached the last rows.
fake_news_full_df = (
    pd.concat([true_df, false_df])[['text', 'class']]
    .dropna()
    .reset_index(drop=True)
)
construct_Features(range(len(fake_news_full_df)), fake_news_full_df, correct=True)

Constructing features for row #0 out of 78588:
Finished 'feature_wordsCount' in 0.0186 secs
Finished 'feature_nounPolarity' in 4.6171 secs
Finished 'feature_nounSubjectivity' in 0.0193 secs
Finished 'feature_sentenceSentiment' in 0.0417 secs
Finished 'feature_Emotions' in 2.4130 secs
Finished 'frequency_Analysis' in 0.0124 secs
Finished 'correlation_and_entropy' in 0.0054 secs
Constructing features for row #1 out of 78588:
Finished 'feature_wordsCount' in 0.0051 secs
Finished 'feature_nounPolarity' in 0.0204 secs
Finished 'feature_nounSubjectivity' in 0.0091 secs
Finished 'feature_sentenceSentiment' in 0.0143 secs
Finished 'feature_Emotions' in 0.2050 secs
Finished 'frequency_Analysis' in 0.0071 secs
Finished 'correlation_and_entropy' in 0.0037 secs
Constructing features for row #2 out of 78588:
Finished 'feature_wordsCount' in 0.0057 secs
Finished 'feature_nounPolarity' in 0.0215 secs
Finished 'feature_nounSubjectivity' in 0.0119 secs
Finished 'feature_sentenceSentiment' in 0.0157 sec


KeyboardInterrupt

