This notebook covers the modeling phase of the FakeNews project.

An initial model comparison (done with PyCaret) showed good results with ExtraTreesClassifier,
although LGBM (LightGBM) seems to be somewhat better.
Here we train and tune the LGBM model.

In [None]:
# Kill the warnings:
# NOTE: no category or module is given, so every warning raised after this cell
# is suppressed for the rest of the session (library deprecation notices included).
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Main Dependencies:
# Statistics imports (standard library):
from statistics import mean

import numpy as np # linear algebra
import pandas as pd # data processing
import scipy.stats
from scipy.stats import pearsonr,spearmanr
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier

import nltk
from textblob import TextBlob, Word

import matplotlib.pyplot as plt

# Project-local settings:
import config

In [None]:
# some necessary actions:

In [None]:
# The eight emotion labels used as feature columns.
Emotions = ['Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']

# Emotion lexicon: the first spreadsheet column becomes the index, and only the
# 'English Word' and emotion columns that actually exist in the sheet are kept.
wordsData = pd.read_excel(config.wordsData_url, index_col=0)
lexicon_columns = wordsData.columns.intersection(['English Word'] + Emotions)
wordsData = wordsData[lexicon_columns]

--- Feature Constructors:

In [None]:
def feature_wordsCount(df_row, Sentence, df):
    """Store the ratio of unique words to total words for one row.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing a ``words`` sequence (a TextBlob in this notebook).
        df: DataFrame that receives the 'uniqe_words_ratio' value (modified in place).

    A text without any words gets a ratio of 0.0 instead of raising
    ZeroDivisionError.
    """
    words = Sentence.words
    totalWords = len(words)
    uniqueWords = len(set(words))
    # The column name (including its spelling) is kept as-is: it is the key the
    # stored feature table uses.
    df.at[df_row,'uniqe_words_ratio'] = uniqueWords/totalWords if totalWords else 0.0

In [None]:
def feature_nounPolarity(df_row, Sentence, df):
    """Store the mean sentiment polarity of the text's noun phrases.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing ``noun_phrases`` (a TextBlob in this notebook).
        df: DataFrame that receives the 'nounPolarity' value (modified in place).

    When no noun phrases are found the feature is set to 0. Any other failure
    (for example while extracting the noun phrases) is no longer hidden behind
    that fallback and propagates to the caller.
    """
    polarities = [TextBlob(nounS).sentiment.polarity for nounS in Sentence.noun_phrases]
    # mean() rejects an empty list, so the "no nouns" case is handled explicitly.
    df.at[df_row,'nounPolarity'] = mean(polarities) if polarities else 0

In [None]:
def feature_nounSubjectivity(df_row, Sentence, df):
    """Store the mean sentiment subjectivity of the text's noun phrases.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing ``noun_phrases`` (a TextBlob in this notebook).
        df: DataFrame that receives the 'nounSubjectivity' value (modified in place).

    When no noun phrases are found the feature is set to 0. Any other failure
    (for example while extracting the noun phrases) is no longer hidden behind
    that fallback and propagates to the caller.
    """
    subjectivities = [TextBlob(nounS).sentiment.subjectivity for nounS in Sentence.noun_phrases]
    # mean() rejects an empty list, so the "no nouns" case is handled explicitly.
    df.at[df_row,'nounSubjectivity'] = mean(subjectivities) if subjectivities else 0

In [None]:
def feature_sentenceSentiment(df_row, Sentence, df):
    """Write whole-text and per-sentence sentiment features for one row.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing ``sentiment`` (a polarity/subjectivity pair)
            and ``sentences`` (items with ``polarity`` and ``subjectivity``).
        df: DataFrame updated in place.

    Columns written: 'sentencePolarity', 'sentenceSubjectivity',
    'meanPolarity_per_sentence' and 'meanSubjetivity_per_sentence'.
    """
    # Sentiment of the text taken as a whole.
    whole_polarity, whole_subjectivity = Sentence.sentiment
    df.at[df_row, 'sentencePolarity'] = whole_polarity
    df.at[df_row, 'sentenceSubjectivity'] = whole_subjectivity

    # Averages over the individual sentences.
    per_sentence_polarity = [part.polarity for part in Sentence.sentences]
    df.at[df_row, 'meanPolarity_per_sentence'] = mean(per_sentence_polarity)

    per_sentence_subjectivity = [part.subjectivity for part in Sentence.sentences]
    df.at[df_row, 'meanSubjetivity_per_sentence'] = mean(per_sentence_subjectivity)

In [None]:
def feature_Emotions(df_row, Sentence, df):
    """Count, per emotion, how many words of the text carry that emotion.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing a ``words`` sequence (a TextBlob in this notebook).
        df: DataFrame updated in place; one column per entry of ``Emotions``.

    Uses the module-level ``Emotions`` list and the ``wordsData`` lexicon, whose
    index holds the words and whose cells equal 1 where a word carries an emotion.
    """
    # Reset emotions for the selected row
    for emotion in Emotions:
        df.at[df_row,emotion]=0

    # NOTE(review): the membership test uses the raw token, while the lookup below
    # uses its singularized/lemmatized form, so a token can pass the filter and
    # then find no row. Kept as-is so the features stay comparable with the
    # stored feature table - confirm whether this is intended.
    lemmas = [Word(word).singularize().lemmatize() for word in Sentence.words if word in wordsData.index]

    for lemma in lemmas:
        matches = wordsData[wordsData.index == lemma]
        if len(matches) == 0:
            continue # no emotional load for that specific word
        # Only the first matching lexicon row is used.
        first_row_flags = (matches.values == 1)[0]
        for emotion in set(wordsData.columns[first_row_flags].tolist()):
            df.at[df_row,emotion]+=1

In [None]:
def frequency_Analysis(df_row, Sentence, df):
    """Write the 'MaxPolarityFrequency' and 'MaxSubjectivityFrequency' features.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing ``sentences``; each item has ``polarity``,
            ``subjectivity`` and ``words``.
        df: DataFrame updated in place.

    Each feature is the largest FFT bin frequency (rounded to 2 decimals) for a
    signal with one sample per sentence and a sample spacing of
    1 / (average number of words per sentence).
    """
    sentences = Sentence.sentences
    polarity_series = np.array([part.polarity for part in sentences])
    subjectivity_series = np.array([part.subjectivity for part in sentences])
    words_per_sentence = [len(part.words) for part in sentences]

    # NOTE(review): the power spectra are computed but never read afterwards, so
    # the stored values depend only on the sentence count and average sentence
    # length, not on the polarity/subjectivity values. Left unchanged because the
    # stored feature table was presumably built this way - confirm.
    polarity_power = np.abs(np.fft.fft(polarity_series)) ** 2
    subjectivity_power = np.abs(np.fft.fft(subjectivity_series)) ** 2

    sample_spacing = 1 / np.mean(words_per_sentence)
    targets = (
        ('MaxPolarityFrequency', polarity_series),
        ('MaxSubjectivityFrequency', subjectivity_series),
    )
    for column, series in targets:
        bin_frequencies = np.fft.fftfreq(series.size, sample_spacing)
        df.at[df_row, column] = round(max(bin_frequencies), 2)

In [None]:
def correlation_and_entropy(df_row,Sentence,df):
    """Write the 'corrP', 'corrS' and 'entropy' features for one row.

    Parameters:
        df_row: index label of the row in ``df`` to write to.
        Sentence: object exposing ``sentences`` (items with ``polarity`` and
            ``subjectivity``) and ``words``.
        df: DataFrame updated in place.

    'corrP' / 'corrS' are the Pearson / Spearman correlation between the
    per-sentence polarity and subjectivity; 'entropy' is the entropy of the
    word-frequency counts. Each value falls back to 0 when it cannot be computed.
    """
    data1 = np.array([sentence.polarity for sentence in Sentence.sentences]) # Sentence polarity
    data2 = np.array([sentence.subjectivity for sentence in Sentence.sentences]) # Sentence subjectivity

    # `except Exception` (not a bare `except`) keeps the best-effort fallback
    # while letting KeyboardInterrupt/SystemExit stop a long run.
    # Pearson correlation between polarity and subjectivity - Feature
    try:
        corrP, _ = pearsonr(data1, data2)
    except Exception:
        corrP = 0 # less than 2 elements for correlation
    # Spearman correlation between polarity and subjectivity - Feature
    try:
        corrS, _ = spearmanr(data1, data2)
    except Exception:
        corrS = 0 # less than 2 elements for correlation

    # Calculate entropy of words in the sentence
    words = Sentence.words
    if len(words) == 0:
        # Nothing to count; the frequency table below cannot be built reliably
        # from an empty list.
        entropy = 0 # No data for entropy calculation
    else:
        p_data = pd.DataFrame(words).value_counts()
        try:
            entropy = scipy.stats.entropy(p_data)
        except Exception:
            entropy = 0 # No data for entropy calculation

    df.at[df_row,'corrP'] = corrP
    df.at[df_row,'corrS'] = corrS
    df.at[df_row,'entropy'] = entropy

In [None]:
def construct_Features(indexRange,df,correct=True):
    """Compute every engineered feature for the given rows of ``df`` in place.

    Parameters:
        indexRange: iterable of index labels of the rows to process.
        df: DataFrame with a 'text' column; the feature columns are written
            into it by the feature_* helpers.
        correct: when True, the text is spell-corrected with TextBlob.correct()
            before the features are computed.

    A row whose processing fails is reported (with the error) and skipped;
    features written before the failure stay in place. KeyboardInterrupt and
    SystemExit are not swallowed, so a long run can be interrupted.
    """
    for row in indexRange:
        print(f'Constructing features for row #{row} out of {len(df)}:')
        try:
            Sentence = TextBlob(df.at[row, 'text'])
            if correct:
                Sentence = Sentence.correct()

            feature_wordsCount(row, Sentence, df)
            feature_nounPolarity(row, Sentence, df)
            feature_nounSubjectivity(row, Sentence, df)
            feature_sentenceSentiment(row, Sentence, df)
            feature_Emotions(row, Sentence, df)
            frequency_Analysis(row, Sentence, df)
            correlation_and_entropy(row, Sentence, df)
        except Exception as error:
            # Show what went wrong instead of hiding it behind a generic message.
            print(f'row #{row} contains some bugs, skipping ({type(error).__name__}: {error})')

In [None]:
# Load the pre-computed feature table and discard the 'Unnamed: 0' column
# stored in the CSV.
df = pd.read_csv('Data/fake_news_features_non_corrected.csv')
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Keep only the features and the target values.
# errors='ignore' makes this cell safe to re-run: on a second run 'text' is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['text'], errors='ignore')

In [None]:
# Target: the 'class' column; features: every remaining column.
y = df['class']
X = df.drop(columns=['class'])

In [None]:
# LightGBM classifier with fixed hyper-parameters (presumably exported from the
# earlier PyCaret run - confirm).
# The original call set bagging_fraction/bagging_freq/feature_fraction AND their
# aliases subsample/subsample_freq/colsample_bytree to conflicting values;
# LightGBM keeps the primary name and ignores the alias. The values that took
# effect are now passed once, through the scikit-learn parameter names.
# `silent` was dropped: 'warn' was only the deprecation placeholder and newer
# LightGBM releases no longer accept the argument.
model = LGBMClassifier(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.4,   # previously given as feature_fraction=0.4
    importance_type='split',
    learning_rate=0.3,
    max_depth=-1,
    min_child_samples=61,
    min_child_weight=0.001,
    min_split_gain=0.1,
    n_estimators=190,
    n_jobs=-1,
    num_leaves=20,
    objective=None,
    random_state=786,
    reg_alpha=1e-06,
    reg_lambda=5,
    subsample=0.6,          # previously given as bagging_fraction=0.6
    subsample_for_bin=200000,
    subsample_freq=1,       # previously given as bagging_freq=1
)
model.fit(X, y)

In [None]:
# Accuracy on the rows the model was fitted on only shows how well it memorised
# them, so it is reported next to a 5-fold cross-validated accuracy.
# cross_val_score fits clones of `model`; the fitted `model` itself is untouched.
train_accuracy = model.score(X,y)
cv_accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy')
{'train_accuracy': train_accuracy,
 'cv_accuracy_mean': cv_accuracy.mean(),
 'cv_accuracy_std': cv_accuracy.std()}

In [None]:
# Sample text used below as the "true news" example (expected prediction: 1).
TrueText = "Snapchat could become more popular with advertisers than Twitter  Yahoo and AOL within three years  with the messaging app company forecast to be bring in revenues of more than $3bn (£2.4bn) a year by the end of 2019. That bullish forecast is based on advertisers targeting the hard-to-reach youth audience that Snapchat has seemingly cornered. More than half (51%) of video users on the app are under 24  compared with 23% for Facebook and 17% for Google's YouTube (17%)  according to Ampere Analysis. Brands are also keen to see a true rival emerge to challenge Facebook and Google  which have recently come in for heavy criticism for their advertising practices. The two web giants currently account for 58% of the $141bn global mobile ad market."

In [None]:
# Sample text used below as the "fake news" example (expected prediction: 0).
FakeText = "Yahoo and AOL could be extremely popular over the next decade and revenues are expected to skyrocket by 2020.  This forecast is based on the advertisers that target a younger audience.  Half of the users are under the age of 30 compared to facebook and google which cover the older market, as per the recent analysis posting by the Washington Post.  Facebook and google will be challenged.  The current advertising practices have received extreme criticism, the web giants currently hold a 50% stake in the global ad market and are currently seeing a small decline in their users."

In [None]:
# Empty frame holding the raw text plus every engineered feature column.
testDF = pd.DataFrame(columns=(
    ['text']
    # word-level and sentiment features
    + ['uniqe_words_ratio', 'nounPolarity', 'nounSubjectivity',
       'sentencePolarity', 'sentenceSubjectivity',
       'meanPolarity_per_sentence', 'meanSubjetivity_per_sentence']
    # emotion counters
    + ['Anger', 'Anticipation', 'Disgust', 'Fear',
       'Joy', 'Sadness', 'Surprise', 'Trust']
    # frequency, correlation and entropy features
    + ['MaxPolarityFrequency', 'MaxSubjectivityFrequency',
       'corrP', 'corrS', 'entropy']
))

In [None]:
# Rebuild the frame first: construct_Features skips a row on error, and without
# a reset the feature values of a previously processed text would stay in row 0
# and be scored as if they belonged to this text.
testDF = pd.DataFrame(columns=testDF.columns)
testDF.at[0,'text'] = TrueText
# NOTE(review): correct=True spell-corrects the text, while the training table is
# named '..._non_corrected' - confirm both use the same preprocessing.
construct_Features(range(1),testDF,correct=True)

In [None]:
# Display the single-row frame with the freshly computed features.
testDF

In [None]:
# Expected result: 1 (true news).
features_only = testDF.drop(columns=['text']).astype(float)
model.predict(features_only)[0]

In [None]:
# Rebuild the frame first: construct_Features skips a row on error, and without
# a reset the feature values of the previously processed text would stay in
# row 0 and be scored as if they belonged to this text.
testDF = pd.DataFrame(columns=testDF.columns)
testDF.at[0,'text'] = FakeText
# NOTE(review): correct=True spell-corrects the text, while the training table is
# named '..._non_corrected' - confirm both use the same preprocessing.
construct_Features(range(1),testDF,correct=True)

In [None]:
# Expected result: 0 (fake news).
features_only = testDF.drop(columns=['text']).astype(float)
model.predict(features_only)[0]

In [None]:
# Persist the fitted model to "model.pkl" in the working directory.
import joblib
joblib.dump(value=model, filename="model.pkl")

In [None]:
# Restore the model from "model.pkl".
# joblib files are pickle-based: only load files you created or fully trust.
import joblib
model = joblib.load(filename="model.pkl")