#Libraries 

In [154]:
# !pip install wordcloud
# !pip install nltk
#! pip install text2emotion
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

import pandas as pd 
import numpy as np 
import time

#For Text Cleaning
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

#For EDA
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

#For text vectorizing
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from gensim import corpora
from gensim import models
import random

#For Naive Bayes DF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix   

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


# Emotion Analysis 
import text2emotion as te

# pos tagging 
from collections import Counter

# plotting
import seaborn as sn
import matplotlib.pyplot as plt

# VADER generation 
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


##Required Functions 

In [156]:
# for preprocessing

# Function to clean the compiled dataframe
def preprocess(review):
    """Remove English stop words from *review* and stem the remaining tokens.

    The text is split with ``word_tokenize``; tokens found in the module-level
    ``stop_words`` collection are dropped, and every surviving token is
    lower-cased and stemmed with the module-level ``stemmer``.  Both globals
    must be defined before this function is called.

    Returns:
        The processed tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(review):
        lowered = token.lower()
        # Test the lower-cased form: the NLTK stop-word list is lower-case, so
        # checking the raw token let capitalized stop words ("The") through.
        if lowered not in stop_words:
            kept.append(stemmer.stem(lowered))
    return " ".join(kept)

def emotion_detection(sents):
    """Return the emotion scores that text2emotion assigns to *sents*.

    Thin wrapper: the text is handed to ``te.get_emotion`` and the result of
    that call is passed back unchanged.
    """
    return te.get_emotion(sents)
    

# Generating the Emotions 
def generate_emotions(news_dup):
    """Attach text2emotion scores to the rows of *news_dup*.

    For every row, the value in the third column is scored with
    ``emotion_detection``.  The scores are stored together with that text
    (under ``'text'``) and the value of the fourth column (under ``'label'``).
    The collected records are then left-merged onto *news_dup* on the
    ``'text'`` column and the ``'index'`` column is dropped from the result.

    *news_dup* must therefore contain ``'text'`` and ``'index'`` columns.
    Rows that share the same text match every record with that text, so the
    merged frame can contain repeated rows.

    Returns:
        The merged DataFrame.
    """
    emotion_records = []
    for _, row in news_dup.iterrows():
        # Positional access goes through .iloc: plain row[2] on a
        # label-indexed Series relies on a deprecated positional fallback.
        text = row.iloc[2]
        record = emotion_detection(text)
        record['text'] = text
        record['label'] = row.iloc[3]
        emotion_records.append(record)

    emotion_df = pd.DataFrame(emotion_records)
    horizontal_stack = news_dup.merge(emotion_df, how='left', on='text')
    horizontal_stack.drop(['index'], inplace=True, axis=1)

    return horizontal_stack
        

def generate_pos_tag_dist(df):
    """Build a per-document table of part-of-speech tag counts.

    Each entry of ``df['text']`` is tokenized and POS-tagged with NLTK and the
    tags are counted per document.  A tag column is kept only when the share
    of documents lacking that tag is below 0.7; remaining gaps are filled
    with 0.  ``reset_index`` adds the row position as an ``'index'`` column in
    front of the tag counts.
    """
    tag_counts = [
        Counter(tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text)))
        for text in df['text']
    ]

    # One row per document, one column per tag; absent tags come out as NaN.
    tag_table = pd.DataFrame.from_records(tag_counts)

    # Keep the tags whose missing fraction is below 0.7, then zero-fill.
    missing_share = tag_table.isnull().mean()
    frequent_tags = tag_table.columns[missing_share < 0.7]
    frequent_table = tag_table.loc[:, frequent_tags].reset_index()
    frequent_table.fillna(0, inplace=True)

    return frequent_table

# generating Vader https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664
def vader_score_generation(emo_pos_df):
    """Append VADER polarity scores computed from the ``'text'`` column.

    Every text is scored with ``SentimentIntensityAnalyzer.polarity_scores``;
    the resulting dicts are expanded into one column per score and joined back
    to *emo_pos_df* through the ``'index'`` column, which the input frame must
    already contain.
    """
    analyzer = SentimentIntensityAnalyzer()

    # One dict of polarity scores per document.
    score_dicts = emo_pos_df['text'].apply(analyzer.polarity_scores)

    # Expand the dicts into columns and expose the row labels as 'index'.
    score_table = score_dicts.apply(pd.Series).reset_index()

    return emo_pos_df.merge(score_table, left_on='index', right_on='index')

def zerolistmaker(n):
    """Return a new list holding *n* zeros."""
    return [0] * n

def group_pos_features(pos_tagged_df):
    """Collapse individual POS-tag count columns into grouped feature columns.

    The grouped columns are added to *pos_tagged_df* in place:

        group_c = CC + CD          group_m = MD
        group_d = DT               group_n = NN + NNS
        group_f = FW               group_r = RBR + RB
        group_i = IN               group_v = VB + VBD + VBN + VBP + VBZ
        group_j = JJ + JJR + JJS

    A tag that has no column in the frame contributes zeros.  Other tag
    families (EX, PDT/POS/PP, TO, UH, WDT/WP/WP$/WRB) are deliberately not
    grouped here; the original author's note says they were absent or too
    sparse in this data set.

    Returns:
        A tuple ``(pos_tagged_df, present_tags)`` where ``present_tags`` lists
        the grouped tags that did exist as columns, so the caller can drop
        the raw tag columns afterwards.
    """
    tag_groups = {
        'group_c': ('CC', 'CD'),
        'group_d': ('DT',),
        'group_f': ('FW',),
        'group_i': ('IN',),
        'group_j': ('JJ', 'JJR', 'JJS'),
        'group_m': ('MD',),
        'group_n': ('NN', 'NNS'),
        'group_r': ('RBR', 'RB'),
        'group_v': ('VB', 'VBD', 'VBN', 'VBP', 'VBZ'),
    }

    # First pass: read every tag column (or a zero stand-in) before any
    # group column is written.
    row_count = len(pos_tagged_df)
    present_tags = []
    tag_counts = {}
    for members in tag_groups.values():
        for tag in members:
            if tag in pos_tagged_df.columns:
                tag_counts[tag] = np.array(list(pos_tagged_df[tag]))
                present_tags.append(tag)
            else:
                tag_counts[tag] = [0] * row_count

    # Second pass: single-tag groups copy the counts, multi-tag groups add
    # their members element-wise.
    for group_name, members in tag_groups.items():
        if len(members) == 1:
            pos_tagged_df[group_name] = tag_counts[members[0]]
        else:
            stacked = [tag_counts[tag] for tag in members]
            pos_tagged_df[group_name] = np.sum(stacked, axis=0)

    return pos_tagged_df, present_tags


  

Data Preprocessing and Cleaning 

In [176]:
# Data Reading and Cleaning 
maindf = pd.read_excel('sample_data/other_domain_text.xlsx')
# Globals read by preprocess(): the English stop-word set and the stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Remove stop words and stem the 'text' column, one row at a time, via preprocess().
# NOTE(review): the original note on this step read "takes very long do not run",
# yet the line is live -- expect this cell to be slow.
maindf['text'] = maindf.apply(lambda x: preprocess(x['text']), axis=1)

# Export the cleaned data to an Excel file (row index not written).
maindf.to_excel(r'sample_data/next_100.xlsx', index = False, header=True)



#Feature Engineering 

##Emotion Analysis 

In [177]:
# Load the cleaned articles written by the preprocessing cell.
news  = pd.read_excel('sample_data/next_100.xlsx')
# Work on a copy so `news` itself is left untouched.
news_dup = news.copy()
# Score every article with text2emotion; the scores are merged back on 'text'.
emp_df = generate_emotions(news_dup)
# Remove rows that are exact duplicates of another row.
emp_df.drop_duplicates(inplace=True)
# The row index is written too (to_excel default).
emp_df.to_excel('sample_data/emotion_analysis_final.xlsx')

##Pos Tagging and Vader Text 

In [178]:
# Reload the emotion-scored articles saved by the previous cell.
emp_df = pd.read_excel('sample_data/emotion_analysis_final.xlsx')

# Generating the POS tag distribution: one row per article, keyed by an
# 'index' column holding the row position.
emo_pos_df = generate_pos_tag_dist(emp_df)
# Build a matching 'index' key on emp_df:
#   1) reset_index() inserts the current row labels as a column,
#   2) every column whose name matches the regex 'index' is dropped again,
#   3) whatever column is first afterwards is renamed to 'index'
#      (presumably the 'Unnamed: 0' column written by to_excel -- TODO confirm).
emp_df = emp_df.reset_index()
emp_df = emp_df[emp_df.columns.drop(list(emp_df.filter(regex='index')))]
emp_df = emp_df.rename(columns={emp_df.columns[0]: 'index'})
# Inner-join the article features with the POS counts on that key.
comb_emo_pos_dist =  emp_df.merge(emo_pos_df, left_on='index', right_on='index')

# Generating the VADER polarity scores and saving the combined feature table.
emo_pos_dist_pscore = vader_score_generation(comb_emo_pos_dist)
emo_pos_dist_pscore.to_excel('sample_data/emo_pos_dist_pscore.xlsx')


##POS Grouping analysis 

In [179]:
# Generating of Pos Grouping Features 
emotional_df_stack = pd.read_excel('sample_data/emo_pos_dist_pscore.xlsx')
del emotional_df_stack['compound']

emotional_df_stack_groupped_pos, pos_tags = group_pos_features(emotional_df_stack)
emotional_df_stack_groupped_pos.drop(columns=pos_tags, inplace=True)
emotional_df_stack_groupped_pos = emotional_df_stack_groupped_pos.loc[:, (emotional_df_stack_groupped_pos != 0).any(axis=0)]
emotional_df_stack_groupped_pos.to_excel('sample_data/pos_groupings_vader_emotion.xlsx')




## Entity Recognition Features

In [180]:
# sys: progress output; ast: parsing the stringified entity dicts.
import sys
import json
import ast
# NLTK resources used by tokenizing, POS tagging and ne_chunk below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# Independent copy of the grouped feature table.
# NOTE(review): this name is not referenced again in the visible notebook.
emo_posgroup_vader_er = emotional_df_stack_groupped_pos.copy()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [181]:
def getEntityDic(df):
    """Count the named entities found across all articles of *df*.

    For every row, ``'text'`` and ``'title'`` are concatenated, split into
    sentence-like pieces on ". " (after turning " \\n" into ". "), and each
    piece is tokenized, POS-tagged and chunked with NLTK.  Entities returned
    by ``extract_ne_from_tree`` are lower-cased and tallied per entity label.

    Progress is written to stdout while the frame is processed.

    Returns:
        A nested dict ``{label: {entity_name: occurrences}}``.
    """
    entity_counts = {}
    total_records = df.shape[0]

    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text = row['text'] + " "+ row['title']
        sys.stdout.write('\rCompletion progress: ' + str(position) + ' of ' + str(total_records) + " articles")
        sys.stdout.flush()

        # Give every piece a trailing full stop; " \n" counts as a sentence break.
        pieces = text.replace(" \n", ". ").split(". ")
        sentences = [piece + "." for piece in pieces]

        for sentence in sentences:
            tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
            chunk_tree = nltk.ne_chunk(tagged_tokens)

            # Tally each entity under its label, case-insensitively.
            for found in extract_ne_from_tree(chunk_tree):
                per_label = entity_counts.setdefault(found[0], {})
                entity_name = found[1].lower()
                per_label[entity_name] = per_label.get(entity_name, 0) + 1

    return entity_counts

In [182]:
def extract_ne_from_tree(tree):
    """Collect PERSON, ORGANIZATION and GPE entities from a chunk tree.

    Every subtree of *tree* is inspected; for the three labels of interest the
    first element of each leaf is joined with single spaces to form the
    entity text.

    Returns:
        A list of ``(label, entity_text)`` tuples in subtree order.
    """
    wanted_labels = ('PERSON', 'ORGANIZATION', 'GPE')
    entities = []
    for subtree in tree.subtrees():
        label = subtree.label()
        if label not in wanted_labels:
            continue
        words = [leaf[0] for leaf in subtree.leaves()]
        entities.append((label, ' '.join(words)))
    return entities

In [183]:
def addEntitiesToDataframe2(df):
    """Annotate every article of *df* with its named entities and their counts.

    For each row, ``'text'`` and ``'title'`` are concatenated, split into
    sentence-like pieces on ". " (after turning " \\n" into ". "), and each
    piece is tokenized, POS-tagged and chunked with NLTK.  The PERSON,
    ORGANIZATION and GPE entities returned by ``extract_ne_from_tree`` are
    lower-cased and counted per article, then written to six columns:

        'person entities', 'organisation entities', 'location entities'
            -- ``str()`` of the ``{entity: count}`` dict for that article
        'person entity count', 'organisation entity count',
        'location entity count'
            -- total number of entity mentions of that type

    *df* is modified in place and also returned.  Pass an independent frame
    (e.g. ``subset.copy()``) to avoid pandas' SettingWithCopyWarning.
    Progress is written to stdout while the frame is processed.
    """
    # (entity label, text column, count column), in column-creation order.
    targets = (
        ('PERSON', 'person entities', 'person entity count'),
        ('ORGANIZATION', 'organisation entities', 'organisation entity count'),
        ('GPE', 'location entities', 'location entity count'),
    )
    total_records = df.shape[0]

    for position, (index, row) in enumerate(df.iterrows(), start=1):
        entity_dic = {label: {} for label, _, _ in targets}
        text = row['text'] + " " + row['title']
        sys.stdout.write('\rCompletion progress: ' + str(position) + ' of ' + str(total_records) + " articles")
        sys.stdout.flush()

        # Give every piece a trailing full stop; " \n" counts as a sentence break.
        sentences = [piece + "." for piece in text.replace(" \n", ". ").split(". ")]

        for sentence in sentences:
            tagged_sent = nltk.pos_tag(word_tokenize(sentence))
            ne_tree = nltk.ne_chunk(tagged_sent)
            for label, name in extract_ne_from_tree(ne_tree):
                per_label = entity_dic.setdefault(label, {})
                key = name.lower()
                per_label[key] = per_label.get(key, 0) + 1

        # Write through the row label supplied by iterrows().  The previous
        # version looked the label up in a global list and used the inclusive
        # slice `ind:ind+1`, which also touched the following row and only
        # worked for a sorted integer index.
        for label, text_column, _ in targets:
            # Plain dict repr, so it can be read back with ast.literal_eval.
            df.loc[index, text_column] = str(entity_dic[label])
        for label, _, count_column in targets:
            df.loc[index, count_column] = sum(entity_dic[label].values())

    return df

In [184]:
def attachTopEntities(df, fakeTop, realTop):
    """Add one zero-filled column to *df* per top entity.

    Column names have the form ``"(<entity type>)<entity name>"``.  Entities
    from *realTop* come first, followed by the entities from *fakeTop* that
    are not already present.

    Args:
        df: DataFrame to extend; it is modified in place.
        fakeTop: ``{entity type: [entity names]}`` for fake articles.
        realTop: ``{entity type: [entity names]}`` for real articles.

    Returns:
        A tuple ``(df, store)`` where ``store`` is the ordered list of the
        column names that were added.
    """
    store = []
    seen = set()
    # Use the realTop argument: the body used to read a global named
    # `trueTop`, ignoring the parameter (or raising NameError).
    for top_entities in (realTop, fakeTop):
        for entity_type, names in top_entities.items():
            for name in names:
                column = "(" + str(entity_type) + ")" + name
                if column not in seen:
                    seen.add(column)
                    store.append(column)

    for column in store:
        df[column] = 0

    return df, store

In [185]:
def getTopEntityDic(entity_dic, n):
    """Return the *n* most frequent entities of every entity type.

    Args:
        entity_dic: nested dict ``{entity type: {entity name: count}}``.
        n: maximum number of entities to keep per type.

    Returns:
        ``{entity type: [entity names]}`` with the names ordered by
        descending count (ties keep their original order).  A type with fewer
        than *n* entities yields all of them.

    The ranking of each type is also printed.
    """
    print()
    bigstore = {}
    limit = max(n, 0)
    for entity_type, counts in entity_dic.items():
        print(entity_type)
        print("----------------")
        # Rank each type on its own counts only.  The ranking used to be
        # accumulated in one dict shared by all types, so entities of earlier
        # types leaked into later ones, and types with fewer than n entities
        # came back empty.
        ranked = sorted(counts, key=counts.get, reverse=True)[:limit]
        for name in ranked:
            print(name.ljust(40), ":" + str(counts[name]))
        bigstore[entity_type] = [str(name) for name in ranked]
        print()

    return bigstore

In [186]:
def getFinalEntityDF(finalEntitydf, store):
    """Fill the top-entity columns with each article's entity counts.

    The ``'person entities'``, ``'location entities'`` and
    ``'organisation entities'`` columns hold stringified
    ``{entity name: count}`` dicts.  For every entity whose column name
    ``"(<type>)<entity name>"`` appears in *store*, the count is written into
    that column of the article's row.

    Args:
        finalEntitydf: DataFrame to update; it is modified in place.
        store: collection of top-entity column names.

    Returns:
        The same DataFrame, *finalEntitydf*.

    Progress is written to stdout while the frame is processed.
    """
    # (source column, column-name prefix).  The prefixes must match the NLTK
    # labels used to build `store`; the organisation prefix used to be
    # spelled "(ORGANISATION)", so it never matched "(ORGANIZATION)...".
    entity_sources = (
        ('person entities', '(PERSON)'),
        ('location entities', '(GPE)'),
        ('organisation entities', '(ORGANIZATION)'),
    )
    known_columns = set(store)
    total_records = finalEntitydf.shape[0]

    # Count processed rows instead of assuming an integer row label.
    for position, (index, row) in enumerate(finalEntitydf.iterrows(), start=1):
        sys.stdout.write('\rCompletion progress: ' + str(position) + ' of ' + str(total_records) + " articles")
        sys.stdout.flush()

        for source_column, prefix in entity_sources:
            # The dicts were stored with str(); literal_eval parses them back
            # without executing arbitrary code.
            counts = ast.literal_eval(row[source_column])
            for name, count in counts.items():
                column = prefix + str(name)
                if column in known_columns:
                    finalEntitydf.loc[index, column] = count

    return finalEntitydf

In [187]:
# Load the labelled articles; the cells below split them on 'label' (0 = fake, 1 = real).
er_main_df= pd.read_excel('sample_data/covid_real_fake.xlsx')

In [188]:
# Fake News
df1fake = er_main_df[er_main_df['label'] == 0]                                 # 1) Get fake news article dataframe
index_list = df1fake.index.tolist()                                 # 2) Get index of fake dataframe
# fakeEntityDic = getEntityDic(df1fake)                               # 3) Get sorted dict of {fake entities : count}
# fakeTop = getTopEntityDic(fakeEntityDic, 10)                        # 4) Get dic of {entitiy type : [sorted list of top 10 entities]}
df1_fakeEntities = addEntitiesToDataframe2(df1fake)                 # 5) Get DF with all entities, entity type count per article

Completion progress: 13 of 129 articles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Completion progress: 129 of 129 articles

In [189]:
# Real News
df1true = er_main_df[er_main_df['label'] == 1]                                 # 1) Get real news article dataframe
index_list = df1true.index.tolist()                                 # 2) Get index of real dataframe
# trueEntityDic = getEntityDic(df1true)                               # 3) Get sorted dict of {real entities : count}
# trueTop = getTopEntityDic(trueEntityDic, 10)                        # 4) Get dic of {entitiy type : [sorted list of top 10 entities]}
df1_trueEntities = addEntitiesToDataframe2(df1true)                 # 5) Get DF of entities, entity count per article

Completion progress: 14 of 70 articles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Completion progress: 70 of 70 articles

In [190]:
# Preview the first rows of the real-news frame with its new entity columns.
df1_trueEntities.head()

Unnamed: 0,index,title,text,label,person entities,organisation entities,location entities,person entity count,organisation entity count,location entity count
0,0,covid,The supposed phrase of the daughter of the pr...,1,{'banco santander': 1},{},{'portugal': 1},1.0,0.0,1.0
1,1,covid,Smokers are less likely to be hospitalized fo...,1,{},{},{},0.0,0.0,0.0
2,2,covid,"A photo of the room of a hotel in AlmerÃ­a, S...",1,{},{},{'spain': 1},0.0,0.0,1.0
3,3,covid,Drugstores in France are forbidden to buy and...,1,{},{},{'france': 1},0.0,0.0,1.0
4,4,covid,It is recommended to disinfect disposable mas...,1,{},{},{},0.0,0.0,0.0


In [191]:
# Combining Real and Fake News Dataframes
finalEntitydf = pd.concat([df1_trueEntities, df1_fakeEntities])     # Union fake & real dataframes
finalEntitydf.sort_index(ascending=True, inplace=True)              # Re-index them to original state

# Attaching Top Entities as Column Headers to Main Dataframe                   
finalEntitydf, store = attachTopEntities(finalEntitydf, fakeTop, trueTop)       

# Get Final DF with Top Entity counts
mainEntitydf = getFinalEntityDF(finalEntitydf, store)
mainEntitydf.head(3)

Completion progress: 199 of 199 articles

Unnamed: 0,index,title,text,label,person entities,organisation entities,location entities,person entity count,organisation entity count,location entity count
0,0,covid,The supposed phrase of the daughter of the pr...,1,{'banco santander': 1},{},{'portugal': 1},1.0,0.0,1.0
1,1,covid,Smokers are less likely to be hospitalized fo...,1,{},{},{},0.0,0.0,0.0
2,2,covid,"A photo of the room of a hotel in AlmerÃ­a, S...",1,{},{},{'spain': 1},0.0,0.0,1.0


In [192]:
# The stringified per-article entity dicts are no longer needed once the
# top-entity columns have been filled.
del mainEntitydf['person entities']
del mainEntitydf['organisation entities']
del mainEntitydf['location entities']
# 'Unnamed: 0' is the column pandas creates when an Excel sheet that was
# saved with its row index is read back.
del emotional_df_stack_groupped_pos['Unnamed: 0']

In [193]:
# Left-join the emotion / POS-group / VADER features with the entity features
# on 'index'; columns present in both frames get the _x / _y suffixes.
horizontal_stack = emotional_df_stack_groupped_pos.merge(mainEntitydf, how='left', on='index')
# Keep only the first occurrence of any repeated column name.
horizontal_stack = horizontal_stack.loc[:,~horizontal_stack.columns.duplicated()]
# Remove label columns and the duplicated title/text copies from the right frame.
# NOTE(review): both 'label' and 'label_x' are deleted here -- confirm that the
# label column still needed for modelling survives under another name.
del horizontal_stack['label']
del horizontal_stack['label_x']
del horizontal_stack['title_y']
del horizontal_stack['text_y']
# ',' and '.' are punctuation POS-tag count columns.
del horizontal_stack[',']
del horizontal_stack['.']

In [194]:
# Write the final engineered feature table (row index included, to_excel default).
horizontal_stack.to_excel('final_feature_engineered.xlsx')