In [None]:
!pip install empath

In [None]:
import random
import re

import nltk
import numpy as np
import pandas as pd
from empath import Empath
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob

In [None]:
# Fetch every NLTK resource the notebook relies on so that a fresh environment
# can run top to bottom:
#   - punkt / punkt_tab: nltk.word_tokenize
#   - vader_lexicon: SentimentIntensityAnalyzer
#   - stopwords: stopwords.words('english')
#   - wordnet / omw-1.4: WordNetLemmatizer
#   - averaged_perceptron_tagger(_eng): nltk.pos_tag
# Both the legacy and the newer resource names are requested because the name
# NLTK looks up depends on the installed version.
for resource in (
    'punkt',
    'punkt_tab',
    'vader_lexicon',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
# Load the training set; the CSV has no header row, so column names are supplied here.
train = pd.read_csv("../data/fulltrain.csv", names=["Label", "Text"])
train.head()

In [None]:
# Load the test set with the same two-column layout as the training set.
test = pd.read_csv("../data/balancedtest.csv", names=["Label", "Text"])
test.head()

In [None]:
# Count missing values per column; the text helpers below call str methods on every row.
train.isnull().sum()

In [None]:
# Class distribution of the training labels.
train["Label"].value_counts()

### Random sampling the train dataset

In [None]:
# Work on a fixed-seed random subset of 10,000 rows to keep feature extraction
# tractable. drop=True discards the old row labels instead of inserting them as
# an extra "index" column; without it, re-running this cell raises because the
# "index" column already exists. A fresh 0..n-1 index is required so that the
# later pd.concat with the tf-idf frame aligns row by row.
train = train.sample(n=10000, random_state=12).reset_index(drop=True)
train.head()

## Data Preprocessing

In [None]:
def preprocess(data):
    """Add cleaned and tokenized text columns to ``data`` and return it.

    The frame is modified in place. ``Text_Clean`` is the ``Text`` column after
    lower-casing, special-character removal, stop-word removal and
    lemmatization (in that order). ``Text_Clean_Tokenized`` and
    ``Text_Tokenized`` hold the token lists of the cleaned and the raw text.
    """
    cleaning_steps = (
        text_lower,
        text_remove_special_characters,
        text_remove_stopwords,
        text_lemmatize,
    )

    cleaned = data['Text']
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)

    data['Text_Clean'] = cleaned
    data['Text_Clean_Tokenized'] = data['Text_Clean'].apply(text_tokenize)
    data['Text_Tokenized'] = data['Text'].apply(text_tokenize)
    return data

def text_lemmatize(text):
    """Tokenize ``text``, lemmatize each token with WordNet, and re-join with spaces."""
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text_tokenize(text))
    return " ".join(lemmas)

def text_lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def text_remove_special_characters(text):
    """Replace every character that is not an ASCII letter or digit with a space."""
    non_alphanumeric = re.compile('[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', text)

def text_remove_links(text):
    """Strip http(s):// and www. URLs from ``text``.

    Each URL (up to the next whitespace) is replaced with the empty string; the
    surrounding whitespace is left untouched.
    """
    # Raw string: '\S' and '\.' in a normal string literal are invalid escape
    # sequences and trigger a warning on recent Python versions.
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def text_remove_stopwords(text):
    """Tokenize ``text`` and drop NLTK's English stop words.

    Returns the remaining tokens joined by single spaces. Matching is
    case-sensitive, so the text should already be lower-cased.
    """
    # A set makes each membership test O(1) instead of scanning the whole list
    # for every token.
    stopword_set = set(stopwords.words('english'))
    return " ".join(word for word in text_tokenize(text) if word not in stopword_set)

def text_tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens

def undersample_majority_class(data, y_col, y_value, fraction=0.5, seed=10):
    """Return a copy of ``data`` with a random share of one class removed.

    Args:
        data: DataFrame to undersample; it is not modified.
        y_col: Name of the label column.
        y_value: Label value identifying the class to thin out.
        fraction: Share of the matching rows to drop (default 0.5).
        seed: Seed for the sampling, so results are reproducible (default 10).

    Returns:
        A new DataFrame without the randomly selected rows.
    """
    majority_index = data.index[data[y_col] == y_value].tolist()
    # A private Random instance yields the same sample as seeding the global
    # generator would, without disturbing random state used elsewhere.
    rng = random.Random(seed)
    random_sample = rng.sample(majority_index, round(len(majority_index) * fraction))
    return data.drop(random_sample)

In [None]:
# preprocess() adds the Text_Clean* / Text_Tokenized columns to `train` in place.
preprocess(train)
train.head()

In [None]:
# Apply the identical preprocessing to the test set (also in place).
preprocess(test)
test.head()

# Baseline tf-idf NB Model

In [None]:
# Learn the tf-idf vocabulary and weights on the training text only.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
vectorized_train_X = vectorizer.fit_transform(train["Text_Clean"])
train_y = train["Label"]

# The test text is only transformed with the vocabulary fitted above.
vectorized_test_X = vectorizer.transform(test["Text_Clean"])
test_y = test["Label"]

# Baseline classifier: multinomial Naive Bayes on the tf-idf features alone.
nb_classifer = MultinomialNB()
nb_classifer.fit(vectorized_train_X, train_y)

pred_y = nb_classifer.predict(vectorized_test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

# target_names are listed in ascending label order (1..4).
print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

This is the baseline that we aim to improve upon.

From the metrics calculated, we see that Reliable News is predicted with a precision of 100%. This means that every article the model predicted as "Reliable News" was in fact labelled "Reliable News" (precision does not tell us how many Reliable News articles were missed; that is measured by recall). However, articles with other labels scored lower on the metrics.

## Syntactic Feature Engineering  

#### Number of Characters

In [None]:
def count_chars(text):
    """Return the number of characters in ``text`` (whitespace included)."""
    n_chars = len(text)
    return n_chars

#### Number of Words

In [None]:
def count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

#### Number of Capital Characters

In [None]:
def count_capital_chars(text):
    """Return how many characters of ``text`` are upper-case."""
    return sum(1 for char in text if char.isupper())

#### Number of Capital Words

In [None]:
def count_capital_words(text):
    """Return how many whitespace-separated words of ``text`` are fully upper-case."""
    return sum(1 for word in text.split() if word.isupper())

#### Number of Pronouns

In [None]:
def count_first_person_pronouns(text):
    """Count whole-word, case-insensitive first-person pronouns in ``text``."""
    pattern = re.compile(r'\b(I|me|my|mine|we|us|our|ours)\b', flags=re.IGNORECASE)
    return sum(1 for _ in pattern.finditer(text))

In [None]:
def count_second_person_pronouns(text):
    """Count whole-word, case-insensitive second-person pronouns in ``text``."""
    pattern = re.compile(r'\b(you|your|yours)\b', flags=re.IGNORECASE)
    return sum(1 for _ in pattern.finditer(text))

In [None]:
def count_third_person_pronouns(text):
    """Count whole-word, case-insensitive third-person pronouns in ``text``."""
    pattern = re.compile(
        r'\b(he|him|his|she|her|hers|it|its|they|them|their|theirs)\b',
        flags=re.IGNORECASE,
    )
    return sum(1 for _ in pattern.finditer(text))

#### Number of Hedging Words

In [None]:
def count_hedges(text):
    """Count the tokens of ``text`` that are listed in ``hedging_words.txt``.

    The word list is read from the working directory, one entry per line.
    Tokens are compared case-insensitively.

    NOTE(review): matching is per token, so multi-word entries in the list
    (if any) can never match — confirm against the contents of the file.
    """
    with open('hedging_words.txt', 'r') as f:
        # Set for O(1) lookups; entries are lower-cased because tokens are
        # lower-cased before the comparison, and blank lines are skipped.
        hedging_words = {line.strip().lower() for line in f if line.strip()}

    # Use NLTK to tokenize the text into words
    words = nltk.word_tokenize(text)

    return sum(1 for word in words if word.lower() in hedging_words)

#### Number of Boosting Words

In [None]:
def count_boosts(text):
    """Count the tokens of ``text`` that are listed in ``boosting_words.txt``.

    The word list is read from the working directory, one entry per line.
    Tokens are compared case-insensitively.

    NOTE(review): matching is per token, so multi-word entries in the list
    (if any) can never match — confirm against the contents of the file.
    """
    with open('boosting_words.txt', 'r') as f:
        # Set for O(1) lookups; entries are lower-cased because tokens are
        # lower-cased before the comparison, and blank lines are skipped.
        boosting_words = {line.strip().lower() for line in f if line.strip()}

    # Use NLTK to tokenize the text into words
    words = nltk.word_tokenize(text)

    return sum(1 for word in words if word.lower() in boosting_words)

#### Number of Numbers

In [None]:
def count_numbers(text):
    """Count the numbers appearing in ``text``.

    A number is a run of digits, optionally followed by comma-separated
    thousands groups and an optional decimal part (e.g. ``2023``, ``1,000``,
    ``3.14``, ``1,234.5``).
    """
    # `\d+` (instead of `\d{1,3}`) keeps un-grouped numbers such as "2023" or
    # "12345" as ONE match rather than splitting them every three digits.
    # Non-capturing groups make findall return the matched text itself.
    pattern = r"\d+(?:,\d{3})*(?:\.\d+)?"
    return len(re.findall(pattern, text))

#### Number of Positive and Negative Words

In [None]:
def count_positive_words(words):
    """Count the words that VADER scores as positive.

    Args:
        words: Either a whitespace-delimited string or an iterable of word
            strings.

    Returns:
        The number of words whose individual VADER ``pos`` score is > 0.
    """
    sia = SentimentIntensityAnalyzer()

    # Split a string into words first: iterating a str directly yields
    # characters, which previously made the result either 0 or len(text).
    tokens = words.split() if isinstance(words, str) else list(words)

    # Score each word on its own rather than reusing one document-level score.
    return sum(1 for token in tokens if sia.polarity_scores(token)['pos'] > 0)

In [None]:
def count_negative_words(words):
    """Count the words that VADER scores as negative.

    Args:
        words: Either a whitespace-delimited string or an iterable of word
            strings.

    Returns:
        The number of words whose individual VADER ``neg`` score is > 0.
    """
    sia = SentimentIntensityAnalyzer()

    # Split a string into words first: iterating a str directly yields
    # characters, which previously made the result either 0 or len(text).
    tokens = words.split() if isinstance(words, str) else list(words)

    # Score each word on its own rather than reusing one document-level score.
    return sum(1 for token in tokens if sia.polarity_scores(token)['neg'] > 0)

#### Number of Proper Nouns

In [None]:
def count_proper_nouns(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` carry the tag ``NNP``."""
    return len([tag for _, tag in pos_tags if tag == 'NNP'])

#### Number of Conjunctions

In [None]:
def count_conjunctions(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` carry the tag ``CC``."""
    return len([tag for _, tag in pos_tags if tag == 'CC'])

#### Number of Superlatives

In [None]:
def count_superlatives(pos_tags):
    """Return how many ``(word, tag)`` pairs in ``pos_tags`` carry the tag ``JJS``."""
    return len([tag for _, tag in pos_tags if tag == "JJS"])

### Testing Syntactic Features

In [None]:
def generate_syntactic_features(data):
    """Add the syntactic count features to ``data`` in place and return it.

    Expects the ``Text``, ``Text_Clean`` and ``Text_Tokenized`` columns.
    Case-sensitive features (capitals, pronouns, POS tags) are computed from the
    raw text; the remaining counts use the cleaned text.
    """
    data['Char_Count'] = data["Text_Clean"].apply(count_chars)
    data['Word_Count'] = data["Text_Clean"].apply(count_words)
    data['Capital_Chars_Count'] = data["Text"].apply(count_capital_chars)
    data['Capital_Words_Count'] = data["Text"].apply(count_capital_words)

    # Each pronoun column uses its own counter (the first- and second-person
    # columns previously duplicated the third-person count).
    data['First_Person_Pronoun_Count'] = data["Text"].apply(count_first_person_pronouns)
    data['Second_Person_Pronoun_Count'] = data["Text"].apply(count_second_person_pronouns)
    data['Third_Person_Pronoun_Count'] = data["Text"].apply(count_third_person_pronouns)
    data['Boost_Count'] = data["Text_Clean"].apply(count_boosts)
    data['Number_Count'] = data["Text_Clean"].apply(count_numbers)

    data['Positive_Word_Count'] = data["Text_Clean"].apply(count_positive_words)
    data['Negative_Word_Count'] = data["Text_Clean"].apply(count_negative_words)

    # POS-tag once and derive all tag-based counts from the cached tags.
    data['pos_tags'] = data["Text_Tokenized"].apply(nltk.pos_tag)
    data['Proper_Noun_Count'] = data["pos_tags"].apply(count_proper_nouns)
    data['Conjunction_Count'] = data["pos_tags"].apply(count_conjunctions)
    data['Superlative_Count'] = data["pos_tags"].apply(count_superlatives)

    # Returned for consistency with the other feature helpers in this notebook.
    return data

In [None]:
# Adds the syntactic feature columns to `train` in place.
generate_syntactic_features(train)
train.head()

In [None]:
# Adds the same syntactic feature columns to `test` in place.
generate_syntactic_features(test)
test.head()

#### Syntactic Features

In [None]:
# Syntactic count features only (no tf-idf).
features = [
    "Char_Count",
    "Word_Count",
    "Capital_Chars_Count",
    "Capital_Words_Count",
    "First_Person_Pronoun_Count",
    "Second_Person_Pronoun_Count",
    "Boost_Count",
    "Number_Count",
    "Positive_Word_Count",
    "Negative_Word_Count",
    "Proper_Noun_Count",
    "Conjunction_Count",
    "Superlative_Count",
]

# Fit Naive Bayes on the raw (non-negative) counts and score the test set.
nb_classifer = MultinomialNB().fit(train[features], train_y)
pred_y = nb_classifer.predict(test[features])

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

#### Syntactic Features + tf-idf

In [None]:
# Densify the tf-idf matrices so they can be concatenated with the engineered
# feature columns. The columns are named after the vocabulary terms: recent
# scikit-learn versions reject DataFrames whose column names mix integers (the
# default here) with strings (the engineered feature names).
tfidf_columns = vectorizer.get_feature_names_out()
vectorized_train_X_df = pd.DataFrame(vectorized_train_X.toarray(), columns=tfidf_columns)
vectorized_test_X_df = pd.DataFrame(vectorized_test_X.toarray(), columns=tfidf_columns)

# Row alignment relies on both frames sharing the default 0..n-1 index.
train_X = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_X = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

In [None]:
# Fit Naive Bayes on tf-idf + syntactic features and score the test set.
nb_classifer = MultinomialNB().fit(train_X, train_y)
pred_y = nb_classifer.predict(test_X)

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

## Semantic Feature Engineering

### Sentiment Analysis

#### TextBlob Sentiment Analysis

In [None]:
def textblob_sentiment_analysis(data):
    """Add TextBlob polarity and subjectivity columns to ``data`` and return it.

    Reads ``Text_Clean`` and writes ``Blob_Polarity`` and ``Blob_Subjectivity``
    (the frame is modified in place).
    """
    # Analyse each document once and reuse the result for both columns instead
    # of running TextBlob twice per document.
    sentiments = [TextBlob(text).sentiment for text in data['Text_Clean']]
    data['Blob_Polarity'] = [sentiment.polarity for sentiment in sentiments]
    data['Blob_Subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
    return data

In [None]:
# Adds Blob_Polarity / Blob_Subjectivity to the training frame.
train = textblob_sentiment_analysis(train)
train.head()

In [None]:
# Adds Blob_Polarity / Blob_Subjectivity to the test frame.
test = textblob_sentiment_analysis(test)
test.head()

In [None]:
# TextBlob sentiment scores appended to the tf-idf matrix.
features = ["Blob_Polarity",
            "Blob_Subjectivity",]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits inconsistently and
# leak test statistics. clip=True keeps test values inside [0, 1], so the
# features stay non-negative for MultinomialNB.
scaler = MinMaxScaler(clip=True)
train_X = scaler.fit_transform(train_features)
test_X = scaler.transform(test_features)

In [None]:
# Fit Naive Bayes on tf-idf + TextBlob features and score the test set.
nb_classifer = MultinomialNB().fit(train_X, train_y)
pred_y = nb_classifer.predict(test_X)

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

#### Vader Sentiment Analysis

In [None]:
# Shared analyzer instance, also used by the n-gram polarity helpers below.
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(data):
    """Add VADER sentiment columns to ``data`` and return it.

    ``Vader_Scores`` holds the raw score dict for ``Text_Clean``; the four
    ``Vader_*`` columns expose its ``neg``, ``neu``, ``pos`` and ``compound``
    entries. The frame is modified in place.
    """
    data['Vader_Scores'] = data['Text_Clean'].apply(vader.polarity_scores)

    column_for_key = {
        'neg': 'Vader_Negative',
        'neu': 'Vader_Neutral',
        'pos': 'Vader_Positive',
        'compound': 'Vader_Compound',
    }
    for key, column in column_for_key.items():
        data[column] = data['Vader_Scores'].apply(lambda scores, key=key: scores[key])
    return data

In [None]:
# Adds the Vader_* columns to the training frame.
train = vader_sentiment_analysis(train)
train.head()

In [None]:
# Adds the Vader_* columns to the test frame.
test = vader_sentiment_analysis(test)
test.head()

In [None]:
# VADER sentiment scores appended to the tf-idf matrix.
features = ["Vader_Negative",
            "Vader_Neutral",
            "Vader_Positive",
            "Vader_Compound"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits inconsistently and
# leak test statistics. clip=True keeps test values inside [0, 1], so the
# features stay non-negative for MultinomialNB.
scaler = MinMaxScaler(clip=True)
train_X = scaler.fit_transform(train_features)
test_X = scaler.transform(test_features)

In [None]:
# Fit Naive Bayes on tf-idf + VADER features and score the test set.
nb_classifer = MultinomialNB().fit(train_X, train_y)
pred_y = nb_classifer.predict(test_X)

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

#### Combining Both

In [None]:
# TextBlob and VADER sentiment scores together, appended to the tf-idf matrix.
features = ["Blob_Polarity",
            "Blob_Subjectivity",
            "Vader_Negative",
            "Vader_Neutral",
            "Vader_Positive",
            "Vader_Compound"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits inconsistently and
# leak test statistics. clip=True keeps test values inside [0, 1], so the
# features stay non-negative for MultinomialNB.
scaler = MinMaxScaler(clip=True)
train_X = scaler.fit_transform(train_features)
test_X = scaler.transform(test_features)

In [None]:
# Fit Naive Bayes on tf-idf + TextBlob + VADER features and score the test set.
nb_classifer = MultinomialNB().fit(train_X, train_y)
pred_y = nb_classifer.predict(test_X)

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

### Context Incongruity

#### Opposite Polarity N-grams

In [None]:
def generate_N_gram(tokenized,ngram=1):
    """Return the consecutive ``ngram``-token windows of ``tokenized``.

    Each window is returned as a single space-joined string. When
    ``tokenized`` has fewer than ``ngram`` tokens the result is empty.
    """
    # One copy of the token list per offset; zipping them yields the windows
    # and stops as soon as the shortest (most shifted) copy is exhausted.
    shifted = [tokenized[offset:] for offset in range(ngram)]
    return [' '.join(window) for window in zip(*shifted)]

def get_N_gram_polarities(n_gram):
    """Return the VADER ``compound`` score of every string in ``n_gram``, in order."""
    return [vader.polarity_scores(gram)["compound"] for gram in n_gram]
    
def count_context_incongruities(tokenized, N):
    """Count adjacent ``N``-gram pairs whose compound polarities have opposite signs.

    A pair counts when the product of the two scores is strictly negative, so
    neutral (zero-score) n-grams never contribute.
    """
    polarities = get_N_gram_polarities(generate_N_gram(tokenized, ngram=N))
    neighbours = zip(polarities, polarities[1:])
    return sum(1 for left, right in neighbours if left * right < 0)

In [None]:
def get_context_incongruities(data, N):
    """Add the ``N``-gram context-incongruity count column to ``data`` and return it.

    The count is computed from ``Text_Tokenized``; the frame is modified in place.
    """
    def _count(tokens):
        return count_context_incongruities(tokens, N)

    column = "Context_Incongruity - " + str(N) + "-gram"
    data[column] = data["Text_Tokenized"].apply(_count)
    return data

In [None]:
# Compute the incongruity counts for 1- to 5-grams on both splits (in place).
for i in range(1, 6):
    get_context_incongruities(train, i)
    get_context_incongruities(test, i)
    
print(train.head())
print(test.head())

In [None]:
# All five context-incongruity counts appended to the tf-idf matrix.
features = ["Context_Incongruity - 1-gram",
            "Context_Incongruity - 2-gram",
            "Context_Incongruity - 3-gram",
            "Context_Incongruity - 4-gram",
            "Context_Incongruity - 5-gram"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits inconsistently and
# leak test statistics. clip=True keeps test values inside [0, 1], so the
# features stay non-negative for MultinomialNB.
scaler = MinMaxScaler(clip=True)
train_X = scaler.fit_transform(train_features)
test_X = scaler.transform(test_features)

In [None]:
# Fit Naive Bayes on tf-idf + context-incongruity features and score the test set.
nb_classifer = MultinomialNB().fit(train_X, train_y)
pred_y = nb_classifer.predict(test_X)

accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

class_names = ["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]
print(metrics.classification_report(test_y, pred_y, target_names=class_names))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

In [None]:
# Evaluate each context-incongruity feature on its own, added to tf-idf.
for feature in features:
    train_features = pd.concat([vectorized_train_X_df, train[feature]], axis="columns")
    test_features = pd.concat([vectorized_test_X_df, test[feature]], axis="columns")

    # Fit the scaler on the training split only and reuse it for the test
    # split; re-fitting on the test data would scale the splits inconsistently
    # and leak test statistics. clip=True keeps test values inside [0, 1].
    scaler = MinMaxScaler(clip=True)
    train_X = scaler.fit_transform(train_features)
    test_X = scaler.transform(test_features)

    nb_classifer = MultinomialNB()
    nb_classifer.fit(train_X, train_y)

    pred_y = nb_classifer.predict(test_X)
    accuracy = metrics.accuracy_score(test_y, pred_y)

    print(feature)
    print("accuracy:   %0.3f" % accuracy)

    print(metrics.classification_report(test_y, pred_y,
                                        target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_y, pred_y))

### Topic Modeling and Lexicons

#### Lexical Categories Analysis using Empath

An example of what the code below is executing:

In [None]:
lexicon = Empath()

# Seed words for the custom Empath categories. Each word is both the category
# name and its single seed term.
categories = [
    "sarcastic",
    "ironic",
    "contradict",
    "mock",
    "jest",
    "malicious",
    "vindictive",  # was misspelled "vinidctive", which is not a usable seed word
    "government",
    "politics",
    "society",
    "money",
    "culture",
    "convince",
    "discredit",
    "fact",
    "honest",
    "trusted",
]

# Build one category per seed word using Empath's "nytimes" model.
for cat in categories:
    lexicon.create_category(cat, [cat], model="nytimes")

In [None]:
# Worked example: score one article against the single "jest" category.
text = "World Champion skier and Olympic gold medal favorite Lindsey Vonn admitted yesterday that the secret to her success is her 'really, really good ski poles.' 'There's no way I would have won 31 World Cup races without these great, great ski poles,' Vonn told reporters during a press conference, noting that without the top-of-the-line ski poles, it would be difficult for her to maintain her balance or change directions during competition. 'I use them a lot because I'm always skiing, and they haven't broken in half or anything. I think they're really expensive too, like over 50 bucks.' Vonn, who said she was unsure if her ski poles were made of graphite or carbon fiber, urged reporters to trust her when she said that 'whatever they're made of is definitely the best.' "
emotion_info = lexicon.analyze(text, categories=["jest"])
print("Emotion Info: \n\n", emotion_info)

# Vectorize the result and take element [0][0] of the dense array as the score.
dict_vectorizer = DictVectorizer()
vec_emotion_info = dict_vectorizer.fit_transform(emotion_info).toarray()[0][0]
print("\nVectorized: \n\n", vec_emotion_info)
print(type(vec_emotion_info))

# Displayed as the cell output: the feature name(s) the vectorizer learned.
dict_vectorizer.get_feature_names_out()

In [None]:
def get_lexical_categories(data):
    """Add one Empath score column per entry of ``categories`` and return ``data``.

    For every category ``cat`` in the notebook-level ``categories`` list, the
    column ``"Lexicon - <cat>"`` receives the Empath score of ``Text_Clean``
    for that category. The frame is modified in place and also returned, so
    both ``get_lexical_categories(df)`` and ``df = get_lexical_categories(df)``
    work.
    """
    lexicon = Empath()
    for cat in categories:
        # analyze() returns a dict keyed by category name; read the score
        # directly instead of round-tripping it through a DictVectorizer.
        data["Lexicon - " + cat] = data["Text_Clean"].apply(
            lambda x, cat=cat: lexicon.analyze(x, categories=[cat])[cat]
        )
    return data

In [None]:
# Adds the "Lexicon - <category>" columns to `train` in place.
get_lexical_categories(train) 
train.head()

In [None]:
# Adds the "Lexicon - <category>" columns to `test` in place. The result is
# deliberately not assigned back, so `test` can never be replaced by a missing
# return value; this mirrors the call on `train`.
get_lexical_categories(test)
test.head()

In [None]:
# Evaluate each lexical category on its own, added to tf-idf + TextBlob features.
for cat in categories:
    features = ["Blob_Polarity",
            "Blob_Subjectivity",]

    features.append("Lexicon - " + cat)

    train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
    test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

    # Fit the scaler on the training split only and reuse it for the test
    # split; re-fitting on the test data would scale the splits inconsistently
    # and leak test statistics. clip=True keeps test values inside [0, 1].
    scaler = MinMaxScaler(clip=True)
    train_X = scaler.fit_transform(train_features)
    test_X = scaler.transform(test_features)

    print(features)

    nb_classifer = MultinomialNB()
    nb_classifer.fit(train_X, train_y)

    pred_y = nb_classifer.predict(test_X)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print("accuracy:   %0.3f" % accuracy)

    print(metrics.classification_report(test_y, pred_y,
                                        target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_y, pred_y))

## Overall Model

In [None]:
# Overall model: tf-idf + syntactic + sentiment + lexical-category features.
features = [
    "Char_Count",
    "Word_Count",
    "Capital_Chars_Count",
    "Capital_Words_Count",
    "First_Person_Pronoun_Count",
    "Second_Person_Pronoun_Count",
    "Boost_Count",
    "Number_Count",
    "Positive_Word_Count",
    "Negative_Word_Count",
    "Proper_Noun_Count",
    "Conjunction_Count",
    "Superlative_Count",
    "Blob_Polarity",
    "Blob_Subjectivity",
    "Vader_Negative",
    "Vader_Neutral",
    "Vader_Positive",
    "Vader_Compound",
]

for cat in categories:
    features.append("Lexicon - " + cat)

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

# Fit the scaler on the training split only and reuse it for the test split;
# re-fitting on the test data would scale the two splits inconsistently and
# leak test statistics. clip=True keeps test values inside [0, 1], so the
# features stay non-negative for MultinomialNB.
scaler = MinMaxScaler(clip=True)
train_X = scaler.fit_transform(train_features)
test_X = scaler.transform(test_features)

print(features)

nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                    target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))