In [1]:
!pip install empath



In [2]:
from empath import Empath
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob


In [3]:
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/rachelng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rachelng/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
train = pd.read_csv("data/fulltrain.csv", names=["Label", "Text"])
train.head()

Unnamed: 0,Label,Text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [5]:
test = pd.read_csv("data/balancedtest.csv", names=["Label", "Text"])
test.head()

Unnamed: 0,Label,Text
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...


In [6]:
train.isnull().sum()

Label    0
Text     0
dtype: int64

In [7]:
train["Label"].value_counts()

3    17870
1    14047
4     9995
2     6942
Name: Label, dtype: int64

### Random sampling the train dataset

In [8]:
train = train.sample(n=10000).reset_index()

## Data Preprocessing

In [9]:
def preprocess(data):
    data['Text_Clean'] = data['Text'].apply(text_lower)
    data['Text_Clean'] = data['Text_Clean'].apply(text_remove_special_characters)
    data['Text_Clean'] = data['Text_Clean'].apply(text_remove_stopwords)
    data['Text_Clean'] = data['Text_Clean'].apply(text_lemmatize)
    data['Text_Tokenized'] = data['Text_Clean'].apply(text_tokenize)
    return data

def text_lemmatize(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    word_list = text_tokenize(text)
    return " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_list])

def text_lower(text):
    return text.lower()

def text_remove_special_characters(text):
    return re.sub('[^a-zA-Z0-9]',' ', text)

def text_remove_links(text):
    return re.sub('https?://\S+|www\.\S+', '', text)

def text_remove_stopwords(text):
    stopword_list = stopwords.words('english')
    word_list = text_tokenize(text)
    return " ".join([word for word in word_list if word not in stopword_list])

def text_tokenize(text):
    return nltk.word_tokenize(text)

def undersample_majority_class(data, y_col, y_value):
    majority_index = data.index[data[y_col] == y_value].tolist()
    random.seed(10)
    random_sample = random.sample(majority_index, round(len(majority_index) * 0.5))
    return data.drop(random_sample)

In [10]:
train = preprocess(train)
train.head()

Unnamed: 0,index,Label,Text,Text_Clean,Text_Tokenized
0,24561,3,Third Journalist Killed in 3 Months in Turkey ...,third journalist killed 3 month turkey suspect...,"[third, journalist, killed, 3, month, turkey, ..."
1,19348,2,Welfare Leech With 12 Kids Collects More In Be...,welfare leech 12 kid collect benefit make enti...,"[welfare, leech, 12, kid, collect, benefit, ma..."
2,35758,3,"Study Involving 18,000 People Confirms Acupunc...",study involving 18 000 people confirms acupunc...,"[study, involving, 18, 000, people, confirms, ..."
3,6252,1,"When Enron founder Kenneth Lay died suddenly, ...",enron founder kenneth lay died suddenly le two...,"[enron, founder, kenneth, lay, died, suddenly,..."
4,9815,1,Determined to create the definitive visual doc...,determined create definitive visual document p...,"[determined, create, definitive, visual, docum..."


In [11]:
test = preprocess(test)
test.head()

Unnamed: 0,Label,Text,Text_Clean,Text_Tokenized
0,1,When so many actors seem content to churn out ...,many actor seem content churn performance quic...,"[many, actor, seem, content, churn, performanc..."
1,1,In what football insiders are calling an unex...,football insider calling unexpectedly severe p...,"[football, insider, calling, unexpectedly, sev..."
2,1,In a freak accident following Game 3 of the N....,freak accident following game 3 n b final clev...,"[freak, accident, following, game, 3, n, b, fi..."
3,1,North Koreas official news agency announced to...,north korea official news agency announced tod...,"[north, korea, official, news, agency, announc..."
4,1,The former Alaska Governor Sarah Palin would b...,former alaska governor sarah palin would bring...,"[former, alaska, governor, sarah, palin, would..."


# Baseline tf-idf NB Model

In [12]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
vectorized_train_X = vectorizer.fit_transform(train["Text_Clean"])
train_y = train["Label"]

vectorized_test_X = vectorizer.transform(test["Text_Clean"])
test_y = test["Label"]

print(vectorized_test_X)

nb_classifer = MultinomialNB()
nb_classifer.fit(vectorized_train_X, train_y)

pred_y = nb_classifer.predict(vectorized_test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

  (0, 87515)	0.09386281502534428
  (0, 86720)	0.07645718439808229
  (0, 86627)	0.10266157430241152
  (0, 86443)	0.04637225729814395
  (0, 85140)	0.04837848655577018
  (0, 84942)	0.042101212467724206
  (0, 84921)	0.08730989417128553
  (0, 84809)	0.08802435446842397
  (0, 84438)	0.07935565275111378
  (0, 79712)	0.12443038046976886
  (0, 78959)	0.03243337258916965
  (0, 77945)	0.06235335291495431
  (0, 76648)	0.12122960476222419
  (0, 75874)	0.11813194416985015
  (0, 75086)	0.10396767496707465
  (0, 75051)	0.10896831547236048
  (0, 74293)	0.07339833969598303
  (0, 74103)	0.12502299060411737
  (0, 74054)	0.06472906529647358
  (0, 69048)	0.13366648622594446
  (0, 67293)	0.16589552137120842
  (0, 66821)	0.04254406192307244
  (0, 66175)	0.08186379897416068
  (0, 65214)	0.11010224425145058
  (0, 64418)	0.05014427672585871
  :	:
  (2999, 14668)	0.05263574245414766
  (2999, 14362)	0.04897440433814833
  (2999, 13246)	0.04741251475123183
  (2999, 12976)	0.04632475641915444
  (2999, 11259)	0.042121

This will be the baseline to which we aim to improve.

From the metrics calculated, we see that Reliable news is being predicted with a precision of 100%. This means that all articles with labelled "Reliable News" were correctly identifies as "Reliable News". However, articles of other lablels scored lower on the metrics. 

## Feature Engineering

#### Number of Characters

In [13]:
def count_chars(text):
    return len(text)

#### Number of Words

In [14]:
def count_words(text):
    return len(text.split())

#### Number of Capital Characters

In [15]:
def count_capital_chars(text):
    count=0
    for i in text:
        if i.isupper():
            count+=1
    return count

#### Number of Capital Words

In [16]:
def count_capital_words(text):
    return sum(map(str.isupper,text.split()))

#### Testing Syntactic Features

In [17]:
def generate_syntactic_features(data):
    data['Char_Count'] = data["Text"].apply(count_chars)
    data['Word_Count'] = data["Text"].apply(count_words)
    data['Capital_Chars_Count'] = data["Text"].apply(count_capital_chars)
    data['Capital_Words_Count'] = data["Text"].apply(count_capital_words)
    return data

In [18]:
train = generate_syntactic_features(train)
train.head()

Unnamed: 0,index,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count
0,24561,3,Third Journalist Killed in 3 Months in Turkey ...,third journalist killed 3 month turkey suspect...,"[third, journalist, killed, 3, month, turkey, ...",6024,951,235,22
1,19348,2,Welfare Leech With 12 Kids Collects More In Be...,welfare leech 12 kid collect benefit make enti...,"[welfare, leech, 12, kid, collect, benefit, ma...",2408,434,66,10
2,35758,3,"Study Involving 18,000 People Confirms Acupunc...",study involving 18 000 people confirms acupunc...,"[study, involving, 18, 000, people, confirms, ...",2706,398,63,1
3,6252,1,"When Enron founder Kenneth Lay died suddenly, ...",enron founder kenneth lay died suddenly le two...,"[enron, founder, kenneth, lay, died, suddenly,...",1956,341,46,1
4,9815,1,Determined to create the definitive visual doc...,determined create definitive visual document p...,"[determined, create, definitive, visual, docum...",1064,163,23,2


In [19]:
test = generate_syntactic_features(test)
test.head()

Unnamed: 0,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count
0,1,When so many actors seem content to churn out ...,many actor seem content churn performance quic...,"[many, actor, seem, content, churn, performanc...",1356,251,31,4
1,1,In what football insiders are calling an unex...,football insider calling unexpectedly severe p...,"[football, insider, calling, unexpectedly, sev...",1173,202,40,2
2,1,In a freak accident following Game 3 of the N....,freak accident following game 3 n b final clev...,"[freak, accident, following, game, 3, n, b, fi...",979,167,27,1
3,1,North Koreas official news agency announced to...,north korea official news agency announced tod...,"[north, korea, official, news, agency, announc...",814,134,28,2
4,1,The former Alaska Governor Sarah Palin would b...,former alaska governor sarah palin would bring...,"[former, alaska, governor, sarah, palin, would...",1120,177,36,4


In [20]:
features = ["Char_Count", 
            "Word_Count",
            "Capital_Chars_Count", 
            "Capital_Words_Count"]
vectorized_train_X_df = pd.DataFrame(vectorized_train_X.toarray())
vectorized_test_X_df = pd.DataFrame(vectorized_test_X.toarray())

train_X = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_X = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

In [21]:
nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))



accuracy:   0.268
                   precision    recall  f1-score   support

       1 - Satire       0.75      0.00      0.01       750
         2 - Hoax       0.93      0.04      0.07       750
   3 - Propaganda       0.25      1.00      0.41       750
4 - Reliable News       0.96      0.03      0.06       750

         accuracy                           0.27      3000
        macro avg       0.72      0.27      0.14      3000
     weighted avg       0.72      0.27      0.14      3000

confusion matrix:
[[  3   1 745   1]
 [  0  28 722   0]
 [  0   0 750   0]
 [  1   1 725  23]]


## Feature Engineering for Semantic Analysis

### Sentiment Analysis

#### TextBlob Sentiment Analysis

In [22]:
def textblob_sentiment_analysis(data):
    data['Blob_Polarity'] = data['Text_Clean'].apply(lambda x: TextBlob(x).sentiment.polarity)
    data['Blob_Subjectivity'] = data['Text_Clean'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    return data

In [23]:
train = textblob_sentiment_analysis(train)
train.head()

Unnamed: 0,index,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,Blob_Subjectivity
0,24561,3,Third Journalist Killed in 3 Months in Turkey ...,third journalist killed 3 month turkey suspect...,"[third, journalist, killed, 3, month, turkey, ...",6024,951,235,22,-0.031443,0.326302
1,19348,2,Welfare Leech With 12 Kids Collects More In Be...,welfare leech 12 kid collect benefit make enti...,"[welfare, leech, 12, kid, collect, benefit, ma...",2408,434,66,10,0.023333,0.408631
2,35758,3,"Study Involving 18,000 People Confirms Acupunc...",study involving 18 000 people confirms acupunc...,"[study, involving, 18, 000, people, confirms, ...",2706,398,63,1,0.133428,0.319934
3,6252,1,"When Enron founder Kenneth Lay died suddenly, ...",enron founder kenneth lay died suddenly le two...,"[enron, founder, kenneth, lay, died, suddenly,...",1956,341,46,1,0.176077,0.563265
4,9815,1,Determined to create the definitive visual doc...,determined create definitive visual document p...,"[determined, create, definitive, visual, docum...",1064,163,23,2,0.202329,0.615476


In [24]:
test = textblob_sentiment_analysis(test)
test.head()

Unnamed: 0,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,Blob_Subjectivity
0,1,When so many actors seem content to churn out ...,many actor seem content churn performance quic...,"[many, actor, seem, content, churn, performanc...",1356,251,31,4,0.124375,0.509644
1,1,In what football insiders are calling an unex...,football insider calling unexpectedly severe p...,"[football, insider, calling, unexpectedly, sev...",1173,202,40,2,0.130303,0.543603
2,1,In a freak accident following Game 3 of the N....,freak accident following game 3 n b final clev...,"[freak, accident, following, game, 3, n, b, fi...",979,167,27,1,-0.0625,0.426667
3,1,North Koreas official news agency announced to...,north korea official news agency announced tod...,"[north, korea, official, news, agency, announc...",814,134,28,2,0.003409,0.375054
4,1,The former Alaska Governor Sarah Palin would b...,former alaska governor sarah palin would bring...,"[former, alaska, governor, sarah, palin, would...",1120,177,36,4,0.067614,0.33487


In [25]:
features = ["Blob_Polarity", 
            "Blob_Subjectivity",]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_features)
test_X = scaler.fit_transform(test_features)



In [26]:
nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]


#### Vader Sentiment Analysis

In [27]:
vader = SentimentIntensityAnalyzer()

def vader_sentiment_analysis(data):
    data['Vader_Scores'] = data['Text_Clean'].apply(lambda x: vader.polarity_scores(x))
    data['Vader_Negative'] = data['Vader_Scores'].apply(lambda x: x['neg'])
    data['Vader_Neutral'] = data['Vader_Scores'].apply(lambda x: x['neu'])
    data['Vader_Positive'] = data['Vader_Scores'].apply(lambda x: x['pos'])
    data['Vader_Compound'] = data['Vader_Scores'].apply(lambda x: x['compound'])
    return data

In [28]:
train = vader_sentiment_analysis(train)
train.head()

Unnamed: 0,index,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,Blob_Subjectivity,Vader_Scores,Vader_Negative,Vader_Neutral,Vader_Positive,Vader_Compound
0,24561,3,Third Journalist Killed in 3 Months in Turkey ...,third journalist killed 3 month turkey suspect...,"[third, journalist, killed, 3, month, turkey, ...",6024,951,235,22,-0.031443,0.326302,"{'neg': 0.277, 'neu': 0.636, 'pos': 0.087, 'co...",0.277,0.636,0.087,-0.9994
1,19348,2,Welfare Leech With 12 Kids Collects More In Be...,welfare leech 12 kid collect benefit make enti...,"[welfare, leech, 12, kid, collect, benefit, ma...",2408,434,66,10,0.023333,0.408631,"{'neg': 0.153, 'neu': 0.616, 'pos': 0.231, 'co...",0.153,0.616,0.231,0.9573
2,35758,3,"Study Involving 18,000 People Confirms Acupunc...",study involving 18 000 people confirms acupunc...,"[study, involving, 18, 000, people, confirms, ...",2706,398,63,1,0.133428,0.319934,"{'neg': 0.162, 'neu': 0.725, 'pos': 0.113, 'co...",0.162,0.725,0.113,-0.9509
3,6252,1,"When Enron founder Kenneth Lay died suddenly, ...",enron founder kenneth lay died suddenly le two...,"[enron, founder, kenneth, lay, died, suddenly,...",1956,341,46,1,0.176077,0.563265,"{'neg': 0.097, 'neu': 0.63, 'pos': 0.274, 'com...",0.097,0.63,0.274,0.9903
4,9815,1,Determined to create the definitive visual doc...,determined create definitive visual document p...,"[determined, create, definitive, visual, docum...",1064,163,23,2,0.202329,0.615476,"{'neg': 0.116, 'neu': 0.57, 'pos': 0.314, 'com...",0.116,0.57,0.314,0.9788


In [29]:
test = vader_sentiment_analysis(test)
test.head()

Unnamed: 0,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,Blob_Subjectivity,Vader_Scores,Vader_Negative,Vader_Neutral,Vader_Positive,Vader_Compound
0,1,When so many actors seem content to churn out ...,many actor seem content churn performance quic...,"[many, actor, seem, content, churn, performanc...",1356,251,31,4,0.124375,0.509644,"{'neg': 0.083, 'neu': 0.728, 'pos': 0.189, 'co...",0.083,0.728,0.189,0.9131
1,1,In what football insiders are calling an unex...,football insider calling unexpectedly severe p...,"[football, insider, calling, unexpectedly, sev...",1173,202,40,2,0.130303,0.543603,"{'neg': 0.256, 'neu': 0.665, 'pos': 0.079, 'co...",0.256,0.665,0.079,-0.9751
2,1,In a freak accident following Game 3 of the N....,freak accident following game 3 n b final clev...,"[freak, accident, following, game, 3, n, b, fi...",979,167,27,1,-0.0625,0.426667,"{'neg': 0.152, 'neu': 0.684, 'pos': 0.164, 'co...",0.152,0.684,0.164,0.128
3,1,North Koreas official news agency announced to...,north korea official news agency announced tod...,"[north, korea, official, news, agency, announc...",814,134,28,2,0.003409,0.375054,"{'neg': 0.149, 'neu': 0.688, 'pos': 0.164, 'co...",0.149,0.688,0.164,0.128
4,1,The former Alaska Governor Sarah Palin would b...,former alaska governor sarah palin would bring...,"[former, alaska, governor, sarah, palin, would...",1120,177,36,4,0.067614,0.33487,"{'neg': 0.082, 'neu': 0.782, 'pos': 0.136, 'co...",0.082,0.782,0.136,0.7579


In [30]:
features = ["Vader_Negative", 
            "Vader_Neutral",
            "Vader_Positive",
            "Vader_Compound"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_features)
test_X = scaler.fit_transform(test_features)



In [31]:
nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[474  24 243   9]
 [117  43 582   8]
 [ 19   0 728   3]
 [141   0 181 428]]


#### Combining Both

In [32]:
features = ["Blob_Polarity",
            "Blob_Subjectivity",
            "Vader_Negative", 
            "Vader_Neutral",
            "Vader_Positive",
            "Vader_Compound"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_features)
test_X = scaler.fit_transform(test_features)



In [33]:
nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.64      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[477  24 241   8]
 [119  43 580   8]
 [ 19   0 728   3]
 [144   0 179 427]]


### Context Incongruity

In [34]:
def generate_N_gram(tokenized,ngram=1):
    temp = zip(*[tokenized[i:] for i in range(0,ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans

def get_N_gram_polarities(n_gram):
    return list(map(lambda x: vader.polarity_scores(x)["compound"], n_gram))
    
def count_context_incongruities(tokenized, N):
    n_grams = generate_N_gram(tokenized, ngram=N)
    n_gram_polarities = get_N_gram_polarities(n_grams)
    
    count = 0
    for i in range(len(n_gram_polarities) - 1):
        if n_gram_polarities[i] * n_gram_polarities[i+1] < 0:
            count += 1
    return count

In [35]:
def get_context_incongruities(data, N):
    data["Context_Incongruity - " + str(N) + "-gram"] = data["Text_Tokenized"].apply(lambda x: count_context_incongruities(x, N))
    return data    

In [36]:
for i in range(1, 6):
    get_context_incongruities(train, i)
    get_context_incongruities(test, i)
    
print(train.head())
print(test.head())

   index  Label                                               Text  \
0  24561      3  Third Journalist Killed in 3 Months in Turkey ...   
1  19348      2  Welfare Leech With 12 Kids Collects More In Be...   
2  35758      3  Study Involving 18,000 People Confirms Acupunc...   
3   6252      1  When Enron founder Kenneth Lay died suddenly, ...   
4   9815      1  Determined to create the definitive visual doc...   

                                          Text_Clean  \
0  third journalist killed 3 month turkey suspect...   
1  welfare leech 12 kid collect benefit make enti...   
2  study involving 18 000 people confirms acupunc...   
3  enron founder kenneth lay died suddenly le two...   
4  determined create definitive visual document p...   

                                      Text_Tokenized  Char_Count  Word_Count  \
0  [third, journalist, killed, 3, month, turkey, ...        6024         951   
1  [welfare, leech, 12, kid, collect, benefit, ma...        2408         434   
2 

In [41]:
features = ["Context_Incongruity - 1-gram", 
            "Context_Incongruity - 2-gram",
            "Context_Incongruity - 3-gram",
            "Context_Incongruity - 4-gram",
            "Context_Incongruity - 5-gram"]

train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_features)
test_X = scaler.fit_transform(test_features)



In [42]:
nb_classifer = MultinomialNB()
nb_classifer.fit(train_X, train_y)

pred_y = nb_classifer.predict(test_X)
accuracy = metrics.accuracy_score(test_y, pred_y)
print("accuracy:   %0.3f" % accuracy)

print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, pred_y))

accuracy:   0.556
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[466  26 248  10]
 [113  45 583   9]
 [ 18   0 729   3]
 [142   0 180 428]]


In [40]:
for feature in features:
    train_features = pd.concat([vectorized_train_X_df, train[feature]], axis="columns")
    test_features = pd.concat([vectorized_test_X_df, test[feature]], axis="columns")

    scaler = MinMaxScaler()
    train_X = scaler.fit_transform(train_features)
    test_X = scaler.fit_transform(test_features)
    
    nb_classifer = MultinomialNB()
    nb_classifer.fit(train_X, train_y)

    pred_y = nb_classifer.predict(test_X)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    
    print(feature)
    print("accuracy:   %0.3f" % accuracy)

    print(metrics.classification_report(test_y, pred_y,
                                            target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_y, pred_y))



Blob_Polarity
accuracy:   0.556
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[466  26 248  10]
 [113  44 584   9]
 [ 18   0 729   3]
 [142   0 180 428]]




Blob_Subjectivity
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[468  26 246  10]
 [113  45 583   9]
 [ 18   0 729   3]
 [142   0 180 428]]




Vader_Negative
accuracy:   0.555
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.58       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[465  26 249  10]
 [112  45 584   9]
 [ 18   0 729   3]
 [142   0 181 427]]




Vader_Neutral
accuracy:   0.556
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[467  26 247  10]
 [114  44 583   9]
 [ 18   0 729   3]
 [142   0 179 429]]




Vader_Positive
accuracy:   0.556
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.62      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[465  26 249  10]
 [112  45 584   9]
 [ 18   0 729   3]
 [142   0 180 428]]




Vader_Compound
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.63      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.72       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [117  43 582   8]
 [ 19   0 728   3]
 [140   0 181 429]]


### Topic Modeling and Lexicons

#### Lexical Categories Analysis using Empath

An example of what the code below is executing:

In [45]:
lexicon = Empath()
categories = [
    "sarcastic",
    "ironic",
    "contradict",
    "mock",
    "jest",
    "malicious",
    "vinidctive",
    "government",
    "politics",
    "society",
    "money",
    "culture",
    "convince",
    "discredit",
    "fact",
    "honest",
    "trusted",
]

for cat in categories:
    lexicon.create_category(cat, [cat], model="nytimes")

["sarcastic", "petulant", "humorless", "boastful", "glib", "sanctimonious", "flippant", "whiny", "mischievous", "snide", "impetuous", "pompous", "shrill", "inarticulate", "manipulative", "childish", "imperious", "condescending", "preachy", "pushy", "excitable", "argumentative", "cantankerous", "overbearing", "haughty", "spiteful", "egotistical", "gruff", "boorish", "diffident", "meek", "impulsive", "acerbic", "brusque", "affectionate", "caustic", "flirtatious", "bombastic", "accusatory", "obnoxious", "laconic", "sardonic", "wry", "curt", "sly", "overwrought", "devious", "endearingly", "goofy", "introspective", "moody", "easygoing", "ingenuous", "pedantic", "morose", "brash", "mawkish", "humorous", "empathetic", "surly", "feckless", "dutiful", "chatty", "bemused", "droll", "irascible", "childlike", "deadpan", "witless", "judgmental", "unlikable", "callow", "temperamental", "jocular", "incoherent", "folksy", "neurotic", "endearing", "comical", "verbose", "sneaky", "abrasive", "narcissist

["society", "American_society", "culture", "societies", "larger_society", "Japanese_society", "Western_culture", "free_society", "community", "American_life", "modern_society", "work_place", "institution", "American_family", "ideals", "democratic_society", "social_order", "educational_system", "larger_community", "underclass", "modern_world", "citizenry", "American_culture", "civil_society", "family_structure", "humankind", "whole_society", "mainstream_society", "religion", "conformity", "own_society", "mankind", "pluralism", "legal_system", "profession", "beliefs", "humanity", "orthodoxy", "prejudices", "Western_civilization", "ethos", "common_good", "the_Catholic_Church", "political_system", "exploitation", "multiculturalism", "social_contract", "assimilation", "tenets", "dogma", "legal_profession", "traditional_values", "injustices", "value_system", "entire_community", "values", "civilization", "family_life", "individualism", "greater_good", "social_change", "American_democracy", "g

In [46]:
text = "World Champion skier and Olympic gold medal favorite Lindsey Vonn admitted yesterday that the secret to her success is her 'really, really good ski poles.' 'There's no way I would have won 31 World Cup races without these great, great ski poles,' Vonn told reporters during a press conference, noting that without the top-of-the-line ski poles, it would be difficult for her to maintain her balance or change directions during competition. 'I use them a lot because I'm always skiing, and they haven't broken in half or anything. I think they're really expensive too, like over 50 bucks.' Vonn, who said she was unsure if her ski poles were made of graphite or carbon fiber, urged reporters to trust her when she said that 'whatever they're made of is definitely the best.' "
emotion_info = lexicon.analyze(text, categories=["jest"])
print("Emotion Info: \n\n", emotion_info)

dict_vectorizer = DictVectorizer()
vec_emotion_info = dict_vectorizer.fit_transform(emotion_info).toarray()[0][0]
print("\nVectorized: \n\n", vec_emotion_info)
print(type(vec_emotion_info))

dict_vectorizer.get_feature_names_out()

Emotion Info: 

 {'jest': 1.0}

Vectorized: 

 1.0
<class 'numpy.float64'>


array(['jest'], dtype=object)

In [47]:
def get_lexical_categories(data):
    lexical_categories = []
    dict_vectorizer = DictVectorizer()
    lexicon = Empath()
    for cat in categories:
        data["Lexicon - " + cat] = data["Text_Clean"].apply(lambda x: dict_vectorizer
                                                        .fit_transform(lexicon.analyze(x, categories=[cat]))
                                                        .toarray()[0][0])
#     vec_emotion_info = (emotion_info).toarray().flatten()
# #     data["Lexicon"] = vec_emotion_info
#     lexical_categories.append(vec_emotion_info)
    return data

In [48]:
train = get_lexical_categories(train) 
train.head()
# train_lexical_categories = get_lexical_categories(train) 
# train_lexical_categories

Unnamed: 0,index,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,...,Lexicon - government,Lexicon - politics,Lexicon - society,Lexicon - money,Lexicon - culture,Lexicon - convince,Lexicon - discredit,Lexicon - fact,Lexicon - honest,Lexicon - trusted
0,24561,3,Third Journalist Killed in 3 Months in Turkey ...,third journalist killed 3 month turkey suspect...,"[third, journalist, killed, 3, month, turkey, ...",6024,951,235,22,-0.031443,...,22.0,10.0,2.0,0.0,1.0,0.0,2.0,5.0,3.0,1.0
1,19348,2,Welfare Leech With 12 Kids Collects More In Be...,welfare leech 12 kid collect benefit make enti...,"[welfare, leech, 12, kid, collect, benefit, ma...",2408,434,66,10,0.023333,...,11.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0
2,35758,3,"Study Involving 18,000 People Confirms Acupunc...",study involving 18 000 people confirms acupunc...,"[study, involving, 18, 000, people, confirms, ...",2706,398,63,1,0.133428,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,4.0,0.0
3,6252,1,"When Enron founder Kenneth Lay died suddenly, ...",enron founder kenneth lay died suddenly le two...,"[enron, founder, kenneth, lay, died, suddenly,...",1956,341,46,1,0.176077,...,1.0,1.0,0.0,12.0,1.0,0.0,0.0,2.0,1.0,0.0
4,9815,1,Determined to create the definitive visual doc...,determined create definitive visual document p...,"[determined, create, definitive, visual, docum...",1064,163,23,2,0.202329,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [49]:
test = get_lexical_categories(test) 
test.head()
# test_lexical_categories = get_lexical_categories(test) 
# test_lexical_categories

Unnamed: 0,Label,Text,Text_Clean,Text_Tokenized,Char_Count,Word_Count,Capital_Chars_Count,Capital_Words_Count,Blob_Polarity,Blob_Subjectivity,...,Lexicon - government,Lexicon - politics,Lexicon - society,Lexicon - money,Lexicon - culture,Lexicon - convince,Lexicon - discredit,Lexicon - fact,Lexicon - honest,Lexicon - trusted
0,1,When so many actors seem content to churn out ...,many actor seem content churn performance quic...,"[many, actor, seem, content, churn, performanc...",1356,251,31,4,0.124375,0.509644,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0
1,1,In what football insiders are calling an unex...,football insider calling unexpectedly severe p...,"[football, insider, calling, unexpectedly, sev...",1173,202,40,2,0.130303,0.543603,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,In a freak accident following Game 3 of the N....,freak accident following game 3 n b final clev...,"[freak, accident, following, game, 3, n, b, fi...",979,167,27,1,-0.0625,0.426667,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,North Koreas official news agency announced to...,north korea official news agency announced tod...,"[north, korea, official, news, agency, announc...",814,134,28,2,0.003409,0.375054,...,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,The former Alaska Governor Sarah Palin would b...,former alaska governor sarah palin would bring...,"[former, alaska, governor, sarah, palin, would...",1120,177,36,4,0.067614,0.33487,...,2.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [50]:
for cat in categories:
    features = ["Blob_Polarity", 
            "Blob_Subjectivity",]
    
    features.append("Lexicon - " + cat)

    train_features = pd.concat([vectorized_train_X_df, train[features]], axis="columns")
    test_features = pd.concat([vectorized_test_X_df, test[features]], axis="columns")

    scaler = MinMaxScaler()
    train_X = scaler.fit_transform(train_features)
    test_X = scaler.fit_transform(test_features)
    
    print(features)
    
    nb_classifer = MultinomialNB()
    nb_classifer.fit(train_X, train_y)

    pred_y = nb_classifer.predict(test_X)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print("accuracy:   %0.3f" % accuracy)

    print(metrics.classification_report(test_y, pred_y,
                                                target_names=["1 - Satire", "2 - Hoax", "3 - Propaganda", "4 - Reliable News"]))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_y, pred_y))



['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - sarcastic']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 181 427]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - ironic']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 181 427]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - contradict']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - mock']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - jest']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - malicious']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - vinidctive']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - government']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 182 426]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - politics']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - society']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 181 427]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - money']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - culture']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - convince']
accuracy:   0.558
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[472  25 244   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - discredit']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.95      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[470  25 246   9]
 [114  44 583   9]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - fact']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 181 427]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - honest']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]




['Blob_Polarity', 'Blob_Subjectivity', 'Lexicon - trusted']
accuracy:   0.557
                   precision    recall  f1-score   support

       1 - Satire       0.63      0.63      0.63       750
         2 - Hoax       0.64      0.06      0.11       750
   3 - Propaganda       0.42      0.97      0.59       750
4 - Reliable News       0.96      0.57      0.71       750

         accuracy                           0.56      3000
        macro avg       0.66      0.56      0.51      3000
     weighted avg       0.66      0.56      0.51      3000

confusion matrix:
[[471  25 245   9]
 [115  44 583   8]
 [ 18   0 729   3]
 [142   0 180 428]]
