In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import datetime
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv('./raw_data/xtrain.csv', names=['Verdict', "Text"])
X = train['Text']
y= train['Verdict']

x_train = X
y_train = y

In [2]:
import pickle
# You may load the pickle files from a google drive such that you do not need to run the feature engineering from scratch. 
# Because of the big size of pickle files, we are only demonstrating it using xtrain.csv and balancedtest.csv instead of full train.
# You may then put the respective pickle files into the correct directory.



train_df = pd.read_pickle("./pickles/compiled.pkl")
test_df = pd.read_pickle("./pickles/compiled_test.pkl")
train_df = train_df.loc[:,~train_df.columns.duplicated()].copy()

In [3]:
def get_feature_from_df(df, columns):   
    features = []
    for col in columns:
        features.append(list(df[col]))
        
    return features

all_columns = ['sentiment_compound_score',
       'number_of_words', 'number_of_characters', 'number_of_sentence',
        'DATE',
       'LANGUAGE', 'GPE', 'WORK_OF_ART', 'NORP', 'ORDINAL', 'LOC', 'CARDINAL',
       'FAC', 'PERCENT', 'LAW', 'QUANTITY', 'EVENT', 'PERSON', 'PRODUCT',
       'MONEY', 'ORG', 'TIME', 'total_entities', 'readability']

In [4]:
def train_and_test(df_train, df_test, feature_columns_interested, classes_interested, class_weights=None):
    df_train = df_train[train_df['Verdict'].isin(classes_interested)]
    df_test = df_test[test_df['Verdict'].isin(classes_interested)]

    features_train = get_feature_from_df(df_train, feature_columns_interested)
    features_train = list(map(list, zip(*features_train)))

    scaler = MinMaxScaler()
    features_train = scaler.fit_transform(features_train)

    X_train = features_train
    y_train = df_train['Verdict']

    if class_weights is not None:
        model = LogisticRegression(class_weight=class_weights)
    else:
        model = LogisticRegression()
        
    model.fit(X_train, y_train)

    # test out
    features_test = get_feature_from_df(df_test, feature_columns_interested)
    features_test = list(map(list, zip(*features_test)))

    features_test = scaler.transform(features_test)
    X_test = features_test

    y_test = df_test['Verdict']
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

## 1.1 Investigating Length Related Features

### Distribution 


In [26]:
columns =  [ 'number_of_words', 'number_of_characters', 'number_of_sentence']
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())

number_of_words
          count        mean          std  min     25%    50%      75%      max
Verdict                                                                       
1        3981.0  293.893243   249.185916  2.0  123.00  166.0   532.00   1255.0
2        4014.0  194.705531    78.497109  2.0  147.00  185.0   233.00   1152.0
3        4008.0  907.681138  1116.982475  2.0  250.75  682.0  1201.25  20894.0
4        3997.0  444.215412   360.012765  3.0  194.00  347.0   616.00   4545.0
number_of_characters
          count         mean          std   min      25%     50%     75%       max
Verdict                                                                           
1        3981.0  1769.675961  1506.681982  16.0   737.00   994.0  3193.0    7599.0
2        4014.0  1144.731440   458.957049  10.0   862.00  1097.0  1371.0    7418.0
3        4008.0  5510.928393  6612.905755  10.0  1524.25  4203.5  7363.5  118990.0
4        3997.0  2700.588692  2147.276997  14.0  1210.00  2124.0  3733.0  

We can very clearly see that, number of words, and number of characters and number of sentence has is quite distinct for class 3(propaganda) and 4(reliable news) in comparison to the rest. So ideally, if we use these features, we should be able to perform better for than random guess

In [27]:
columns = ['number_of_words', 'number_of_characters', 'number_of_sentence']
features_train = get_feature_from_df(train_df, columns)

features_train = list(map(list, zip(*features_train)))

In [28]:
# import minmax
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)

X_train = features_train
y_train = train_df['Verdict']

model = LogisticRegression()
model.fit(X_train, y_train)

In [29]:
# test out
features_test = get_feature_from_df(test_df, columns)
features_test = list(map(list, zip(*features_test)))

features_test = scaler.transform(features_test)
X_test = features_test
y_test = test_df['Verdict']

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.24      0.07      0.11       750
           2       0.19      0.27      0.22       750
           3       0.50      0.68      0.58       750
           4       0.28      0.24      0.26       750

    accuracy                           0.32      3000
   macro avg       0.30      0.32      0.29      3000
weighted avg       0.30      0.32      0.29      3000



We can see clearly that with length related features, class 3 and 4 performs better than random guessing. Now we need to move on and find features that can better differentiate 1 and 2

## 1.2 Investigating Sentiment Analysis

### Distribution 


In [11]:
columns =  ['sentiment_compound_score']
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())

sentiment_compound_score
          count      mean       std     min       25%      50%       75%     max
Verdict                                                                         
1        3981.0  0.244429  0.775245 -0.9994 -0.655200  0.63110  0.945900  0.9998
2        4014.0 -0.177907  0.806081 -0.9989 -0.939500 -0.59455  0.777525  0.9998
3        4008.0 -0.078397  0.852430 -1.0000 -0.985125 -0.12800  0.952150  1.0000
4        3997.0  0.235083  0.819811 -0.9998 -0.768400  0.70960  0.976900  0.9999


From negatve to positive, we have hoax, propaganda, reliable news and satire. Analyzing it more qualitatively, this makes sense as hoaxes are usualy used to spread fear - which is associated with negativity, and satire uses humour, which may contribute to its higher score.

In [12]:
columns =  ['sentiment_compound_score']
features_train = get_feature_from_df(train_df, columns)
features_train = list(map(list, zip(*features_train)))

In [13]:
# import minmax
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)

X_train = features_train
y_train = train_df['Verdict']

model = LogisticRegression()
model.fit(X_train, y_train)

In [14]:
# test out
features_test = get_feature_from_df(test_df, columns)
features_test = list(map(list, zip(*features_test)))

features_test = scaler.transform(features_test)
X_test = features_test
y_test = test_df['Verdict']

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.27      0.53      0.36       750
           2       0.31      0.56      0.40       750
           3       0.08      0.01      0.01       750
           4       0.23      0.05      0.08       750

    accuracy                           0.28      3000
   macro avg       0.22      0.28      0.21      3000
weighted avg       0.22      0.28      0.21      3000



From the results, we can see that sentiment analysis is great at differentiating hoaxes(2) and satire(1), which is at the extreme ends of the sentiment analysis scale. We can verify by just training and testing on both.

In [35]:
train_and_test(train_df, test_df, ['sentiment_compound_score'], [1,2])
train_and_test(train_df, test_df, ['sentiment_compound_score'], [3,4])

train_and_test(train_df, test_df, ['sentiment_compound_score'], [1,3])
train_and_test(train_df, test_df, ['sentiment_compound_score'], [2,4])

train_and_test(train_df, test_df, ['sentiment_compound_score'], [1,4])
train_and_test(train_df, test_df, ['sentiment_compound_score'], [2,3])


              precision    recall  f1-score   support

           1       0.60      0.62      0.61       750
           2       0.60      0.58      0.59       750

    accuracy                           0.60      1500
   macro avg       0.60      0.60      0.60      1500
weighted avg       0.60      0.60      0.60      1500

              precision    recall  f1-score   support

           3       0.52      0.47      0.49       750
           4       0.51      0.56      0.53       750

    accuracy                           0.51      1500
   macro avg       0.51      0.51      0.51      1500
weighted avg       0.51      0.51      0.51      1500

              precision    recall  f1-score   support

           1       0.53      0.61      0.57       750
           3       0.54      0.47      0.50       750

    accuracy                           0.54      1500
   macro avg       0.54      0.54      0.54      1500
weighted avg       0.54      0.54      0.54      1500

              preci

## 1.3 Investigating Readability Score

### Distribution 


In [60]:
columns =  ['readability']
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())

readability
          count       mean        std     min    25%     50%    75%     max
Verdict                                                                    
1        3981.0  55.548111  14.751342  -84.84  47.19  56.390  65.15  102.10
2        4014.0  64.723787  11.430079 -227.85  58.72  64.810  72.05   96.18
3        4008.0  53.064671  17.277307  -87.11  45.29  54.135  62.48  120.21
4        3997.0  55.977611  16.221884 -426.90  47.52  56.390  64.34  111.07


We can see that class 2 (hoax) has the highest readability score - which is so that it caters a wider target audience, which a hoax aims to be.
Class 3 (propaganda) on the other hand has the lowest readability score - this might be because it often involves a lot of fake technical terms etc and jargon to make it seem more plausible.

In [83]:
columns =  ['readability']
features_train = get_feature_from_df(train_df, columns)
features_train = list(map(list, zip(*features_train)))

In [84]:
# import minmax
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)

X_train = features_train
y_train = train_df['Verdict']

model = LogisticRegression()
model.fit(X_train, y_train)

In [85]:
# test out
features_test = get_feature_from_df(test_df, columns)
features_test = list(map(list, zip(*features_test)))

features_test = scaler.transform(features_test)
X_test = features_test
y_test = test_df['Verdict']

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.32      0.63      0.43       750
           3       0.48      0.73      0.58       750
           4       0.29      0.16      0.20       750

    accuracy                           0.38      3000
   macro avg       0.27      0.38      0.30      3000
weighted avg       0.27      0.38      0.30      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


We can say that readability score helps in classifying class 2 and 3.

In [86]:
# distribution of test
print(test_df.groupby('Verdict')['readability'].describe())

         count       mean        std    min      25%    50%     75%     max
Verdict                                                                    
1        750.0  61.983013  10.174617  19.24  55.7800  61.92  68.810   89.79
2        750.0  63.642413  11.205983   3.40  56.6075  62.83  71.015  102.61
3        750.0  49.603013  10.428659  16.59  42.6200  49.65  56.280   74.59
4        750.0  60.284520  10.096960  33.28  53.3100  59.96  66.940   98.51


We can see that in the test dataset, class 2 and 3 still is the highest and lowest respectively, however class 1 and class 4 has increased quite a lot. Which does not really follow the distribution of the training dataset, hence this feature is only good enough to differentiate 2 and 3.

## 1.4 Investigating Custom Vocabulary

In [112]:
from collections import Counter
# Initialize a dictionary to hold our class-specific counters
class_counters = {class_label: Counter() for class_label in train_df['Verdict'].unique()}

# Populate the counters with token counts for each class
for index, row in train_df.iterrows():
    class_counters[row['Verdict']].update(row['tokens'])

In [123]:
# Extract top 20 tokens for each class
top_tokens_per_class = {class_label: counter.most_common(100) for class_label, counter in class_counters.items()}
import nltk
stop_words_and_punctuation = set(nltk.corpus.stopwords.words('english'))
stop_words_and_punctuation.update(string.punctuation)

# remove stop words
top_tokens_per_class = {class_label: [(token, count) for token, count in top_tokens if token.lower() not in stop_words_and_punctuation] for class_label, top_tokens in top_tokens_per_class.items()}

In [124]:
# Display the top tokens for each class
for class_label, top_tokens in top_tokens_per_class.items():
    print(f"Class {class_label}:")
    for token, count in top_tokens:
        print(f"{token}: {count}")
    print("\n")

Class 1:
's: 10050
said: 9937
n't: 3200
time: 3163
one: 3054
would: 2945
like: 2544
year: 2022
could: 1902
even: 1755
get: 1701
new: 1522
added: 1514
Monday: 1454
people: 1444
know: 1413
old: 1400
nt: 1397
really: 1393
back: 1376
first: 1354


Class 3:
nt: 10470
would: 8852
people: 8684
one: 7466
government: 7409
like: 5927
US: 5573
also: 5242
time: 4987
even: 4788
world: 4523
could: 4278
said: 4232
many: 4155
U.S.: 4077
years: 3984
us: 3920


Class 4:
's: 19606
said: 15814
--: 5511
would: 4377
year: 3955
percent: 3699
one: 3499
n't: 3245
also: 3238
two: 3117
Taiwan: 2987
people: 2898
government: 2724
first: 2563
could: 2325
new: 2321
time: 2306
years: 2255
last: 2231
China: 2208
U.S.: 1992
like: 1903
million: 1878


Class 2:
Obama: 4110
think: 3601
nt: 2817
Trump: 2714
one: 1706
According: 1682
would: 1590
people: 1575
reports: 1495
time: 1398
President: 1304
Clinton: 1289
told: 1284
also: 1251
Hillary: 1235
recent: 1215
like: 1201
country: 1192
said: 1191
American: 1184
us: 1091
vide

In [125]:
# Get the top tokens and put in a list
top_tokens = []
for class_label, top_tokens_class in top_tokens_per_class.items():
    top_tokens.extend([token for token, count in top_tokens_class])

# Get the unique tokens
top_tokens = set(top_tokens)

### Wikitionary - we get lists of words that are quite dramatic in nature, and try to see if we can use it to compare satire/hoax etc 


In [131]:
# Load custom vocabulary
wikitionary_dir = "./wiktionarylists"

# list all text files
import os
wikitionary_files = os.listdir(wikitionary_dir)

# read all files
wikitionary_words = {}
for file in wikitionary_files:
    if file == ".DS_Store":
        continue
    category_words = []
    with open(f"{wikitionary_dir}/{file}") as f:
        category_words += f.read().splitlines()

    # lowercase
    category_words = [word.lower() for word in category_words]
    wikitionary_words[file] = category_words

# print
print(wikitionary_words)

# get all words
all_words = set()
for category, words in wikitionary_words.items():
    all_words.update(words)



{'comparative_forms.txt': ['wilder', 'touchier', 'weer', 'less', 'number', 'later', 'liver', 'damper', 'greater', 'hinder', 'archer', 'sooner', 'rasher', 'faster', 'worse', 'madder', 'further', 'lither', 'mazier', 'terser', 'higher', 'tamer', 'lighter', 'fresher', 'stranger', 'smaller', 'lesser', 'lamer', 'balder', 'fuller', 'longer', 'abler', 'cheaper', 'bummer', 'flatter', 'closer', 'taller', 'kinder', 'feller', 'bolder', 'liever', 'thinner', 'braver', 'duffer', 'wider', 'cooler', 'younger', 'farther', 'shorter', 'cleaner', 'planer', 'broker', 'camper', 'richer', 'nobler', 'happier', 'quicker', 'lower', 'absenter', 'purer', 'prettier', 'crapper', 'older', 'fancier', 'saucier', 'earlier', 'dafter', 'mangier', 'juster', 'newer', 'dishier', 'simpler', 'shitter', 'profaner', 'stupider', 'louder', 'clearer', 'dimmer', 'gamer', 'fruitier', 'holier', 'badder', 'plumper', 'achier', 'commoner', 'vaster', 'goodlier', 'shyer', 'buffer', 'fader', 'tarrier', 'hotter', 'remoter', 'broader', 'funne

In [149]:
# from class counters, let's see how many words are in the wikitionary
final = {}
for i in [1,2,3,4]:
    counter = class_counters[i]
    total_words = sum(counter.values())
    
    words_in_wikitionary_separated_by_type = {category: 0 for category in wikitionary_words.keys()}
    for word, count in counter.items():
        for category, words in wikitionary_words.items():
            if word.lower() in words:
                words_in_wikitionary_separated_by_type[category] += count
    final[i] = words_in_wikitionary_separated_by_type
    
    
   

In [152]:
# normalize it
final_normalized = {}
for i in [1,2,3,4]:
    counter = class_counters[i]
    total_words = sum(counter.values())
    
    final_normalized[i] = {category: 0 for category in wikitionary_words.keys()}
    for category, count in final[i].items():
        final_normalized[i][category] = (count / total_words * 100)


In [153]:
final_normalized

{1: {'comparative_forms.txt': 0.27949180713781274,
  'modal_adverbs.txt': 0.4680310736679911,
  'act_adverbs.txt': 0.012055670088384469,
  'manner_adverbs.txt': 0.32742914618748353,
  'superlative_forms.txt': 0.2517423653367384},
 2: {'comparative_forms.txt': 0.20980279216509917,
  'modal_adverbs.txt': 0.3313390374646227,
  'act_adverbs.txt': 0.00984511484269883,
  'manner_adverbs.txt': 0.2774738114287072,
  'superlative_forms.txt': 0.17404352446058388},
 3: {'comparative_forms.txt': 0.3062648397237359,
  'modal_adverbs.txt': 0.3554044377357323,
  'act_adverbs.txt': 0.012925115504630331,
  'manner_adverbs.txt': 0.32175081923489113,
  'superlative_forms.txt': 0.24139767125657247},
 4: {'comparative_forms.txt': 0.33562906291274935,
  'modal_adverbs.txt': 0.16437021952226438,
  'act_adverbs.txt': 0.006171058881947643,
  'manner_adverbs.txt': 0.17967827256275465,
  'superlative_forms.txt': 0.25837123272402496}}

We can see that for satire - adverbs appear the most times - modal_adverbs, manner_adverbs. 

In [157]:
# Analyze which of the wikitionary words are most common in each class
# from class counters, let's see how many words are in the wikitionary
from collections import defaultdict
final_words = {}
for i in [1,2,3,4]:
    counter = class_counters[i]
    total_words = sum(counter.values())
    
    words_in_wikitionary_separated_by_type = {category: {} for category in wikitionary_words.keys()}
    for word, count in counter.items():
        for category, words in wikitionary_words.items():
            if word.lower() in words:
                if word not in words_in_wikitionary_separated_by_type[category]:
                    words_in_wikitionary_separated_by_type[category][word] = 0
                words_in_wikitionary_separated_by_type[category][word] += count
    final_words[i] = words_in_wikitionary_separated_by_type

In [158]:
final_words

{1: {'comparative_forms.txt': {'faster': 39,
   'crapper': 1,
   'number': 441,
   'better': 475,
   'lower': 98,
   'commoner': 1,
   'deeper': 33,
   'harder': 33,
   'Further': 26,
   'darker': 6,
   'greater': 108,
   'further': 208,
   'longer': 237,
   'easier': 44,
   'shitter': 3,
   'Earlier': 16,
   'Lower': 11,
   'littler': 1,
   'younger': 75,
   'higher': 92,
   'closer': 71,
   'older': 83,
   'cooler': 9,
   'later': 417,
   'worse': 80,
   'quicker': 8,
   'dryer': 10,
   'louder': 10,
   'meaner': 4,
   'sharper': 3,
   'bigger': 39,
   'Bigger': 3,
   'hotter': 5,
   'Fuller': 21,
   'stranger': 15,
   'elder': 14,
   'trickier': 2,
   'less': 304,
   'earlier': 140,
   'cleaner': 10,
   'broader': 13,
   'nicer': 8,
   'Sharper': 5,
   'classier': 1,
   'larger': 55,
   'heavier': 6,
   'sooner': 35,
   'Better': 12,
   'safer': 11,
   'sticker': 12,
   'happier': 15,
   'thicker': 5,
   'Safer': 1,
   'Broker': 1,
   'farther': 9,
   'leaner': 3,
   'healthier': 7,

In [159]:
# top 20 in total of dramatic final_words
top_words = {}
for i in [1,2,3,4]:
    top_words[i] = {}
    all_words_with_count_and_class = []
    for category, words in final_words[i].items():
        for word, count in words.items():
            all_words_with_count_and_class.append((word, count, category))
    
    all_words_with_count_and_class = sorted(all_words_with_count_and_class, key=lambda x: x[1], reverse=True)
    top_words[i] = all_words_with_count_and_class[:20]

In [160]:
top_words

{1: [('really', 1393, 'modal_adverbs.txt'),
  ('most', 1190, 'superlative_forms.txt'),
  ('reportedly', 1115, 'modal_adverbs.txt'),
  ('well', 868, 'manner_adverbs.txt'),
  ('best', 587, 'superlative_forms.txt'),
  ('better', 475, 'comparative_forms.txt'),
  ('sure', 471, 'modal_adverbs.txt'),
  ('actually', 462, 'modal_adverbs.txt'),
  ('number', 441, 'comparative_forms.txt'),
  ('least', 437, 'superlative_forms.txt'),
  ('hard', 432, 'manner_adverbs.txt'),
  ('later', 417, 'comparative_forms.txt'),
  ('completely', 383, 'manner_adverbs.txt'),
  ('finally', 344, 'manner_adverbs.txt'),
  ('probably', 329, 'modal_adverbs.txt'),
  ('less', 304, 'comparative_forms.txt'),
  ('simply', 281, 'manner_adverbs.txt'),
  ('maybe', 266, 'modal_adverbs.txt'),
  ('longer', 237, 'comparative_forms.txt'),
  ('likely', 237, 'modal_adverbs.txt')],
 2: [('most', 546, 'superlative_forms.txt'),
  ('well', 392, 'manner_adverbs.txt'),
  ('reportedly', 344, 'modal_adverbs.txt'),
  ('really', 336, 'modal_adver

In [201]:
custom_vocab_vectorizer = TfidfVectorizer(vocabulary=all_words.union(top_tokens))
# fit and transform
X_train = custom_vocab_vectorizer.fit_transform(x_train)
y_train = train_df['Verdict']
# we want to combine class 1,4 and  2,3 together
# y_train = y_train.apply(lambda x: 0 if x in [1,4] else 1)

model = LogisticRegression()
model.fit(X_train, y_train)

# test out
X_test = custom_vocab_vectorizer.transform(test_df['Text'])
y_test = test_df['Verdict']
# y_test = y_test.apply(lambda x: 0 if x in [1,4] else 1)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           1       0.58      0.49      0.53       750
           2       0.56      0.20      0.29       750
           3       0.43      0.65      0.52       750
           4       0.54      0.70      0.61       750

    accuracy                           0.51      3000
   macro avg       0.53      0.51      0.49      3000
weighted avg       0.53      0.51      0.49      3000



## 1.5 Investigating Named Entity Recognition

### Distribution 


In [166]:
columns = ['DATE',
       'LANGUAGE', 'GPE', 'WORK_OF_ART', 'NORP', 'ORDINAL', 'LOC', 'CARDINAL',
       'FAC', 'PERCENT', 'LAW', 'QUANTITY', 'EVENT', 'PERSON', 'PRODUCT',
       'MONEY', 'ORG', 'TIME', 'total_entities']

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

temp = []

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())
    # store the highest mean among the classes
    temp.append((column, train_df.groupby('Verdict')[column].mean().max()))

print(temp)
# sort
temp = sorted(temp, key=lambda x: x[1], reverse=True)
print(temp)

DATE
          count      mean       std  min  25%  50%   75%    max
Verdict                                                        
1        3981.0  3.933936  3.752909  0.0  1.0  3.0   5.0   42.0
2        4014.0  1.570005  1.719404  0.0  0.0  1.0   2.0   21.0
3        4008.0  7.257485  9.968259  0.0  1.0  4.0  10.0  146.0
4        3997.0  7.950463  6.692521  0.0  3.0  6.0  11.0   74.0
LANGUAGE
          count      mean       std  min  25%  50%  75%   max
Verdict                                                      
1        3981.0  0.024868  0.198321  0.0  0.0  0.0  0.0   4.0
2        4014.0  0.010463  0.186486  0.0  0.0  0.0  0.0   8.0
3        4008.0  0.039920  0.288544  0.0  0.0  0.0  0.0   7.0
4        3997.0  0.071554  0.453971  0.0  0.0  0.0  0.0  16.0
GPE
          count       mean        std  min  25%  50%   75%    max
Verdict                                                          
1        3981.0   2.059784   3.593537  0.0  0.0  1.0   2.0   69.0
2        4014.0   2.196313  

In [52]:
ner_best = [('total_entities', 64.09955089820359), ('ORG', 16.314121756487026), ('GPE', 11.105289421157684), ('PERSON', 10.505489021956087), ('DATE', 7.950462847135351), ('CARDINAL', 6.17814371257485), ('NORP', 5.164421157684631), ('LOC', 1.3038922155688624), ('ORDINAL', 1.158932135728543), ('MONEY', 1.158932135728543), ('PERCENT', 0.9161871403552665), ('TIME', 0.7749309218789249), ('WORK_OF_ART', 0.5746007984031936), ('LAW', 0.5047405189620758), ('QUANTITY', 0.4711033274956217), ('PRODUCT', 0.4426147704590818), ('FAC', 0.38929196897673257), ('EVENT', 0.37300399201596807), ('LANGUAGE', 0.07155366524893671)]
top_7_ner = [x[0] for x in ner_best[:7]]

Propaganda and real news have the most number of entities. MONEY, ORG, PERSON, DATE being the highest ones

In [168]:
sorted_columns_to_test = [x[0] for x in temp]
print(sorted_columns_to_test)

for column in sorted_columns_to_test:
    print(column)
    train_and_test(train_df, test_df, [column], [1,2,3,4])

['total_entities', 'ORG', 'GPE', 'PERSON', 'DATE', 'CARDINAL', 'NORP', 'LOC', 'ORDINAL', 'MONEY', 'PERCENT', 'TIME', 'WORK_OF_ART', 'LAW', 'QUANTITY', 'PRODUCT', 'FAC', 'EVENT', 'LANGUAGE']
total_entities
              precision    recall  f1-score   support

           1       0.25      0.15      0.19       750
           2       0.23      0.39      0.29       750
           3       0.26      0.25      0.26       750
           4       0.25      0.18      0.21       750

    accuracy                           0.24      3000
   macro avg       0.25      0.24      0.24      3000
weighted avg       0.25      0.24      0.24      3000

ORG
              precision    recall  f1-score   support

           1       0.21      0.05      0.08       750
           2       0.25      0.50      0.33       750
           3       0.38      0.42      0.40       750
           4       0.24      0.16      0.19       750

    accuracy                           0.28      3000
   macro avg       0.27      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           1       0.23      0.15      0.18       750
           2       0.30      0.62      0.41       750
           3       0.32      0.12      0.17       750
           4       0.59      0.53      0.56       750

    accuracy                           0.35      3000
   macro avg       0.36      0.35      0.33      3000
weighted avg       0.36      0.35      0.33      3000

CARDINAL
              precision    recall  f1-score   support

           1       0.19      0.09      0.12       750
           2       0.30      0.61      0.40       750
           3       0.37      0.43      0.40       750
           4       0.26      0.08      0.13       750

    accuracy                           0.30      3000
   macro avg       0.28      0.30      0.26      3000
weighted avg       0.28      0.30      0.26      3000

NORP
              precision    recall  f1-score   support

           1       0.30      0.63      0.41       750
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.27      0.85      0.41       750
           3       0.27      0.24      0.25       750
           4       0.00      0.00      0.00       750

    accuracy                           0.27      3000
   macro avg       0.13      0.27      0.17      3000
weighted avg       0.13      0.27      0.17      3000

PERCENT
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.28      0.90      0.43       750
           3       0.50      0.17      0.25       750
           4       0.41      0.20      0.27       750

    accuracy                           0.32      3000
   macro avg       0.30      0.32      0.24      3000
weighted avg       0.30      0.32      0.24      3000

TIME
              precision    recall  f1-score   support

           1       0.30      0.36      0.33       750
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.23      0.74      0.35       750
           3       0.26      0.19      0.22       750
           4       0.00      0.00      0.00       750

    accuracy                           0.23      3000
   macro avg       0.12      0.23      0.14      3000
weighted avg       0.12      0.23      0.14      3000

FAC
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.25      0.81      0.38       750
           3       0.00      0.00      0.00       750
           4       0.35      0.25      0.29       750

    accuracy                           0.26      3000
   macro avg       0.15      0.26      0.17      3000
weighted avg       0.15      0.26      0.17      3000

EVENT
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [186]:
train_and_test(train_df, test_df, sorted_columns_to_test, [1,2,3,4])

              precision    recall  f1-score   support

           1       0.46      0.37      0.41       750
           2       0.33      0.47      0.39       750
           3       0.47      0.44      0.45       750
           4       0.60      0.49      0.54       750

    accuracy                           0.44      3000
   macro avg       0.46      0.44      0.45      3000
weighted avg       0.46      0.44      0.45      3000



We can see clearly it best classifies class 4 and class 3, which is the real news class as it has the most named entity, and also hoax.

Putting it altogether, we have 
1. length related features are good at classifying 3,4
2. Sentiment analysis is good at classifying 1,2 which is hoax and satire
3. Readability score is good at classifying 2,3 from the rest
4. Named Entity Recognition is good at classifying class 3 and 4

In [194]:
all_columns = ['sentiment_compound_score',
       'number_of_words', 'number_of_characters', 'number_of_sentence',
        'DATE',
       'LANGUAGE', 'GPE', 'WORK_OF_ART', 'NORP', 'ORDINAL', 'LOC', 'CARDINAL',
       'FAC', 'PERCENT', 'LAW', 'QUANTITY', 'EVENT', 'PERSON', 'PRODUCT',
       'MONEY', 'ORG', 'TIME', 'total_entities', 'readability']
train_and_test(train_df, test_df, all_columns, [1,2,3,4])

              precision    recall  f1-score   support

           1       0.50      0.48      0.49       750
           2       0.42      0.47      0.44       750
           3       0.60      0.66      0.63       750
           4       0.68      0.55      0.61       750

    accuracy                           0.54      3000
   macro avg       0.55      0.54      0.54      3000
weighted avg       0.55      0.54      0.54      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [197]:
columns = all_columns
features_train = get_feature_from_df(train_df, columns)
features_train = list(map(list, zip(*features_train)))

scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)

X_train = features_train
y_train = train_df['Verdict']

model = LogisticRegression()
model.fit(X_train, y_train)

features_test = get_feature_from_df(test_df, columns)
features_test = list(map(list, zip(*features_test)))

features_test = scaler.transform(features_test)
X_test = features_test
y_test = test_df['Verdict']

y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [199]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.50      0.48      0.49       750
           2       0.42      0.47      0.44       750
           3       0.60      0.66      0.63       750
           4       0.68      0.55      0.61       750

    accuracy                           0.54      3000
   macro avg       0.55      0.54      0.54      3000
weighted avg       0.55      0.54      0.54      3000



In [198]:
labels = {1: 'Satire', 2: 'Hoax', 3: 'Propaganda', 4: 'Reliable News'}
test_df["predicted"] = y_pred
misclassified = test_df[test_df["Verdict"] != y_pred]

for i in range(1,5):
    misclassified_i =  misclassified[misclassified['Verdict'] == i]
    print(f'Class {i} ({labels[i]}) misclassified: {len(misclassified_i)}')
    for j in range(1,5):
        if i == j:
            continue
        print(f'Mispredicted as class {j} ({labels[j]}): {len(misclassified_i[misclassified_i["predicted"] == j])}')
    
    print('')

Class 1 (Satire) misclassified: 391
Mispredicted as class 2 (Hoax): 350
Mispredicted as class 3 (Propaganda): 6
Mispredicted as class 4 (Reliable News): 35

Class 2 (Hoax) misclassified: 395
Mispredicted as class 1 (Satire): 123
Mispredicted as class 3 (Propaganda): 194
Mispredicted as class 4 (Reliable News): 78

Class 3 (Propaganda) misclassified: 256
Mispredicted as class 1 (Satire): 140
Mispredicted as class 2 (Hoax): 36
Mispredicted as class 4 (Reliable News): 80

Class 4 (Reliable News) misclassified: 340
Mispredicted as class 1 (Satire): 99
Mispredicted as class 2 (Hoax): 107
Mispredicted as class 3 (Propaganda): 134



In [18]:
# Investigate hyperlinks
train_df["hyperlink_counts"] = train_df["Text"].apply(lambda x: x.count("http"))
train_df["capitalization_counts"] = train_df["Text"].apply(lambda x: sum(1 for c in x if c.isupper()))
train_df["capitalization_ratio"] = train_df["capitalization_counts"] / train_df["number_of_words"]
train_df["hyperlink_ratio"] = train_df["hyperlink_counts"] / train_df["number_of_words"]


test_df["hyperlink_counts"] = test_df["Text"].apply(lambda x: x.count("http"))
test_df["capitalization_counts"] = test_df["Text"].apply(lambda x: sum(1 for c in x if c.isupper()))
test_df["capitalization_ratio"] = test_df["capitalization_counts"] / test_df["number_of_words"]
test_df["hyperlink_ratio"] = test_df["hyperlink_counts"] / test_df["number_of_words"]

## 1.6 Hyperlinks

### Distribution 


In [8]:
columns =  ['hyperlink_counts', 'hyperlink_ratio']
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())
    

hyperlink_counts
          count      mean       std  min  25%  50%  75%   max
Verdict                                                      
1        3981.0  0.001507  0.050103  0.0  0.0  0.0  0.0   2.0
2        4014.0  0.005730  0.100916  0.0  0.0  0.0  0.0   4.0
3        4008.0  0.257984  1.128973  0.0  0.0  0.0  0.0  22.0
4        3997.0  0.006255  0.145713  0.0  0.0  0.0  0.0   8.0
hyperlink_ratio
          count      mean       std  min  25%  50%  75%       max
Verdict                                                          
1        3981.0  0.000004  0.000149  0.0  0.0  0.0  0.0  0.007874
2        4014.0  0.000056  0.001623  0.0  0.0  0.0  0.0  0.085106
3        4008.0  0.000408  0.003901  0.0  0.0  0.0  0.0  0.157895
4        3997.0  0.000019  0.000583  0.0  0.0  0.0  0.0  0.034483


We can very clearly see that, number of words, and number of characters and number of sentence has is quite distinct for class 3(propaganda) and 4(reliable news) in comparison to the rest. So ideally, if we use these features, we should be able to perform better for than random guess

In [15]:
train_and_test(train_df, test_df, ['hyperlink_counts'], [1,2,3,4])
train_and_test(train_df, test_df, ['hyperlink_ratio'], [1,2,3,4])

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.26      0.88      0.40       750
           3       0.77      0.48      0.59       750
           4       0.00      0.00      0.00       750

    accuracy                           0.34      3000
   macro avg       0.26      0.34      0.25      3000
weighted avg       0.26      0.34      0.25      3000

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       750
           2       0.26      0.88      0.40       750
           3       0.77      0.48      0.59       750
           4       0.00      0.00      0.00       750

    accuracy                           0.34      3000
   macro avg       0.26      0.34      0.25      3000
weighted avg       0.26      0.34      0.25      3000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


It is good in differentiating class 2 and class 3

## 1.7 Capitalization Count

In [19]:
columns = ['capitalization_counts', 'capitalization_ratio']
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())

capitalization_counts
          count        mean         std  min   25%    50%    75%     max
Verdict                                                                 
1        3981.0   46.287867   45.167956  1.0  17.0   26.0   66.0   694.0
2        4014.0   48.484056   17.572210  0.0  37.0   47.0   57.0   347.0
3        4008.0  183.439122  246.440156  2.0  50.0  129.0  235.0  5128.0
4        3997.0   85.460345   73.631762  0.0  38.0   67.0  113.0   821.0
capitalization_ratio
          count      mean       std       min       25%       50%       75%       max
Verdict                                                                              
1        3981.0  0.169902  0.109548  0.026316  0.111111  0.151515  0.200286  2.360000
2        4014.0  0.268093  0.111387  0.000000  0.210898  0.251121  0.300669  2.250000
3        4008.0  0.367140  0.385089  0.060943  0.164618  0.216317  0.310018  6.000000
4        3997.0  0.217261  0.115778  0.000000  0.152807  0.191892  0.244992  1.553571


In [20]:
train_and_test(train_df, test_df, ['capitalization_counts'], [1,2,3,4])
train_and_test(train_df, test_df, ['capitalization_ratio'], [1,2,3,4])

              precision    recall  f1-score   support

           1       0.40      0.07      0.12       750
           2       0.20      0.39      0.27       750
           3       0.33      0.35      0.34       750
           4       0.26      0.20      0.22       750

    accuracy                           0.25      3000
   macro avg       0.30      0.25      0.24      3000
weighted avg       0.30      0.25      0.24      3000

              precision    recall  f1-score   support

           1       0.25      0.66      0.36       750
           2       0.36      0.17      0.23       750
           3       0.05      0.01      0.02       750
           4       0.26      0.15      0.19       750

    accuracy                           0.25      3000
   macro avg       0.23      0.25      0.20      3000
weighted avg       0.23      0.25      0.20      3000



Not too useful - worst than random

# 1.8 Emotions

In [125]:
emotion = {'anger': 0,
 'anticipation': 0,
 'disgust': 0,
 'fear': 0,
 'joy': 0,
 'negative': 0,
 'positive': 0,
 'sadness': 0,
 'surprise': 0,
 'trust': 0}

emotion_columns = list(emotion.keys())

In [123]:
columns = emotion_columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

for column in columns:
    print(column)
    print(train_df.groupby('Verdict')[column].describe())

anger
          count       mean        std  min  25%  50%    75%    max
Verdict                                                           
1        3981.0   3.040191   3.790612  0.0  0.0  2.0   4.00   32.0
2        4014.0   2.773543   2.521433  0.0  1.0  2.0   4.00   28.0
3        4008.0  12.544910  18.276486  0.0  1.0  7.0  16.25  284.0
4        3997.0   4.478109   5.451427  0.0  1.0  3.0   6.00   53.0
anticipation
          count       mean        std  min  25%   50%   75%    max
Verdict                                                           
1        3981.0   5.433811   5.367261  0.0  2.0   3.0   8.0   38.0
2        4014.0   3.040857   2.438544  0.0  1.0   3.0   4.0   30.0
3        4008.0  14.306138  18.028423  0.0  3.0  10.0  20.0  290.0
4        3997.0   7.628972   7.015655  0.0  2.0   6.0  11.0   64.0
disgust
          count      mean        std  min  25%  50%  75%    max
Verdict                                                        
1        3981.0  2.087415   2.801901  0.0

In [126]:
train_and_test(train_df, test_df, emotion_columns, [1,2,3,4])

              precision    recall  f1-score   support

           1       0.27      0.16      0.20       750
           2       0.24      0.47      0.32       750
           3       0.26      0.23      0.25       750
           4       0.25      0.14      0.18       750

    accuracy                           0.25      3000
   macro avg       0.26      0.25      0.24      3000
weighted avg       0.26      0.25      0.24      3000



# Combining TF-IDF with our custom feature

In [153]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Assuming df["Notes"] contains your textual data
# and df["Class"] contains your target variable with four classes
df = pd.read_csv('./raw_data/xtrain.csv', names=['Verdict', "Text"])

pickle1 = pd.read_pickle("./pickles/sentiment_ner_length_readability_related_readability.pkl")
pickle2 = pd.read_pickle("./pickles/sentiment_ner_length_related_test.pkl")

# # remove samples that has more less than 10 words or more than 10000 words
# pickle1["Text"] = pickle1["Text"].apply(lambda x: x if len(x.split()) > 10 and len(x.split()) < 10000 else None)
# pickle1 = pickle1.dropna()
# df["Text"] = df["Text"].apply(lambda x: x if len(x.split()) > 10 and len(x.split()) < 10000 else None)
# df = df.dropna()

# Investigate hyperlinks
# pickle1["hyperlink_counts"] = pickle1["Text"].apply(lambda x: x.count("http"))
# pickle1["capitalization_counts"] = pickle1["Text"].apply(lambda x: sum(1 for c in x if c.isupper()))
# pickle1["capitalization_ratio"] = pickle1["capitalization_counts"] / pickle1["number_of_words"]
# pickle1["hyperlink_ratio"] = pickle1["hyperlink_counts"] / pickle1["number_of_words"]


# pickle2["hyperlink_counts"] = pickle2["Text"].apply(lambda x: x.count("http"))
# pickle2["capitalization_counts"] = pickle2["Text"].apply(lambda x: sum(1 for c in x if c.isupper()))
# pickle2["capitalization_ratio"] = pickle2["capitalization_counts"] / pickle2["number_of_words"]
# pickle2["hyperlink_ratio"] = pickle2["hyperlink_counts"] / pickle2["number_of_words"]



def get_feature_from_df(df):
    columns = ['sentiment_compound_score',
       'number_of_words', 'number_of_characters', 'number_of_sentence',
        'DATE', 
       'LANGUAGE', 'GPE', 'WORK_OF_ART', 'NORP', 'ORDINAL', 'LOC', 'CARDINAL',
       'FAC', 'PERCENT', 'LAW', 'QUANTITY', 'EVENT', 'PERSON', 'PRODUCT',
       'MONEY', 'ORG', 'TIME', 'total_entities', 'readability']

    features = []
    for col in columns:
        features.append(list(df[col]))
    # features.append(list(number_of_capitalized))
    return features
df = df.loc[:,~df.columns.duplicated()].copy()
features_train = get_feature_from_df(pickle1)

# we need to reshape it to become (n_samples, n_features)
features_train = list(map(list, zip(*features_train)))
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)

# Step 1: Convert texts to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
tfidf_train = vectorizer.fit_transform(df["Text"])

# Combine the text features with the extracted features
from scipy.sparse import hstack
X = hstack([tfidf_train, features_train])

In [154]:
print("features_train start from " + str(tfidf_train.shape[1]) + " to " + str(X.shape[1]))

features_train start from 113852 to 113876


In [155]:
# Step 2: Apply Chi-squared test
chi2score = chi2(X, df['Verdict'])

# Step 3: Select the top 60k features
chi2_selector = SelectKBest(chi2, k=60000)
X_kbest_features = chi2_selector.fit_transform(X, df['Verdict'])
# X_kbest_features = X

# get the selected feature names

feature_names = chi2_selector.get_feature_names_out()

In [156]:
feature_cols = ['sentiment_compound_score',
       'number_of_words', 'number_of_characters', 'number_of_sentence',
        'DATE', 
       'LANGUAGE', 'GPE', 'WORK_OF_ART', 'NORP', 'ORDINAL', 'LOC', 'CARDINAL',
       'FAC', 'PERCENT', 'LAW', 'QUANTITY', 'EVENT', 'PERSON', 'PRODUCT',
       'MONEY', 'ORG', 'TIME', 'total_entities', 'readability']
# see the selected features
for i in range(len(feature_names)):
    name = feature_names[i]
    number = name.split('x')[1]
    if int(number) in range(tfidf_train.shape[1], X.shape[1]):
        print("tfidf: " + str(feature_names[i]))

        # index = int(str(feature_names[i]).split('x')[1]) - tfidf_train.shape[1]
        # print(feature_cols[index])



tfidf: x113852
tfidf: x113853
tfidf: x113854
tfidf: x113855
tfidf: x113856
tfidf: x113857
tfidf: x113858
tfidf: x113859
tfidf: x113860
tfidf: x113861
tfidf: x113862
tfidf: x113863
tfidf: x113864
tfidf: x113865
tfidf: x113866
tfidf: x113867
tfidf: x113868
tfidf: x113869
tfidf: x113870
tfidf: x113871
tfidf: x113872
tfidf: x113873
tfidf: x113874
tfidf: x113875


In [160]:
class_weights = {1: 2,  # Class 0
                 2: 4,  # Class 1
                 3: 2, # Class 2 (Assuming this is the underrepresented class)
                 4: 2}  # Class 3
model = LogisticRegression(class_weight=class_weights)
model.fit(X_kbest_features, df['Verdict'])
# Now you can use this model to predict the class of new texts

# Assuming df_test["Notes"] contains the new textual data
df_test = pd.read_csv('./raw_data/balancedtest.csv', names=['Verdict', "Text"])

# Step 1: Convert texts to TF-IDF features
X_test = vectorizer.transform(df_test["Text"])
features_test = get_feature_from_df(pickle2)

# we need to reshape it to become (n_samples, n_features)
features_test = list(map(list, zip(*features_test)))
features_test = scaler.transform(features_test)

X_test = hstack([X_test, features_test])

# Get the top 1000 features as above
X_test_kbest_features = chi2_selector.transform(X_test)
# X_test_kbest_features = X_test

# Step 4: Predict the class of the new texts
predicted_classes = model.predict(X_test_kbest_features)

# Assuming df_test["Class"] contains the true class of the new texts
print(classification_report(df_test["Verdict"], predicted_classes))
print(f1_score(df_test["Verdict"], predicted_classes, average='macro'))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.83      0.72      0.77       750
           2       0.74      0.70      0.72       750
           3       0.75      0.83      0.79       750
           4       0.83      0.89      0.86       750

    accuracy                           0.79      3000
   macro avg       0.79      0.79      0.78      3000
weighted avg       0.79      0.79      0.78      3000

0.7843630006585706


In [169]:
df_test

Unnamed: 0,Verdict,Text,predicted
0,1,When so many actors seem content to churn out ...,1
1,1,In what football insiders are calling an unex...,1
2,1,In a freak accident following Game 3 of the N....,1
3,1,North Koreas official news agency announced to...,4
4,1,The former Alaska Governor Sarah Palin would b...,1
...,...,...,...
2995,4,The Air Force mistakenly gave rival companies ...,4
2996,4,The United Nations climate chief on Friday cha...,4
2997,4,River Plate midfielder Diego Buonanotte has un...,4
2998,4,Lawmakers were on the brink Tuesday of exempti...,4


In [46]:
labels = {1: 'Satire', 2: 'Hoax', 3: 'Propaganda', 4: 'Reliable News'}
df_test["predicted"] = predicted_classes
misclassified = df_test[df_test["Verdict"] != df_test["predicted"]]
for i in range(1,5):
    misclassified_i =  misclassified[misclassified['Verdict'] == i]
    print(f'Class {i} ({labels[i]}) misclassified: {len(misclassified_i)}')
    for j in range(1,5):
        if i == j:
            continue
        print(f'Mispredicted as class {j} ({labels[j]}): {len(misclassified_i[misclassified_i["predicted"] == j])}')
    print('')

Class 1 (Satire) misclassified: 211
Mispredicted as class 2 (Hoax): 154
Mispredicted as class 3 (Propaganda): 30
Mispredicted as class 4 (Reliable News): 27

Class 2 (Hoax) misclassified: 222
Mispredicted as class 1 (Satire): 38
Mispredicted as class 3 (Propaganda): 153
Mispredicted as class 4 (Reliable News): 31

Class 3 (Propaganda) misclassified: 129
Mispredicted as class 1 (Satire): 34
Mispredicted as class 2 (Hoax): 16
Mispredicted as class 4 (Reliable News): 79

Class 4 (Reliable News) misclassified: 81
Mispredicted as class 1 (Satire): 36
Mispredicted as class 2 (Hoax): 17
Mispredicted as class 3 (Propaganda): 28

