In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

| Label | Description     |
|-------|-----------------|
| 1     | Satire          |
| 2     | Hoax            |
| 3     | Propaganda      |
| 4     | Reliable News   |

In [19]:
# Column names applied to both CSV files when they are loaded.
column_labels = ['label', 'text']

In [20]:
# Load the training file. header=None makes pandas read the first line as data,
# and the columns are named from column_labels.
train_df = pd.read_csv('data/fulltrain.csv', header=None, names=column_labels)

In [22]:
# Load the test file with the same settings as the training file
# (first line read as data, columns named from column_labels).
test__df = pd.read_csv('data/balancedtest.csv', header=None, names=column_labels)

In [21]:
# Preview the first five rows of the training frame.
train_df.head()

Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [11]:
# (rows, columns) of the training frame.
train_df.shape

(48854, 2)

In [12]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48854 entries, 0 to 48853
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   48854 non-null  int64 
 1   text    48854 non-null  object
dtypes: int64(1), object(1)
memory usage: 763.5+ KB


In [25]:
# Number of training rows per label, most frequent first.
train_df["label"].value_counts()

label
3    17870
1    14047
4     9995
2     6942
Name: count, dtype: int64

In [26]:
# Preview the first five rows of the test frame.
test__df.head()

Unnamed: 0,label,text
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...


In [27]:
# Column dtypes, non-null counts and memory usage of the test frame.
test__df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3000 non-null   int64 
 1   text    3000 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


In [28]:
# Number of test rows per label.
test__df["label"].value_counts()

label
1    750
2    750
3    750
4    750
Name: count, dtype: int64

In [29]:
# (rows, columns) of the test frame.
test__df.shape

(3000, 2)

In [30]:
# Pool both files into one frame; ignore_index=True renumbers the rows from 0.
# NOTE(review): the train/test split further down is drawn from this pooled frame,
# so the separately provided test file is no longer kept apart as a held-out set —
# confirm this is intended.
data_df = pd.concat([train_df, test__df], ignore_index=True)

In [31]:
# Preview the first five rows of the pooled frame.
data_df.head()

Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [32]:
# Column dtypes, non-null counts and memory usage of the pooled frame.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51854 entries, 0 to 51853
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   51854 non-null  int64 
 1   text    51854 non-null  object
dtypes: int64(1), object(1)
memory usage: 810.3+ KB


In [33]:
# (rows, columns) of the pooled frame.
data_df.shape

(51854, 2)

In [34]:
# Number of pooled rows per label, most frequent first.
data_df["label"].value_counts()

label
3    18620
1    14797
4    10745
2     7692
Name: count, dtype: int64

In [36]:
# Target column of the pooled frame. Despite the _df suffix, selecting a single
# column yields a pandas Series, not a DataFrame.
data_labels_df = data_df["label"]

In [37]:
# Article texts of the pooled frame (a pandas Series, despite the _df suffix).
data_texts__df = data_df["text"]

In [40]:
# Hold out 20% of the pooled rows for evaluation. The split is stratified on the
# label so both parts keep the pooled class proportions, and the seed is fixed so
# the same rows are selected on every run.
split_options = {
    "test_size": 0.2,
    "random_state": 45,
    "stratify": data_labels_df,
}
X_train, X_test, y_train, y_test = train_test_split(
    data_texts__df, data_labels_df, **split_options
)

In [41]:
# Preview the first training texts; the index values are row labels of the pooled frame.
X_train.head()

37267    Wal-Marts List of Crimes Expands by Adding Bri...
17800    Supreme Court Justice John Roberts Signs Off O...
22957    Defiant Filipino President Just Dared the US t...
11572    Following an onboard fire that has left more t...
Name: text, dtype: object

In [42]:
# Preview the labels that go with the first training texts.
y_train.head()

37267    3
17800    2
22957    3
34345    3
11572    1
Name: label, dtype: int64

In [43]:
# Size, dtype and non-null count of the training texts.
X_train.info()

<class 'pandas.core.series.Series'>
Index: 41483 entries, 37267 to 28622
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
41483 non-null  object
dtypes: object(1)
memory usage: 648.2+ KB


In [44]:
# Size, dtype and non-null count of the held-out texts.
X_test.info()

<class 'pandas.core.series.Series'>
Index: 10371 entries, 10623 to 32994
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
10371 non-null  object
dtypes: object(1)
memory usage: 162.0+ KB


In [46]:
# Rows per label in the training split.
y_train.value_counts()

label
3    14896
1    11837
4     8596
2     6154
Name: count, dtype: int64

In [47]:
# Rows per label in the held-out split.
y_test.value_counts()

label
3    3724
1    2960
4    2149
2    1538
Name: count, dtype: int64

At this point, the training and test splits are ready for feature extraction.

In [49]:
# Build TF-IDF features for the training and held-out texts.
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# A token is a run of two or more ASCII letters delimited by word boundaries,
# so digits and single-letter words never become features.
token_pattern = r'(?u)\b[A-Za-z][A-Za-z]+\b'

# Initialize the vectoriser: English stop words are removed and terms that occur
# in more than 90% of the training documents are dropped (max_df=0.9).
tfidf_vectoriser = TfidfVectorizer(token_pattern=token_pattern, stop_words='english', max_df=0.9)

# Learn the vocabulary and IDF weights from the training texts only, then transform them.
tfidf_train = tfidf_vectoriser.fit_transform(X_train)

# Transform the held-out texts with the already-fitted vocabulary and weights (no refit).
tfidf_test = tfidf_vectoriser.transform(X_test)

# Print the first 400 feature names
print(tfidf_vectoriser.get_feature_names_out()[:400])

# Print the first 5 TF-IDF rows. Slice the sparse matrix first and densify only
# those rows: converting the whole matrix to a dense array before slicing would
# allocate a (documents x vocabulary) array, which does not fit in memory for a
# vocabulary of this size.
print(tfidf_train[:5].toarray())

['aa' 'aaa' 'aaaaaaaaaaahhhhhhhh' 'aaaaaahhhh' 'aaaaahhhh' 'aaaaand'
 'aaaaarrrggggghhhhhhhh' 'aaaawwwwgagagaaaaahhhhhh' 'aaae' 'aaah'
 'aaahahahhaaaa' 'aaahhh' 'aaai' 'aaamen' 'aaanet' 'aaanndd' 'aaapg'
 'aaas' 'aab' 'aabar' 'aaby' 'aabys' 'aac' 'aachen' 'aacps' 'aacrjournals'
 'aad' 'aadil' 'aadministration' 'aaem' 'aaemonline' 'aaems' 'aaeon'
 'aafa' 'aafedt' 'aafia' 'aah' 'aahad' 'aahed' 'aahf' 'aahhh' 'aahhhhh'
 'aahing' 'aahs' 'aai' 'aaj' 'aal' 'aalam' 'aaliyah' 'aalst' 'aalto'
 'aalw' 'aam' 'aama' 'aamc' 'aamer' 'aamers' 'aamir' 'aamodt' 'aamof'
 'aamva' 'aan' 'aand' 'aang' 'aani' 'aans' 'aao' 'aap' 'aapl' 'aapp'
 'aappublications' 'aaps' 'aar' 'aarabi' 'aarabis' 'aarde' 'aargh'
 'aarhus' 'aarif' 'aarin' 'aaron' 'aaronessa' 'aaronovitch' 'aarons'
 'aaronsenvironmental' 'aaronson' 'aaronsons' 'aaroob' 'aarp' 'aarron'
 'aars' 'aarthi' 'aarti' 'aas' 'aasia' 'aasp' 'aassi' 'aat' 'aatif'
 'aatoxin' 'aau' 'aauw' 'aav' 'aavp' 'aayda' 'aazab' 'aazaz' 'ab' 'aba'
 'abaaoud' 'ababa' 'ababb

In [50]:
# Number of features = number of distinct terms in the fitted vocabulary.
num_features = len(tfidf_vectoriser.vocabulary_)
print(f'Number of features: {num_features}')

Number of features: 196721


In [51]:
# Shape of the training TF-IDF matrix: one row per training text,
# one column per vocabulary term.
print(f'Shape of TF-IDF matrix: {tfidf_train.shape}')
print(f'Number of features: {tfidf_train.shape[1]}')

Shape of TF-IDF matrix: (41483, 196721)
Number of features: 196721


In [53]:
# Spot-check 100 feature names from deep in the vocabulary.
# NOTE(review): the recorded output contains run-together tokens such as
# 'wereaccording', which presumably come from missing whitespace in the raw
# texts — worth confirming before relying on these features.
print(tfidf_vectoriser.get_feature_names_out()[189900:190000])


['wenzel' 'wenzhou' 'weo' 'weoskdif' 'wep' 'weperceive' 'weponds' 'wept'
 'wer' 'weragama' 'werb' 'werd' 'werdenberg' 'werder' 'werdlick' 'wereabu'
 'wereaccording' 'wereadily' 'wereand' 'wereas' 'werebasically'
 'werebefore' 'werebrought' 'wereby' 'wereengaging' 'wereentering'
 'wereessentially' 'werein' 'wereirradiated' 'wereit' 'werely'
 'weremainly' 'weremeeting' 'weremore' 'weren' 'werent' 'werentthat'
 'wereplanning' 'wereresponsible' 'wereseen' 'weresignicant'
 'weresupposed' 'weret' 'weretalking' 'wereunarmed' 'wereusing' 'werewolf'
 'werewolves' 'werewrong' 'werfe' 'werkmeister' 'werkx' 'werlhof'
 'wermers' 'wermke' 'wernecke' 'werner' 'wernher' 'werning' 'wernt'
 'werrity' 'werritys' 'wershub' 'werst' 'wert' 'wertelecki' 'werteleckis'
 'werth' 'wertham' 'wertheim' 'wertheimer' 'werther' 'werthmann' 'wertree'
 'wertz' 'wes' 'wesa' 'wesbecker' 'wesbury' 'wescott' 'wesearchr' 'wesee'
 'weserve' 'wesigned' 'wesite' 'wesket' 'weslaco' 'wesley' 'wesleyan'
 'wesmc' 'wespac' 'wessel'

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wraps a separate binary logistic regression per label.
# max_iter=1000 raises the solver's iteration cap above the library default.
base_estimator = LogisticRegression(max_iter=1000)
model = OneVsRestClassifier(base_estimator).fit(tfidf_train, y_train)

# Score the fitted model on the held-out TF-IDF features.
predictions = model.predict(tfidf_test)
accuracy = accuracy_score(y_test, predictions)

In [55]:
# Fraction of held-out texts whose predicted label equals the true label.
print(accuracy)

0.9540063638993347


In [70]:
# Classify one hand-written article. Swap in one of the commented samples below
# to try a different text.
#test_text = "World Leaders Announce Plan to Switch to Symbolic Economy, Trading National Landmarks for Climate Action Credits. In a groundbreaking move that stunned economists worldwide, leaders from over 150 countries convened yesterday to unveil a new economic model based on the trade of iconic national landmarks for climate action credits. 'Why deal with currency fluctuations when you can own the Eiffel Tower?' joked one delegate."
#test_text = "Scientists Confirm: Drinking Two Gallons of Ice Water Daily Reverses Aging. Recent studies from a prestigious university (name withheld for privacy) have demonstrated that individuals who drink at least two gallons of ice water daily can reverse the effects of aging, including the elimination of wrinkles and restoration of hair color. Critics demand peer review, while Hollywood stars have already jumped on the icy bandwagon."
#test_text = "Global Coalition Admits: Renewable Energy a Hoax to Undermine Economic Stability. In a shocking revelation, spokespersons from the Global Coalition for Fossil Fuels disclosed intercepted communications proving that the recent push for renewable energy sources is an elaborate scheme designed to destabilize the global economy. 'It's clear that solar panels and wind turbines are just the beginning of an anti-economic stability agenda,' the report claims."
test_text = "Central Bank Raises Interest Rates to Combat Inflation. In response to the rising inflation that has been impacting the economy over the past months, the central bank announced today an increase in interest rates by 0.5 percentage points. The decision, expected by many analysts, aims to curb inflation by discouraging borrowing and spending. Central Bank President stated, 'This measure is crucial for stabilizing our economy and ensuring sustainable growth.'"

# The vectoriser expects an iterable of documents, hence the one-element list.
test_text_tfidf = tfidf_vectoriser.transform([test_text])
prediction = model.predict(test_text_tfidf)

# Label names ordered by numeric label: label 1 is label_names[0], label 4 is label_names[3].
label_names = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']

print(type(prediction))
print(prediction)

# predict returns an array holding one numeric label per input text. The labels
# are 1-based while the list is 0-based, so shift by one before indexing.
predicted_label = label_names[int(prediction[0]) - 1]
print("Predicted label:", predicted_label)

<class 'numpy.ndarray'>
[4]
