In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textstat.textstat import *
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay



In [2]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
lemmatizer = nltk.WordNetLemmatizer()

In [3]:
df_consp = pd.read_csv(f'./data/conspiracy_submissions_filtered.csv', compression='gzip')
df_consp['label'] = 0
df_consp


Unnamed: 0,author,title,label
0,ChildrensHealthDef,"New York Post Attacks RFK, Jr. as Sales of ‘Th...",0
1,Lotso_Packetloss,What good are NATO sanctions against Russia wh...,0
2,mr_clemFandango,Ukraine Crisis - What You're Not Being Told 20...,0
3,lh7884,Bill Gates: 'If every country does what Austra...,0
4,One2alwaysplay,What's really going on??,0
...,...,...,...
7045,Peter-Rabbi,Anyone else’s cable internet down in the US?,0
7046,JackieChanG242,What if the was no afterlife and no upper power?,0
7047,nelbar,Ukraine Psyops trained by NATO | leaked documents,0
7048,nelbar,Ukraine Psyops trained by NATO | leaked documents,0


In [7]:
df_science = pd.read_csv(f'./data/worldnews_submissions_filtered.csv', compression='gzip')
df_science['label'] = 1
df_science
df = pd.concat([df_consp, df_science], axis=0)
df


Unnamed: 0,author,title,label
0,ChildrensHealthDef,"New York Post Attacks RFK, Jr. as Sales of ‘Th...",0
1,Lotso_Packetloss,What good are NATO sanctions against Russia wh...,0
2,mr_clemFandango,Ukraine Crisis - What You're Not Being Told 20...,0
3,lh7884,Bill Gates: 'If every country does what Austra...,0
4,One2alwaysplay,What's really going on??,0
...,...,...,...
14123,lolita4fun,Putin puts nuclear forces on alert,1
14124,Im-_Batman,The Ukraine president in military uniform that...,1
14125,Barang168,A series of armed explosions in eastern Ukrain...,1
14126,kukoo2112,Ukraine confirms peace talks with Russia today.,1


In [8]:
submissions = df.title.values

In [9]:
def preprocessing(text):
    text = str(text)
    text = text.lower()
    return re.sub(r"(@\[A-Za-z0-9]+)|(\w+:\/\/\S+)|^&gt;","",text)

def tokenize(text):
    text = re.sub(r"[^0-9A-Za-z \t]","",text)
    text = tokenizer.tokenize(text)
    text = [lemmatizer.lemmatize(word, pos="v") for word in text]
    return text



In [10]:
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing,
            tokenizer=tokenize,
            stop_words=nltk.corpus.stopwords.words('english'),
            ngram_range=(1,3),
            max_features=10000)
tfidf = tfidf_vectorizer.fit_transform(submissions).toarray()



In [11]:
X = pd.DataFrame(tfidf)
y = df['label'].values
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model = LogisticRegression(class_weight='balanced', multi_class='ovr', penalty='l2', C=0.01, random_state=42).fit(X_train, y_train)

y_predict = model.predict(X_test)

rep = classification_report(y_test, y_predict)

print(rep)


              precision    recall  f1-score   support

           0       0.67      0.82      0.74      1392
           1       0.90      0.80      0.85      2844

    accuracy                           0.81      4236
   macro avg       0.79      0.81      0.80      4236
weighted avg       0.83      0.81      0.81      4236



In [None]:
conf_matrix = confusion_matrix(y_test, y_predict, normalize='true')
conf_matrix_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
conf_matrix_display.plot()

: 