In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


# Data


In [2]:
# Read the train split, then show its dimensions followed by the first rows
train = pd.read_csv('../data/train.csv')
print(f'train shape: {train.shape}', train.head(), sep='\n')
# Same preview for the test split
test = pd.read_csv('../data/test.csv')
print(f'test shape: {test.shape}', test.head(), sep='\n')

train shape: (80, 2)
                                                Text  Disinformation
0  explainer blood treasure and chaos the cost of...               0
1  declared that Russia wants to resolve the situ...               1
2  Deputy persecution of the priests of the UOC a...               1
3  VSU deliberately do not remove the bodies of t...               1
4  Zakharova announced EU plans to “train terrori...               1
test shape: (20, 2)
                                                Text  Disinformation
0  treasure the usa must inevitably leave europe ...               1
1  moscow warns west against ‘playing with fire’ ...               1
2  dead soldiers apu end up in europenbsp mdash b...               1
3  the preparation of a provocation of Kyiv at th...               1
4  ukraine refugees face uncertainty and precarit...               0


# Models


## Random Forest (RF)


In [3]:
# Train rf classifier: TF-IDF features feeding a random forest.
vectorizer = TfidfVectorizer()
# Fix the seed so the forest is reproducible across Restart & Run All;
# without it the scores reported below can change from run to run.
classifier = RandomForestClassifier(random_state=42)
model = make_pipeline(vectorizer, classifier)
model.fit(train['Text'], train['Disinformation'])
# Predict labels for the test texts (scored in the Performance cell).
labels = model.predict(test['Text'])

### Performance


In [4]:
# Per-class precision / recall / F1 of the current `labels` against the test targets
print(
    classification_report(
        y_true=test['Disinformation'],
        y_pred=labels,
    )
)

              precision    recall  f1-score   support

           0       0.89      0.80      0.84        10
           1       0.82      0.90      0.86        10

    accuracy                           0.85        20
   macro avg       0.85      0.85      0.85        20
weighted avg       0.85      0.85      0.85        20



In [5]:
# Persist the current fitted pipeline (vectorizer + classifier) to disk.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
model_path = Path('../models/rf_model.pkl')
# Create the output directory if it is missing so open() does not fail
# with FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

## Support Vector Machine (SVM)

In [17]:
# Train svm classifier: TF-IDF features feeding a support vector classifier
vectorizer, classifier = TfidfVectorizer(), svm.SVC()
# Pipeline.fit returns the fitted pipeline itself, so build and fit in one expression
model = make_pipeline(vectorizer, classifier).fit(train['Text'], train['Disinformation'])
labels = model.predict(test['Text'])

### Performance


In [18]:
# Per-class precision / recall / F1 of the current `labels` against the test targets
print(
    classification_report(
        y_true=test['Disinformation'],
        y_pred=labels,
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



## Naive Bayes (NB)

In [21]:
# Train nb classifier: TF-IDF features feeding a multinomial Naive Bayes model
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)
# fit() returns the fitted pipeline, so prediction can be chained directly
labels = model.fit(train['Text'], train['Disinformation']).predict(test['Text'])

### Performance


In [22]:
# Per-class precision / recall / F1 of the current `labels` against the test targets
print(
    classification_report(
        y_true=test['Disinformation'],
        y_pred=labels,
    )
)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.90      0.95        10

    accuracy                           0.95        20
   macro avg       0.95      0.95      0.95        20
weighted avg       0.95      0.95      0.95        20

