# Reimplementation of ML Model by Hou et al.

Implementation of an SVM model presented in [Towards Automatic Detection of Misinformation in Online Medical Videos](https://arxiv.org/pdf/1909.01543.pdf).

The model is a `LinearSVC` from sklearn with C=1. An L2 normalizer is applied to the features.

In [None]:
import pandas as pd
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import isodate
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Normalizer
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import liwc
from nltk.tokenize import word_tokenize
import readability

## Load dataset

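Load a dataset of videos into the `videos` pandas DataFrame. Besides a `duration` column holding ISO 8601 duration strings (read later when the features are combined), it has the following columns and sample values: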

```
published_at                                         2013-12-23 03:21:21+01:00
view_count                                                           1328337.0
like_count                                                             30946.0
dislike_count                                                            706.0
favourite_count                                                              0
comment_count                                                           4254.0
category_id                                                               None
updated_at                                    2021-06-03 10:46:58.939401+02:00
clean_transcript             Translator: Delia Bogdan Reviewer: Ilze Garda ...
annotation                                                           promoting
```

## Calculate counts of word classes in transcript using the LIWC lexicon

In [None]:
# The LIWC 2007 dictionary is loaded twice: once as a token parser that yields
# the categories matching a token, and once as the raw lexicon, which is only
# used to measure how many lexicon entries belong to each category.
parse, category_names = liwc.load_token_parser('LIWC2007_English100131.dic')
lexicon, _ = liwc.dic.read_dic('LIWC2007_English100131.dic')

# How often each category appears across all lexicon entries.
liwc_category_counts = Counter(
    category
    for categories in lexicon.values()
    for category in categories
)


def _liwc_category_sizes(transcript):
    """Return, per LIWC category, the number of distinct matching tokens.

    The transcript is tokenised with ``word_tokenize`` and every token is
    looked up in lower case. Matches are collected in a set of
    ``(category, token)`` pairs, so a token repeated in the transcript is
    counted once per category. The pair keeps the token's original casing,
    so differently-cased spellings of a word count separately.
    """
    matches = {
        (category, token)
        for token in word_tokenize(transcript)
        for category in parse(token.lower())
    }
    pairs = pd.DataFrame(matches, columns=['category', 'token'])
    return pairs.groupby('category').size()


# One row per video, one column per category seen in any transcript;
# categories absent from a transcript become 0.
liwc_transcript_counts = videos['clean_transcript'].apply(_liwc_category_sizes).fillna(0)

# Scale every category column by the size of that category in the lexicon.
for column in liwc_transcript_counts.columns:
    liwc_transcript_counts[column] = liwc_transcript_counts[column].div(liwc_category_counts[column])

## Calculate readability of transcript using the readability package

In [None]:
def _readability_features(transcript):
    """Flatten the readability measures of one transcript into a Series.

    ``readability.getmeasures`` returns a nested mapping (group -> measure ->
    value); each value is exposed under the flat key ``'<group>-<measure>'``.

    Transcripts that are empty or contain only whitespace yield an empty
    Series, which becomes an all-zero row after the ``fillna(0)`` applied to
    the combined frame. Checking ``strip()`` rather than ``len()`` matters:
    a whitespace-only transcript has a positive length but no words to score.
    NOTE(review): the guard presumably exists because ``getmeasures`` cannot
    handle word-less text -- confirm against the readability package.
    """
    if not transcript.strip():
        return pd.Series({})
    return pd.Series({
        f'{k1}-{k2}': v
        for k1, vs in readability.getmeasures(transcript, lang='en').items()
        for k2, v in vs.items()
    })


readability_scores = videos['clean_transcript'].apply(_readability_features).fillna(0)
# Keep the feature frame explicitly aligned with the videos it was computed from.
readability_scores.index = videos.index

## Combine features into dataframes X and y

In [None]:
# Whole days between publication and the last metadata update; both
# timestamps are converted to UTC before subtracting.
videos['num_tracked_days'] = (
    pd.to_datetime(videos['updated_at'], utc=True) - pd.to_datetime(videos['published_at'], utc=True)
).dt.days

stats = pd.DataFrame({
    # Average views per tracked day. A video updated within a day of being
    # published has num_tracked_days == 0, and dividing by it would produce
    # inf, which fillna(0) below does not remove and which scikit-learn
    # estimators reject. Such videos are therefore treated as tracked for
    # one day.
    'view_count': videos['view_count'] / videos['num_tracked_days'].clip(lower=1),
    'comment_count': videos['comment_count'],
    'like_count': videos['like_count'],
    'dislike_count': videos['dislike_count'],
    # ISO 8601 duration string -> length in seconds.
    'duration': videos['duration'].apply(isodate.parse_duration).dt.total_seconds(),
    # videos['category_id'] is currently not used as a feature.
    'clean_transcript': videos['clean_transcript']
}).fillna(0)

# Feature matrix: engagement stats (plus raw transcript text), readability
# scores and LIWC features, joined column-wise on the video index.
X = pd.concat([stats, readability_scores, liwc_transcript_counts], axis=1)
# Target labels; rows without an annotation are filtered out before training.
y = videos['annotation']
X.head()

## The machine learning pipeline for different combinations of features

In [None]:
def clf_pipeline(column_transformer, random_state=0):
    """Build the oversample -> feature-transform -> LinearSVC pipeline.

    Args:
        column_transformer: transformer that turns the raw feature frame into
            the numeric matrix fed to the classifier.
        random_state: seed shared by the oversampler and the classifier.
            Seeding the oversampler as well makes the resampled training
            folds, and therefore the reported scores, reproducible between
            runs.

    Returns:
        An imbalanced-learn pipeline that randomly oversamples every class
        except the majority one, applies ``column_transformer`` and fits a
        ``LinearSVC`` with ``C=1``.
    """
    return make_pipeline(
        RandomOverSampler(
            sampling_strategy='not majority',
            random_state=random_state
        ),
        column_transformer,
        LinearSVC(
            random_state=random_state,
            C=1
        )
    )

def _ngram_features():
    """Return a fresh (transformer, column) pair for the transcript text.

    The transcript is vectorised with TF-IDF over unigrams and bigrams
    (English stop words removed, 1000 most frequent features kept) and then
    L2-normalised.
    """
    return (
        make_pipeline(
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                max_features=1000
            ),
            Normalizer(norm='l2')
        ),
        'clean_transcript'
    )


def _numeric_features(columns):
    """Return a fresh (transformer, columns) pair that L2-normalises `columns`."""
    return (Normalizer(norm='l2'), list(columns))


# One pipeline per feature set. Column lists are derived with
# Index.difference instead of a Python set so that the feature order is
# deterministic (sorted) rather than dependent on string hashing, which
# varies between interpreter runs.
clfs = {
    'full': clf_pipeline(
        make_column_transformer(
            _ngram_features(),
            _numeric_features(X.columns.difference(['clean_transcript']))
        )
    ),
    'ngrams': clf_pipeline(
        make_column_transformer(
            _ngram_features()
        )
    ),
    'stats': clf_pipeline(
        make_column_transformer(
            _numeric_features(stats.columns.difference(['clean_transcript']))
        )
    ),
    'readability': clf_pipeline(
        make_column_transformer(
            _numeric_features(readability_scores.columns)
        )
    ),
    'liwc': clf_pipeline(
        make_column_transformer(
            _numeric_features(liwc_transcript_counts.columns)
        )
    ),
}

## Cross-validate the pipelines

In [None]:
from sklearn.model_selection import cross_val_predict

# Boolean mask of the rows that carry an annotation; only those take part in
# cross-validation.
sampling = y.notna()

# Out-of-fold predictions of every pipeline, keyed by feature-set label.
predicted = {}
for label, clf in clfs.items():
    print(label)  # shows which pipeline is currently being evaluated
    predicted[label] = cross_val_predict(
        estimator=clf,
        X=X.loc[sampling],
        y=y.loc[sampling],
        cv=5,
    )

## Output the classification report

In [None]:
from sklearn.metrics import classification_report

# Print one report per feature set, comparing the cross-validated predictions
# with the annotated labels; two blank lines separate consecutive reports.
for label, y_pred in predicted.items():
    print(label)
    print(classification_report(y_true=y.loc[sampling], y_pred=y_pred))
    print(end='\n\n')