<a href="https://colab.research.google.com/github/linzhi0918/deliberative-politics/blob/main/notebooks/Revised_02_Deliberation_baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This notebook was run in Google Colab

In [None]:
# Mount Google Drive at /content/drive; every data path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Notebook configuration: input CSV, output root folder, and the names of the text and label columns.
DATA_PATH = "/content/drive/MyDrive/Colab_Notebooks/deliberative-politics-main/data/sample2.csv"
OUTPUT_DIR = '/content/drive/MyDrive/Colab_Notebooks/deliberative-politics-main/data'     # Output root: the setup cell creates `results` (metric reports) and `modified_data` (CSVs with extracted features) for this run

X_col = 'text'  # Name of the column holding the input text (string)
y_col = 'label'        # Name of the column holding the binary target (0/1)

# Only harbinger and politeness features are extracted in the last section (LIWC features are not).

# Setting up shop

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import sys
import os
import warnings
import pandas as pd
import json
import string
import re
import nltk
import random
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
warnings.filterwarnings("ignore")

# Derive both output folders from the OUTPUT_DIR set in the configuration cell.
# os.path.join discards every earlier component when a later one is absolute,
# so only relative folder names are joined here; otherwise OUTPUT_DIR is ignored.
_output_root = os.path.normpath(OUTPUT_DIR)
# OUTPUT_DIR is re-pointed at the results folder below. When this cell is run
# again, step back up one level so the folders are not nested as results/results.
if os.path.basename(_output_root) == 'results':
    _output_root = os.path.dirname(_output_root)
MODIFIED_DATA = os.path.join(_output_root, 'modified_data')  # CSVs with extracted features
OUTPUT_DIR = os.path.join(_output_root, 'results')           # metric reports
os.makedirs(MODIFIED_DATA, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# spaCy English language object; spacy_tokenizer below calls it to split text into tokens.
nlp = English()

def is_number(tok):
    """Return True when `tok` reads as a numeric literal.

    A token counts as a number when `float()` can parse it, with one
    exception: `float()` also accepts the words "nan", "inf" and "infinity"
    (any case, optionally signed). In running text those are ordinary words,
    so they are reported as non-numeric.

    Parameters
    ----------
    tok : str
        Token text to test. Values `float()` cannot handle at all (e.g. None)
        are reported as non-numeric instead of raising.

    Returns
    -------
    bool
    """
    try:
        float(tok)
    except (TypeError, ValueError):
        return False
    # Reject the special float spellings; everything else float() accepted is numeric.
    return str(tok).strip().lstrip('+-').lower() not in ('nan', 'inf', 'infinity')

def spacy_tokenizer(text):
    """Split `text` with the module-level `nlp` object and mask numeric tokens.

    Returns a list of token strings in which every token that `is_number`
    accepts is replaced by the placeholder '_NUM_'.
    """
    tokens = []
    for tok in nlp(text):
        surface = tok.text
        tokens.append('_NUM_' if is_number(surface) else surface)
    return tokens

def update_metrics(report, y_test, y_pred, minority_class):
    """Add one fold's scores to the running totals in `report`.

    Parameters
    ----------
    report : list of 6 numbers
        Running sums, updated in place, in this order: accuracy, F1,
        precision, recall, macro-F1, F1 of the minority class.
    y_test, y_pred : array-like
        True and predicted labels for the fold.
    minority_class : label
        The least frequent label; its per-class F1 is taken from
        `classification_report`.

    Returns
    -------
    list
        The same `report` list, for convenience.
    """
    report[0] += metrics.accuracy_score(y_test, y_pred)
    report[1] += metrics.f1_score(y_test, y_pred)
    report[2] += metrics.precision_score(y_test, y_pred)
    report[3] += metrics.recall_score(y_test, y_pred)
    report[4] += metrics.f1_score(y_test, y_pred, average='macro')

    per_class = classification_report(y_test, y_pred, output_dict=True)
    # Report keys are the string form of the labels; an integer minority label
    # shows up as e.g. '1.0' when the label column was read as floats.
    key = str(minority_class)
    if key not in per_class:
        key += '.0'
    # A fold in which the minority class is neither present nor predicted has
    # no entry for it at all; count that fold as 0.0 instead of raising KeyError.
    report[5] += per_class.get(key, {'f1-score': 0.0})['f1-score']
    return report

def sklearn_models(df, X_col, y_col, OUTPUT_PATH, generate_Xy, folds=10, random_state=1):
    """Cross-validate a fixed set of scikit-learn classifiers and save the averaged metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; rows whose `y_col` is missing are dropped first.
    X_col, y_col : str
        Column names, forwarded unchanged to `generate_Xy`.
    OUTPUT_PATH : str
        Path of the CSV report to write. Its folder is created when missing.
    generate_Xy : callable
        Called as `generate_Xy(train, test, X_col=..., y_col=..., method=...)`
        for every fold; must return `(X_train, y_train, X_test, y_test)`.
    folds : int, default 10
        Number of stratified folds.
    random_state : int, default 1
        Seed for the fold shuffling (1 was previously hard-coded) and for every
        classifier that accepts a seed, so repeated runs give the same report.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the per-fold metrics averaged over `folds`.
    """
    df = df[df[y_col].notna()]
    kfold = StratifiedKFold(folds, shuffle=True, random_state=random_state)
    report = []
    vc = dict(df[y_col].value_counts())
    minority_class = min(vc, key=vc.get)

    classifiers = {
        'logreg': LogisticRegression(class_weight='balanced', random_state=random_state),
        'knn': KNeighborsClassifier(),
        'gaussianNB': GaussianNB(),
        'bernoulliNB': BernoulliNB(),
        'adaboost': AdaBoostClassifier(random_state=random_state),
        'gradient-boosting': GradientBoostingClassifier(random_state=random_state),
        'dec-tree': DecisionTreeClassifier(random_state=random_state),
        'linear-svc': LinearSVC(class_weight='balanced', random_state=random_state),
        'c-svc': SVC(class_weight='balanced', random_state=random_state),
    }

    for method, clf in classifiers.items():
        running_report = [0] * 6
        for train_idx, test_idx in kfold.split(df, df[y_col]):
            train, test = df.iloc[train_idx], df.iloc[test_idx]
            X_train, y_train, X_test, y_test = generate_Xy(train, test, X_col=X_col, y_col=y_col, method=method)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            running_report = update_metrics(running_report, y_test, y_pred, minority_class)

        # The running totals are sums over the folds; divide to get the mean per metric.
        report.append([method] + [x / folds for x in running_report])
        print(method, 'done!')

    report = pd.DataFrame(report, columns=['method', 'accuracy', 'f1', 'precision', 'recall', 'macro-f1', 'minority-f1'])
    # Make sure the target folder exists so a long run does not fail at the final write.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    report.to_csv(OUTPUT_PATH)
    return report

## Count vectorizer

In [None]:
def generate_Xy(train, test, **kwargs):
    """Build bag-of-words (count) features for one cross-validation fold.

    Keyword arguments
    -----------------
    X_col : str   name of the text column
    y_col : str   name of the label column
    method : str  classifier key; 'gaussianNB' and 'lda' receive dense arrays

    Returns
    -------
    X_train, y_train, X_test, y_test
        The vectorizer is fitted on `train` only and then applied to `test`.

    Side effect
    -----------
    Writes `count.csv` into the MODIFIED_DATA folder: all rows of train+test,
    in index order, with their count features (fitted on the full corpus)
    appended as `count_<i>` columns. This export is independent of the
    returned matrices.
    """
    X_col, y_col, method = kwargs['X_col'], kwargs['y_col'], kwargs['method']

    def _new_vectorizer():
        return CountVectorizer(tokenizer=spacy_tokenizer, stop_words='english', strip_accents='unicode')

    # --- Export of the full feature table -------------------------------
    # Sort back to the original row order so the file is the same on every fold.
    main = pd.concat([train, test]).sort_index()
    X = _new_vectorizer().fit_transform(list(main[X_col].str.lower()))
    # Reuse main's index: join() aligns on index labels, and a default
    # RangeIndex would attach each feature row to the wrong text.
    count_df = pd.DataFrame(X.toarray(), index=main.index).add_prefix('count_')
    main = main.join(count_df)
    main.to_csv(os.path.join(MODIFIED_DATA, 'count.csv'))

    # --- Fold features: fit on train only, then transform test ----------
    vectorizer = _new_vectorizer()
    X_train = vectorizer.fit_transform(list(train[X_col].str.lower()))
    X_test = vectorizer.transform(list(test[X_col].str.lower()))
    X_train, y_train = csr_matrix(X_train), train[y_col]
    X_test, y_test = csr_matrix(X_test), test[y_col]
    non_sparse = ['gaussianNB', 'lda']
    if method in non_sparse:
        X_train, X_test = X_train.toarray(), X_test.toarray()
    return X_train, y_train, X_test, y_test

In [None]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
# Run the count-vectorizer baselines. The report goes into the results folder
# (OUTPUT_DIR); an absolute path here would make os.path.join ignore OUTPUT_DIR.
df = pd.read_csv(DATA_PATH)
results = sklearn_models(df, X_col, y_col, os.path.join(OUTPUT_DIR, 'count_vectorizer.csv'), generate_Xy)

logreg done!
knn done!
gaussianNB done!
bernoulliNB done!
adaboost done!
gradient-boosting done!
dec-tree done!
linear-svc done!
c-svc done!


## Tfidf vectorizer

In [None]:
def generate_Xy(train, test, **kwargs):
    """Build TF-IDF features for one cross-validation fold.

    Keyword arguments
    -----------------
    X_col : str   name of the text column
    y_col : str   name of the label column
    method : str  classifier key; 'gaussianNB' and 'lda' receive dense arrays

    Returns
    -------
    X_train, y_train, X_test, y_test
        The vectorizer is fitted on `train` only and then applied to `test`.

    Side effect
    -----------
    Writes `tfidf.csv` into the MODIFIED_DATA folder: all rows of train+test,
    in index order, with their TF-IDF features (fitted on the full corpus)
    appended as `tfidf_<i>` columns. This export is independent of the
    returned matrices.
    """
    X_col, y_col, method = kwargs['X_col'], kwargs['y_col'], kwargs['method']

    def _new_vectorizer():
        return TfidfVectorizer(tokenizer=spacy_tokenizer, stop_words='english', strip_accents='unicode')

    # --- Export of the full feature table -------------------------------
    # Sort back to the original row order so the file is the same on every fold.
    main = pd.concat([train, test]).sort_index()
    X = _new_vectorizer().fit_transform(list(main[X_col].str.lower()))
    # Reuse main's index: join() aligns on index labels, and a default
    # RangeIndex would attach each feature row to the wrong text.
    tfidf_df = pd.DataFrame(X.toarray(), index=main.index).add_prefix('tfidf_')
    main = main.join(tfidf_df)
    main.to_csv(os.path.join(MODIFIED_DATA, 'tfidf.csv'))

    # --- Fold features: fit on train only, then transform test ----------
    vectorizer = _new_vectorizer()
    X_train = vectorizer.fit_transform(list(train[X_col].str.lower()))
    X_test = vectorizer.transform(list(test[X_col].str.lower()))
    X_train, y_train = csr_matrix(X_train), train[y_col]
    X_test, y_test = csr_matrix(X_test), test[y_col]
    non_sparse = ['gaussianNB', 'lda']
    if method in non_sparse:
        X_train, X_test = X_train.toarray(), X_test.toarray()
    return X_train, y_train, X_test, y_test

In [None]:
# Run the TF-IDF baselines. The report goes into the results folder
# (OUTPUT_DIR); an absolute path here would make os.path.join ignore OUTPUT_DIR.
df = pd.read_csv(DATA_PATH)
results = sklearn_models(df, X_col, y_col, os.path.join(OUTPUT_DIR, 'tfidf_vectorizer.csv'), generate_Xy)

logreg done!
knn done!
gaussianNB done!
bernoulliNB done!
adaboost done!
gradient-boosting done!
dec-tree done!
linear-svc done!
c-svc done!


## Feature rich prediction

In [None]:
def extract_harbingers(df, X_col, lexicon_path='/content/drive/MyDrive/Colab_Notebooks/deliberative-politics-main/lexica/2015_Diplomacy_lexicon.json'):
    """Add lexicon-based count features to `df` in place.

    The lexicon is a JSON object, read from the first line of `lexicon_path`,
    that maps a feature name to a list of phrases. For every feature a column
    of that name is added to `df`, holding the number of (substring)
    occurrences of the feature's phrases in the cleaned text. The cleaned text
    itself is stored in a new `clean_text` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; nothing is returned.
    X_col : str
        Name of the column holding the raw text.
    lexicon_path : str, optional
        Location of the lexicon file; defaults to the path used by this notebook.
    """
    with open(lexicon_path) as f:
        lexicon = json.loads(f.readline())

    features = {}
    for feature, phrases in lexicon.items():
        normalised = (phrase.encode('ascii', 'ignore').decode('ascii').lower() for phrase in phrases)
        # A phrase made only of non-ASCII characters becomes '' here, and
        # str.count('') returns len(text) + 1, which would inflate every row.
        features[feature] = [phrase for phrase in normalised if phrase]

    tag_pattern = re.compile('<.*?>')

    def clean_text(text):
        # Lower-case, drop apostrophes, markup, URLs and digits, then keep only word characters.
        text = str(text)
        text = text.replace('\'', '')
        text = text.lower()
        text = text.replace('{html}', "")
        text = tag_pattern.sub('', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub('[0-9]+', '', text)
        # Same tokens as nltk's RegexpTokenizer(r'\w+'), which wraps re.findall.
        return " ".join(re.findall(r'\w+', text))

    def get_feature_frequency(text, feature):
        return sum(text.count(harbinger) for harbinger in features[feature])

    df['clean_text'] = df[X_col].map(clean_text)
    for feature in features:
        df[feature] = df['clean_text'].map(lambda text, feature=feature: get_feature_frequency(text, feature))


In [None]:
!pip install convokit
from convokit import Corpus, Speaker, Utterance
from convokit import download
from convokit import TextParser
from convokit import PolitenessStrategies
ps = PolitenessStrategies()
spacy_nlp = spacy.load('en_core_web_sm', disable=['ner'])
cols = list(ps.transform_utterance("hello, could you please help me proofread this article?", spacy_nlp=spacy_nlp).meta['politeness_strategies'])

def extract_politeness_feats(df, X_col):
    """Add one column per politeness feature (names taken from `cols`) to `df` in place.

    Each row's text in `X_col` is passed through the module-level
    `ps.transform_utterance`; the values stored under
    `meta['politeness_strategies']` are written to the columns named in `cols`.
    """

    def _strategy_values(record):
        parsed = ps.transform_utterance(record[X_col], spacy_nlp=spacy_nlp)
        strategies = parsed.meta['politeness_strategies']
        return pd.Series([strategies[name] for name in cols])

    politeness = df.apply(_strategy_values, axis=1)
    df[cols] = politeness



Collecting convokit
  Downloading convokit-3.0.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.2/183.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl (6.9 kB)
Collecting dill>=0.2.9 (from convokit)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m27.9 MB/s[0m eta [

In [None]:
# List the harbinger and politeness feature names used as model inputs
with open('/content/drive/MyDrive/Colab_Notebooks/deliberative-politics-main/lexica/2015_Diplomacy_lexicon.json') as f:
    harb_dict = json.loads(f.readline())
main_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/deliberative-politics-main/lexica/politeness_list.csv')
# The header cells of politeness_list.csv carry stray spaces and single quotes
# (e.g. " 'feature_politeness_==Please_start=='"). Strip them so the names can
# match the politeness columns added to the data frame; with the quotes left
# in, none of them is ever found and the politeness features are dropped.
politeness_cols = [name.strip().strip("'\"") for name in main_df.columns]
X_cols = politeness_cols + list(harb_dict.keys())
print(X_cols)

["'feature_politeness_==Please=='", " 'feature_politeness_==Please_start=='", " 'feature_politeness_==HASHEDGE=='", " 'feature_politeness_==Indirect_(btw)=='", " 'feature_politeness_==Hedges=='", " 'feature_politeness_==Factuality=='", " 'feature_politeness_==Deference=='", " 'feature_politeness_==Gratitude=='", " 'feature_politeness_==Apologizing=='", " 'feature_politeness_==1st_person_pl.=='", " 'feature_politeness_==1st_person=='", " 'feature_politeness_==1st_person_start=='", " 'feature_politeness_==2nd_person=='", " 'feature_politeness_==2nd_person_start=='", " 'feature_politeness_==Indirect_(greeting)=='", " 'feature_politeness_==Direct_question=='", " 'feature_politeness_==Direct_start=='", " 'feature_politeness_==HASPOSITIVE=='", " 'feature_politeness_==HASNEGATIVE=='", " 'feature_politeness_==SUBJUNCTIVE=='", " 'feature_politeness_==INDICATIVE=='", 'claim', 'disc_temporal_rest', 'allsubj', 'disc_expansion', 'disc_contingency', 'premise', 'disc_temporal_future', 'disc_compariso

In [None]:
def generate_Xy(train, test, **kwargs):
    """Build standardized lexicon/politeness features for one cross-validation fold.

    Uses every name in the module-level `X_cols` that is a column of `train`.
    The scaler is fitted on `train` only and then applied to `test`.

    Keyword arguments
    -----------------
    y_col : str  name of the label column (other keyword arguments are ignored)

    Returns
    -------
    X_train, y_train, X_test, y_test

    Raises
    ------
    ValueError
        If none of the names in `X_cols` is a column of `train`.
    """
    global printed
    available = set(train.columns)
    X_cols_filt = [x for x in X_cols if x in available]
    X_cols_nf = [x for x in X_cols if x not in available]
    # Warn once per run, and only when something is actually missing.
    # `printed` is the module-level flag the calling cell resets to False.
    if X_cols_nf and not globals().get('printed', False):
        print('[WARNING!!!] Couldnt find', X_cols_nf)
        printed = True
    if not X_cols_filt:
        raise ValueError('None of the expected feature columns are present in the data frame.')

    scaler = StandardScaler()
    X_train = scaler.fit_transform(train[X_cols_filt].to_numpy())
    y_train = train[kwargs['y_col']]
    X_test = scaler.transform(test[X_cols_filt].to_numpy())
    y_test = test[kwargs['y_col']]
    return X_train, y_train, X_test, y_test

def extract_feats(df, X_col):
    """Add harbinger and politeness feature columns to `df` in place and save the result.

    The enriched frame is written to `harbingers_and_politeness.csv` inside the
    MODIFIED_DATA folder. Only a relative file name is joined: an absolute path
    would make os.path.join ignore MODIFIED_DATA.
    """
    extract_harbingers(df, X_col)
    extract_politeness_feats(df, X_col)
    df.to_csv(os.path.join(MODIFIED_DATA, 'harbingers_and_politeness.csv'))

In [None]:
# Reset the one-time missing-column warning, extract the features, then run the
# baselines. The report goes into the results folder (OUTPUT_DIR); an absolute
# path here would make os.path.join ignore OUTPUT_DIR.
printed = False
df = pd.read_csv(DATA_PATH)
extract_feats(df, X_col)
results = sklearn_models(df, X_col, y_col, os.path.join(OUTPUT_DIR, 'liwc_harbingers_politeness.csv'), generate_Xy)

logreg done!
knn done!
gaussianNB done!
bernoulliNB done!
adaboost done!
gradient-boosting done!
dec-tree done!
linear-svc done!
c-svc done!


In [None]:
# Display the averaged cross-validation metrics returned by the most recent sklearn_models run.
results

Unnamed: 0,method,accuracy,f1,precision,recall,macro-f1,minority-f1
0,logreg,0.566667,0.396667,0.366667,0.45,0.461667,0.396667
1,knn,0.475,0.25,0.3,0.25,0.388333,0.25
2,gaussianNB,0.641667,0.233333,0.3,0.2,0.491667,0.233333
3,bernoulliNB,0.616667,0.216667,0.25,0.2,0.468333,0.216667
4,adaboost,0.816667,0.76,0.783333,0.8,0.76,0.76
5,gradient-boosting,0.816667,0.76,0.783333,0.8,0.76,0.76
6,dec-tree,0.816667,0.76,0.783333,0.8,0.76,0.76
7,linear-svc,0.666667,0.63,0.566667,0.75,0.605,0.63
8,c-svc,0.691667,0.546667,0.566667,0.55,0.593333,0.546667
