In [1]:
import convokit
from convokit import Corpus, download
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet', quiet=True)
import re

In [2]:
! pip install contractions
import contractions



## Read in the dataset and convert it to a pandas dataframe

In [None]:
# Fetch the 'winning-args-corpus' dataset through convokit's download helper
# and wrap the result in a Corpus object used by the rest of the notebook.
corpus = Corpus(download('winning-args-corpus'))

In [86]:
utterance_df = corpus.get_utterances_dataframe()
# Force the text and reply_to columns to plain strings
for str_column in ('text', 'reply_to'):
    utterance_df[str_column] = utterance_df[str_column].astype('str')

### 1. Get the conversation paths and save them to a file

In [43]:
utterance_ids = corpus.get_utterance_ids()
# Set copy of the ids: the recursive walk below tests membership once per reply,
# which is O(1) on a set instead of a linear scan of the list.
utterance_id_set = set(utterance_ids)

def get_branches(utterance_id, conversations, conversation, labels):
    """Collect every root-to-leaf reply path that starts at `utterance_id`.

    The walk is depth-first and recursive. `conversation` and `labels` hold the
    path currently being explored; they are extended on entry and restored
    (popped) before returning, so the caller's lists are unchanged afterwards.

    Args:
        utterance_id: id of the utterance to start from. Ids beginning with
            "t3" read their replies directly from meta['replies']; all other
            ids read them from meta['replies']['data']['children'].
        conversations: output list; one [path_ids, path_labels] pair (both
            copies) is appended for every completed path. Mutated in place.
        conversation: ids on the path walked so far.
        labels: meta['success'] values matching `conversation`.

    Returns:
        The same `conversations` list that was passed in.
    """
    meta = corpus.get_utterance(utterance_id).meta
    conversation.append(utterance_id)
    labels.append(meta['success'])

    raw_replies = meta['replies']
    if raw_replies is None or len(raw_replies) == 0:
        replies = []
    elif utterance_id[:2] == "t3":
        replies = raw_replies
    else:
        replies = raw_replies['data']['children']

    followed_any = False
    for reply in replies:
        # Reply ids are stored without the "t1_" prefix used by the corpus ids
        reply = "t1_" + reply
        if reply in utterance_id_set:
            followed_any = True
            get_branches(reply, conversations, conversation, labels)

    # A node is a leaf when it has no replies OR when none of its listed replies
    # exist in the corpus; in both cases the path ends here and must be recorded
    # (previously the second case dropped the path silently).
    if not followed_any:
        conversations.append([conversation.copy(), labels.copy()])

    conversation.pop()
    labels.pop()
    return conversations

In [28]:
# ctr = 0
# conversation_paths = []
# conversation_ids = corpus.get_conversation_ids()
# for conversation_id in conversation_ids:
#     ctr += 1
#     if ctr % 100 == 0:
#         print(f"{ctr} / {len(conversation_ids)}", end="\r")
#     branches = get_branches(conversation_id, [], [], [])
#     conversation_paths.append(branches)

3000 / 3051

In [48]:
# conversation_paths_np = np.array(conversation_paths, dtype='object')
# np.save('cmv_conversation_paths.npy', conversation_paths_np, allow_pickle=True)

### 2. Load the conversation paths from the .npy file

In [49]:
# NOTE: allow_pickle=True goes through pickle, which can execute arbitrary code
# on load; only read a .npy file that this notebook produced itself.
conversation_paths_np = np.load('cmv_conversation_paths.npy', allow_pickle=True)
# The cells below iterate over `conversation_paths`, which is otherwise only
# assigned in the commented-out generation cell; rebuild it from the loaded
# array so the notebook runs from a fresh kernel.
conversation_paths = conversation_paths_np.tolist()

In [141]:
# Lookup table from utterance id to that utterance's text
id_utterance_map = {utt_id: corpus.get_utterance(utt_id).text for utt_id in utterance_ids}

In [142]:
def get_conversation_from_path(utterance_ids, separator=" ", utterance_map=None):
    """Concatenate the texts of the utterances on a path into one string.

    Args:
        utterance_ids: ids of the utterances, in path order.
        separator: string placed between consecutive utterances. Defaults to a
            single space so the last word of one utterance is not fused with
            the first word of the next.
        utterance_map: mapping from utterance id to text. When omitted, the
            notebook-level `id_utterance_map` is used.

    Returns:
        The joined conversation text ("" for an empty path).

    Raises:
        KeyError: if an id is missing from the mapping.
    """
    if utterance_map is None:
        utterance_map = id_utterance_map
    # str.join builds the result in one pass instead of repeated concatenation
    return separator.join(utterance_map[utt_id] for utt_id in utterance_ids)

In [120]:
def get_label_from_path(labels):
    """Reduce the per-utterance labels of a path to a single majority label.

    Labels equal to 1 count as successes and labels equal to 0 as failures;
    None entries are ignored. Any other value is reported with a printed
    message. A line is also printed when the path holds both successes and
    failures.

    Returns:
        1 if successes outnumber failures, 0 if failures outnumber successes,
        and None when the two counts are equal (including when both are zero).
    """
    wins = 0
    losses = 0

    for outcome in labels:
        if outcome is None:
            continue
        if outcome == 1:
            wins += 1
        elif outcome == 0:
            losses += 1
        else:
            print(f"{outcome} is an unknown label")

    if wins and losses:
        print(f"Success = {wins}  and Failures = {losses}")

    if wins == losses:
        return None
    return 1 if wins > losses else 0

In [116]:
# Total number of paths across all conversation groups
total_num_conversations = sum(len(group) for group in conversation_paths)

### Create a new dataframe out of our concatenated conversations

In [185]:
# One row per path: column 0 holds the concatenated text, column 1 the path label
conversation_arr = np.array([["", 0]] * total_num_conversations, dtype='object')
idx_ctr = 0

num_groups = len(conversation_paths)
for group_number, conversation_group in enumerate(conversation_paths, start=1):
    print(f"{group_number} / {num_groups}", end="\r")
    for path, labels in conversation_group:
        conversation_arr[idx_ctr, 0] = get_conversation_from_path(path)
        conversation_arr[idx_ctr, 1] = get_label_from_path(labels)
        idx_ctr += 1

3050 / 3051

In [186]:
# Wrap the filled array in a DataFrame with named columns
conversation_df = pd.DataFrame(data=conversation_arr, columns=["conversation", "success"])

In [189]:
# Save the conversations and their labels as built above (before the cleaning
# steps that follow), without writing the index column.
conversation_df.to_csv('cmv_df.csv', index=False)

In [None]:
# Keep only the utterances whose success value is exactly 0 or 1
success_values = utterance_df['meta.success']
utterance_df = utterance_df[success_values.eq(0) | success_values.eq(1)]

In [None]:
# Display the memory usage that pandas reports for utterance_df
utterance_df.memory_usage()

In [None]:
# Keep only the second half of the rows (from the midpoint to the end).
# NOTE(review): this cell is not idempotent — every re-run halves utterance_df again.
# NOTE(review): no later live cell in this notebook view reads utterance_df; confirm this step is still needed.
utterance_df = utterance_df.iloc[len(utterance_df) // 2:]

## Data Cleaning

### 1. Convert utterances to lowercase

In [191]:
# Lowercase the full text of every conversation (overwrites the column in place)
conversation_df['conversation'] = conversation_df['conversation'].str.lower()

### 2. Remove any HTML or URLs from the utterances

In [192]:
# Raw strings keep the regex escapes \( and \) from being parsed as (invalid)
# string escape sequences; the patterns themselves are unchanged.
# Compiling once avoids re-resolving the pattern for every row.
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
html_tag_pattern = re.compile(r'<[^<]+?>')

# Strip URLs first, then anything that looks like an HTML tag
conversation_df['conversation'] = conversation_df['conversation'].apply(lambda text: url_pattern.sub('', str(text)))
conversation_df['conversation'] = conversation_df['conversation'].apply(lambda text: html_tag_pattern.sub('', str(text)))

### 3. Expand contractions

In [193]:
# Run contractions.fix over each conversation; the function is passed directly
conversation_df['conversation'] = conversation_df['conversation'].apply(contractions.fix)

### 4. Tokenization

In [194]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt', quiet=True)

True

### 4.1. Sentence tokenization

In [195]:
# Split each conversation into sentences and re-join them with single spaces
conversation_df['conversation'] = conversation_df['conversation'].apply(
    lambda text: ' '.join(sent_tokenize(text))
)

### 4.2. Word tokenization

In [196]:
# Split each conversation into word tokens and re-join them with single spaces
conversation_df['conversation'] = conversation_df['conversation'].apply(
    lambda text: ' '.join(word_tokenize(text))
)

### 5. Remove non-alphanumeric characters

In [197]:
# Replace every character that is not an ASCII letter or digit with a space
non_alnum_pattern = re.compile('[^a-zA-Z0-9]')
conversation_df['conversation'] = conversation_df['conversation'].apply(
    lambda text: non_alnum_pattern.sub(' ', str(text))
)

## Pre-processing

In [198]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 1. Remove the stop words

In [199]:
# The stopwords corpus must be available locally before stopwords.words() can
# read it; fetch it the same way the notebook fetches 'wordnet' and 'punkt'.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')
# Set copy gives O(1) membership checks per word; `stop_words` itself stays a list
stop_word_set = set(stop_words)
conversation_df['conversation'] = conversation_df['conversation'].apply(
    lambda text: ' '.join([w for w in text.split() if w not in stop_word_set])
)

### 2. Perform stemming

In [200]:
# stemmer = PorterStemmer()
# utterance_df['text'] = utterance_df['text'].apply(lambda x: ' '.join([stemmer.stem(w) for w in x.split()]))

### 3. Perform lemmatization

In [201]:
lemmatizer = nltk.WordNetLemmatizer()

def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

conversation_df['conversation'] = conversation_df['conversation'].apply(lemmatize_words)

### 4. Remove extra whitespace

In [202]:
# Collapse every run of whitespace into a single space and trim both ends
conversation_df['conversation'] = conversation_df['conversation'].apply(
    lambda text: ' '.join(text.split())
)

## Write the preprocessed dataframe to a .csv file

In [203]:
# Save the cleaned and pre-processed conversations, without writing the index column
conversation_df.to_csv('cmv_df_cleaned.csv', index=False)