In [1]:
import convokit
from convokit import Corpus, download
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet', quiet=True)
import re

In [2]:
! pip install contractions
import contractions



## Read in the dataset and convert it to a pandas dataframe

In [3]:
# Load the 'persuasionforgood-corpus' dataset through ConvoKit's `download`
# helper and wrap it in a Corpus object used by the cells below.
corpus = Corpus(download('persuasionforgood-corpus'))

Dataset already exists at C:\Users\brent\.convokit\downloads\persuasionforgood-corpus


In [22]:
# Export the corpus as two pandas DataFrames. Later cells read the
# `conversation_id` and `text` columns of `utterances_df`, and the
# `meta.donation_er` / `meta.donation_ee` columns of `conversations_df`
# (looked up by its index).
utterances_df = corpus.get_utterances_dataframe()
conversations_df = corpus.get_conversations_dataframe()

### 1. Write a function to get full conversations as a single string

In [23]:
# Conversation IDs as returned by the corpus; they drive the per-conversation
# feature/label extraction further down.
conversation_ids = corpus.get_conversation_ids()
def get_conversation(conversation_id):
    """Return the utterance texts of one conversation as a single string.

    Selects the rows of the module-level ``utterances_df`` whose
    ``conversation_id`` column equals ``conversation_id`` and concatenates
    their ``text`` values in row order. Every utterance is prefixed with one
    space, so a non-empty result starts with a space; an ID with no matching
    rows yields the empty string.
    """
    belongs_to_conversation = utterances_df['conversation_id'] == conversation_id
    texts = list(utterances_df.loc[belongs_to_conversation, 'text'])
    return "".join(" " + text for text in texts)

### 2. Write a function to see if any amount was donated during a conversation

In [37]:
def get_conversation_donation(conversation_id):
    """Return 1 if any donation was recorded for the conversation, else 0.

    Reads ``meta.donation_er`` and ``meta.donation_ee`` from the row of the
    module-level ``conversations_df`` whose index equals ``conversation_id``.
    Missing or non-numeric amounts are counted as 0, so an absent value for
    one party cannot hide a donation made by the other (``NaN + x > 0`` is
    always False).

    Raises:
        IndexError: if no row of ``conversations_df`` has that index.
    """
    # Filter once and reuse the matching row for both columns.
    row = conversations_df[conversations_df.index == conversation_id]
    amounts = pd.to_numeric(
        pd.Series([
            row['meta.donation_er'].values[0],
            row['meta.donation_ee'].values[0],
        ]),
        errors='coerce',  # anything unparsable becomes NaN ...
    ).fillna(0)           # ... and NaN is treated as "nothing donated"

    return 1 if amounts.sum() > 0 else 0


### 3. Use the above functions to generate our raw features and labels

In [38]:
# Raw features (full conversation text) and labels (0/1 donation flag),
# both in the order of `conversation_ids`.
conversations = list(map(get_conversation, conversation_ids))
donations = list(map(get_conversation_donation, conversation_ids))

In [39]:
# Pair each conversation text with its label as a two-element [text, label] row.
df_data = list(map(list, zip(conversations, donations)))

### 4. Create a new dataframe out of our conversations

In [42]:
# Two-column frame: `conversation` holds the concatenated conversation text,
# `success` the 0/1 donation label produced by get_conversation_donation.
df = pd.DataFrame(df_data, columns=["conversation", "success"])

In [43]:
# Save the raw frame before any cleaning is applied; the row index is not written.
df.to_csv('pfg_df.csv', index=False)

## Data Cleaning

### 1. Convert utterances to lowercase

In [45]:
# Lowercase every conversation via pandas' string accessor; the column is
# overwritten in place, as in all cleaning steps below.
df['conversation'] = df['conversation'].str.lower()

### 2. Remove any HTML or URLs from the utterances

In [46]:
# Strip URLs first, then anything that looks like an HTML/XML tag.
# The patterns are raw string literals: `\(` and `\)` are regex escapes, but in
# a normal string literal they are invalid Python escape sequences and trigger
# a warning on recent Python versions. The compiled regexes are unchanged.
# NOTE(review): inside the URL pattern `$-_` is a character *range* (from `$`
# to `_`), not three literal characters; left as-is so matches do not change.
df['conversation'] = df['conversation'].apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', str(x)))
df['conversation'] = df['conversation'].apply(lambda x: re.sub(r'<[^<]+?>', '', str(x)))

### 3. Expand contractions

In [47]:
# Expand contractions with the `contractions` package. The function is passed
# directly because the wrapper lambda only forwarded its single argument.
df['conversation'] = df['conversation'].apply(contractions.fix)

### 4. Tokenization

In [48]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Tokenizer models used by sent_tokenize / word_tokenize. Older NLTK releases
# load the `punkt` resource, newer ones look up `punkt_tab` instead; request
# both so the tokenization cells work on a fresh environment with either.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

### 4.1. Sentence tokenization

In [49]:
# Split each conversation into sentences and join them back with single
# spaces; the column remains one string per conversation.
df['conversation'] = df['conversation'].apply(lambda text: ' '.join(sent_tokenize(text)))

### 4.2. Word tokenization

In [50]:
# Split each conversation into word tokens and join them back with single
# spaces, so later steps can recover the tokens with a plain str.split().
df['conversation'] = df['conversation'].apply(lambda text: ' '.join(word_tokenize(text)))

### 5. Remove non-alphanumeric characters

In [51]:
# Replace every character that is not an ASCII letter or digit with a space.
# The extra spaces this leaves behind disappear in the later steps, which
# split on whitespace and re-join with single spaces.
df['conversation'] = df['conversation'].apply(lambda x: re.sub('[^a-zA-Z0-9]', ' ', str(x)))

## Pre-processing

In [53]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 1. Remove the stop words

In [54]:
# The stop-word list is its own NLTK resource and is not downloaded anywhere
# else in this notebook (only `wordnet` and `punkt` are), so fetch it here to
# keep the cell runnable on a fresh environment.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')
# Membership is tested once per token: use a set for constant-time lookups,
# while `stop_words` itself stays a list for any later consumer.
stop_word_set = set(stop_words)
df['conversation'] = df['conversation'].apply(lambda x: ' '.join([w for w in x.split() if w not in stop_word_set]))

### 2. Perform stemming

In [200]:
# Stemming is currently disabled (commented out); the lemmatization step below
# runs instead. The column name is `conversation` -- the frame built above has
# no `text` column -- so the line works as written if it is re-enabled.
# stemmer = PorterStemmer()
# df['conversation'] = df['conversation'].apply(lambda x: ' '.join([stemmer.stem(w) for w in x.split()]))

### 3. Perform lemmatization

In [55]:
# Lemmatize each whitespace-separated token with NLTK's WordNet lemmatizer
# (called with its default arguments) and rebuild the string.
lemmatizer = nltk.WordNetLemmatizer()
df['conversation'] = df['conversation'].apply(
    lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
)

### 4. Remove extra whitespace

In [56]:
# Collapse every run of whitespace into a single space and drop leading and
# trailing whitespace (split() with no argument discards empty pieces).
df['conversation'] = df['conversation'].apply(lambda x: ' '.join(x.split()))

## Write the preprocessed dataframe to a .csv file

In [57]:
# Save the cleaned frame to a separate file from the raw 'pfg_df.csv' export
# written earlier; the row index is not written.
df.to_csv('pfg_df_cleaned.csv', index=False)