In [1]:
import os

import numpy as np
import pandas as pd

# Read in the dialogue and info dataframes

In [2]:
# Load the two cleaned CSV exports (dialogue first, then info) into dataframes
dialogue_df, info_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dialogue_df_cleaned.csv', 'info_df_cleaned.csv')
)

# Filter the dialogue dataframe for the persuader's utterances

In [3]:
# Keep only the rows whose `role` column equals 0 (the persuader's utterances)
persuader_df = dialogue_df.loc[dialogue_df["role"] == 0]

# Generate the labels array

In [4]:
# Distinct values of er_label_1, in order of first appearance
persuasion_styles = list(persuader_df["er_label_1"].unique())

# style -> integer id, where the id is the style's position in persuasion_styles
persuader_styles_map = dict(zip(persuasion_styles, range(len(persuasion_styles))))

# One integer label per persuader utterance, in dataframe row order
labels = np.array([persuader_styles_map[label] for label in persuader_df["er_label_1"]])

### 1. Write the labels array to a text file

In [5]:
# np.savetxt does not create missing directories, so make sure the output
# folder exists before the first feature file is written.
os.makedirs("./features", exist_ok=True)
np.savetxt("./features/labels.out", labels)

# TF-IDF Feature Extraction

### 1. Use sklearn to get the TF-IDF feature matrix

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [7]:
# Default TF-IDF vectorizer; it stays available as `vectorizer` after fitting
vectorizer = TfidfVectorizer()

# Fit on the persuader sentences and convert the result to a dense array.
# Shape is N x M: N = # of persuader utterances, M = size of the vocabulary learned by the vectorizer
tfidf_feature_matrix = vectorizer.fit_transform(persuader_df['sentence']).toarray()

### 2. Write the TF-IDF feature matrix to a text file

In [18]:
# Persist the dense TF-IDF matrix as plain text
np.savetxt(fname="./features/tfidf.out", X=tfidf_feature_matrix)

# Word2Vec Feature Extraction

### 1. Use gensim to load the Word2Vec model

In [9]:
import gensim.downloader as api
# Identifier of the pretrained model requested from the gensim downloader
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
wv = api.load(WORD2VEC_MODEL_NAME)

### 2. Capture the Word2Vec features and use the AVERAGE as the feature vectors

In [10]:
# Matrix dimensions: one row per persuader utterance, one column per embedding component
N = len(persuader_df["sentence"])
M = len(wv[0])

# Preallocate the N x M feature matrix; every row is overwritten in the loop below
wv_avg_feature_matrix = np.empty((N, M))

for row, text in enumerate(persuader_df["sentence"]):
    # Embeddings of the whitespace-separated tokens that the Word2Vec model knows
    known_vectors = [wv[token] for token in text.split() if token in wv]

    # Average the embeddings. When no token matched, the sum is 0 and the
    # divisor falls back to 1, so the whole row is filled with zeros.
    divisor = len(known_vectors) or 1
    wv_avg_feature_matrix[row] = sum(known_vectors) / divisor

### 3. Write the averaged Word2Vec feature matrix to a text file

In [11]:
# Persist the averaged Word2Vec feature matrix as plain text
np.savetxt(fname="./features/word2vec_averaged.out", X=wv_avg_feature_matrix)

### 4. Capture the Word2Vec features and use the FIRST 10 VECTORS as the feature vectors

In [12]:
'''
This feature matrix has terrible performance, so we can probably just ignore it

C = 10

# Create an empty N x M x 10 numpy array to represent our feature matrix
wv_ten_feature_matrix = np.empty([N, M * C])

# Iterate through each review in the DataFrame
for i, utterance in enumerate(persuader_df["sentence"]):
    # Store the Word2Vec vector for the first 10 valid words in the review
    vec_list = np.empty([M * C])
    ctr = 0
    for word in utterance.split():
        if ctr >= M * C: break
        if word not in wv: continue
        for val in wv[word]: 
            vec_list[ctr] = val
            ctr += 1

    # Append our feature to the feature matrix
    wv_ten_feature_matrix[i] = vec_list

'''

### 5. Write the first ten Word2Vec feature matrix to a text file

In [13]:
# np.savetxt("./features/word2vec_first_ten.out", wv_ten_feature_matrix)

# Bag of Words Feature Extraction

### 1. Create a list of unique words in all persuader utterances

In [14]:
# Vocabulary of every whitespace-separated token in the persuader utterances.
# The set is sorted because a bare set() iterates strings in hash order, which
# changes between kernel sessions; sorting keeps each word's column index the
# same on every run, so saved feature matrices stay comparable.
unique_words = sorted({word for utterance in persuader_df["sentence"] for word in utterance.split()})

# word -> column index in the bag-of-words feature vector
unique_words_dict = {word: idx for idx, word in enumerate(unique_words)}

### 2. Use the unique words to create our BoW feature matrix

In [17]:
def generate_bow_feature(utterance, vocab=None):
    """Return the bag-of-words count vector for a single utterance.

    Parameters
    ----------
    utterance : str
        Text to encode; it is tokenised with ``str.split()``.
    vocab : dict, optional
        Mapping of word -> column index. Defaults to the notebook-level
        ``unique_words_dict`` so existing one-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``len(vocab)`` where position ``vocab[w]``
        holds the number of times ``w`` occurs in the utterance. Words that
        are not in the vocabulary are ignored.
    """
    if vocab is None:
        vocab = unique_words_dict

    feature = np.zeros(len(vocab), dtype=int)
    for word in utterance.split():
        # Single lookup; None marks an out-of-vocabulary word
        idx = vocab.get(word)
        if idx is not None:
            feature[idx] += 1

    return feature

# Stack the per-utterance bag-of-words vectors into one matrix (one row per utterance)
bow_feature_matrix = np.array(list(map(generate_bow_feature, persuader_df["sentence"])))

### 3. Write the BoW feature matrix to a text file

In [19]:
# Persist the bag-of-words feature matrix as plain text
np.savetxt(fname="./features/bag_of_words.out", X=bow_feature_matrix)

# Bigram Feature Extraction

### 1. Generate a bigram-to-index mapping for all bigrams in the utterances

In [20]:
# Collect every DISTINCT bigram (pair of adjacent whitespace-separated tokens).
# Keeping one entry per occurrence would make the feature width equal to the
# total number of bigram occurrences, with most columns permanently zero.
bigram_set = set()
for utterance in persuader_df["sentence"]:
    tokens = utterance.split()  # split once and reuse for both sides of the pair
    bigram_set.update(zip(tokens, tokens[1:]))

# Sorted so that each bigram gets the same column index on every run
bigrams = sorted(bigram_set)

# bigram -> column index in the bigram feature vector
bigrams_dict = {bigram: idx for idx, bigram in enumerate(bigrams)}

### 2. Convert the utterances to a bigram feature matrix

In [24]:
def generate_bigram_feature(utterance, vocab=None):
    """Return the bigram count vector for a single utterance.

    Parameters
    ----------
    utterance : str
        Text to encode; it is tokenised with ``str.split()`` and every pair
        of adjacent tokens forms one bigram.
    vocab : dict, optional
        Mapping of ``(first_word, second_word)`` -> column index. Defaults to
        the notebook-level ``bigrams_dict`` so existing one-argument calls
        keep working.

    Returns
    -------
    numpy.ndarray
        Integer vector where position ``vocab[b]`` holds the number of times
        bigram ``b`` occurs in the utterance. Bigrams that are not in the
        vocabulary are ignored. The length is ``len(bigrams)`` when the
        default vocabulary is used and ``len(vocab)`` otherwise.
    """
    if vocab is None:
        # Width follows the notebook-level `bigrams` list so that every index
        # stored in bigrams_dict is guaranteed to be in range.
        vocab, n_features = bigrams_dict, len(bigrams)
    else:
        n_features = len(vocab)

    feature = np.zeros(n_features, dtype=int)

    tokens = utterance.split()
    for bigram in zip(tokens, tokens[1:]):
        # Single lookup; None marks a bigram outside the vocabulary
        bigram_idx = vocab.get(bigram)
        if bigram_idx is not None:
            feature[bigram_idx] += 1

    return feature

# Stack the per-utterance bigram vectors into one matrix (one row per utterance)
bigram_feature_matrix = np.array(list(map(generate_bigram_feature, persuader_df["sentence"])))

### 3. Write the Bigrams feature matrix to a text file

In [25]:
# Write the bigram matrix (previously the bag-of-words matrix was written here by mistake)
np.savetxt("./features/bigrams.out", bigram_feature_matrix)