In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from helper_funcs import report_f1_results, report_accuracy, split_data

# Read in the dataframe(s)

In [65]:
# Load the three cleaned CSV files, in order: the frame that becomes the
# training corpus (`df`), the PFG frame that is appended to it, and the frame
# holding the conversations that are scored at the end of the notebook.
df, pfg_df, our_df = (
    pd.read_csv(csv_path)
    for csv_path in ('cmv_df_cleaned.csv', 'pfg_df_cleaned.csv', 'our_df_cleaned.csv')
)

### Combine our CMV and PFG dataframes into a single dataframe

In [3]:
# Stack the PFG rows underneath the CMV rows and renumber the index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and yields the same frame.
# Note: re-running this cell appends `pfg_df` again, because `df` is rebound.
df = pd.concat([df, pfg_df], ignore_index=True)

# Filter out the rows that have NaN as a label

### Report the size of the combined dataframe before and after filtering

In [4]:
# Keep only the rows whose 'success' value is exactly 0 or 1; every other row
# (for example one with a missing label) is dropped.
size_before = df.shape[0]
has_binary_label = df['success'].eq(0) | df['success'].eq(1)
df = df[has_binary_label]
size_after = df.shape[0]
print(f"Size of the dataframe has been reduced from {size_before} down to {size_after}")

Size of the dataframe has been reduced from 111613 down to 29465


# Shuffle the dataframe

In [5]:
# Shuffle the rows with a fixed seed. Without `random_state` every run puts
# different rows in each chunk, so the models trained and saved below (and the
# accuracies reported for them) could not be reproduced after a kernel restart.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Drop the final row of the shuffled frame.
# NOTE(review): presumably this makes the row count divisible by the number of
# chunks used further down (29465 rows -> 29464 = 8 * 3683) -- confirm.
# Re-running this cell drops one more row each time.
df = df[:-1]

# Generate the labels array

In [6]:
# Pull the 'success' column out as a NumPy array, row-aligned with `df`.
labels = np.array(df['success'].tolist())

# Split the dataframe and labels array into smaller chunks

In [7]:
num_chunks = 8

# Slice the frame positionally instead of calling np.split on it: np.split on
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# Like np.split, insist on an exact division so that every frame chunk lines
# up with the label chunk produced below.
if len(df) % num_chunks != 0:
    raise ValueError(
        f"Cannot split {len(df)} rows into {num_chunks} equally sized chunks"
    )
chunk_size = len(df) // num_chunks
dfs = [
    df.iloc[chunk_no * chunk_size:(chunk_no + 1) * chunk_size]
    for chunk_no in range(num_chunks)
]

# `labels` is rebound from one flat array to a list of `num_chunks` arrays;
# later cells index it as labels[idx], so the name is kept.
labels = np.split(labels, num_chunks)

# Bag of Words Feature Extraction

### 1. Create a list of unique words in all persuader conversations

In [8]:
# Collect every whitespace-separated token that occurs in any conversation.
vocabulary = {word for conversation in df["conversation"] for word in conversation.split()}

# Sort the vocabulary so each word always maps to the same feature column.
# Iteration order of a set of strings depends on per-process hash
# randomization, so `list(set(...))` can give a different column layout in
# every kernel session -- which silently breaks models that are pickled in one
# session and reloaded in another.
unique_words = sorted(vocabulary)

# word -> column index in the bag-of-words feature vector
unique_words_dict = {word: idx for idx, word in enumerate(unique_words)}

### 2. Use the unique words to create our BoW feature matrix

In [9]:
def generate_bow_feature(conversation):
    """Return the bag-of-words count vector for one conversation.

    The conversation is split on whitespace. Each token found in
    ``unique_words_dict`` increments the count at that token's column; tokens
    that are not in the vocabulary are ignored. The result is a NumPy array
    with one entry per word in ``unique_words``.
    """
    counts = [0] * len(unique_words)
    for token in conversation.split():
        column = unique_words_dict.get(token)
        if column is None:
            # Out-of-vocabulary token: contributes nothing.
            continue
        counts[column] += 1

    return np.array(counts)

# Build one count matrix per chunk; row order follows the chunk's
# 'conversation' column.
bow_matrices = []

for chunk in dfs:
    chunk_rows = [generate_bow_feature(text) for text in chunk["conversation"]]
    bow_matrices.append(np.array(chunk_rows))

# sklearn SVM Model

In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

### 1. Write the training function

In [11]:
def train(features, labels):
    """Fit an RBF-kernel SVC on the given features/labels and return it.

    The hyper-parameters are fixed at C=1000 and gamma=0.001.
    """
    # A grid search over the values below was left commented out in the
    # original cell; the grid is kept here for reference only.
    # parameters = [{'kernel': ['rbf'], 'C':[1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}]
    classifier = SVC(gamma=0.001, C=1000, kernel='rbf')

    # fit() returns the estimator itself, so the fitted model is handed back directly.
    return classifier.fit(features, labels)

### 2. Write the testing function

In [12]:
def test(features, labels, model, feature_type="No Feature Type Provided"):
    """Predict on ``features`` with ``model`` and report the accuracy.

    The accuracy of the predictions against ``labels`` is passed to
    ``report_accuracy`` together with ``feature_type`` and the fixed model
    name "sklearn SVM". Nothing is returned.
    """
    predicted = model.predict(features)
    expected = list(labels)
    report_accuracy(accuracy_score(expected, predicted), feature_type, "sklearn SVM")

# Split data into testing and training subsets

In [13]:
# Per-chunk train/test partitions; position i in every list belongs to chunk i.
train_xs, test_xs, train_ys, test_ys = [], [], [], []

for chunk_no in range(len(bow_matrices)):
    # split_data hands back (train_x, test_x, train_y, test_y) for one chunk.
    train_x, test_x, train_y, test_y = split_data(bow_matrices[chunk_no], labels[chunk_no])

    train_xs.append(train_x)
    test_xs.append(test_x)
    train_ys.append(train_y)
    test_ys.append(test_y)


# Testing the sklearn SVM Model

### 1. Train the model

In [14]:
# joblib.dump does not create missing directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("./models", exist_ok=True)

# Train one model per chunk and persist it to ./models/<chunk index>_bow_svm_model.pkl.
for idx, (x, y) in enumerate(zip(train_xs, train_ys)):
    print(f"Training model {idx + 1} / {len(train_xs)}", end='\r')
    model = train(x, y)
    joblib.dump(model, f"./models/{idx}_bow_svm_model.pkl")
    # Release the fitted model before training the next one (may not be needed).
    del model

Training model 8 / 8

In [15]:
# joblib.dump(model, "./models/0_tfidf_svm_model.pkl")
# joblib.load("./models/0_tfidf_svm_model.pkl")

### 2. Test the model

In [16]:
# Reload each saved model and score it on the held-out part of its own chunk.
for chunk_no, (held_out_x, held_out_y) in enumerate(zip(test_xs, test_ys)):
    model = joblib.load(f"./models/{chunk_no}_bow_svm_model.pkl")
    test(held_out_x, held_out_y, model, f"Bag of Words Model #{chunk_no + 1}")
    # Release the loaded model before the next iteration (may not be needed).
    del model

[Bag of Words Model #1]	[sklearn SVM]		Accuracy: 78.15%
[Bag of Words Model #2]	[sklearn SVM]		Accuracy: 75.58%
[Bag of Words Model #3]	[sklearn SVM]		Accuracy: 78.02%
[Bag of Words Model #4]	[sklearn SVM]		Accuracy: 78.43%
[Bag of Words Model #5]	[sklearn SVM]		Accuracy: 79.10%
[Bag of Words Model #6]	[sklearn SVM]		Accuracy: 78.56%
[Bag of Words Model #7]	[sklearn SVM]		Accuracy: 75.98%
[Bag of Words Model #8]	[sklearn SVM]		Accuracy: 81.14%


# Get predictions for our dataset

In [32]:
# Load the saved model with index 1 (the second of the chunk models).
model = joblib.load("./models/1_bow_svm_model.pkl")

### 1. Generate the features for our conversations

In [54]:
# Ground-truth labels that get_accuracy compares the predictions against,
# position by position. The two commented-out lists below are earlier
# variants with 18 and 8 entries.
# NOTE(review): presumably one entry per row of our_df, in row order -- confirm.
# our_labels = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]
# our_labels = [0, 0, 0, 0, 1, 0, 0, 1]
our_labels = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [62]:
def get_accuracy(Y, expected=None):
    """Return the percentage (0-100) of predictions that match the true labels.

    Args:
        Y: Iterable of predicted labels.
        expected: Sequence of true labels, position-aligned with ``Y``. When
            omitted, the notebook-level ``our_labels`` list is used.

    Returns:
        The share of positions where prediction and label are equal, as a
        float percentage. ``0.0`` when there are no labels at all.

    Raises:
        ValueError: If the number of predictions differs from the number of
            labels. Previously fewer predictions than labels silently
            produced a wrong accuracy, and more predictions raised IndexError.
    """
    if expected is None:
        expected = our_labels

    predicted = list(Y)
    total = len(expected)
    if len(predicted) != total:
        raise ValueError(
            f"Got {len(predicted)} predictions for {total} labels; "
            "they must be the same length"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = sum(1 for guess, truth in zip(predicted, expected) if guess == truth)
    return (correct / total) * 100


In [44]:
# The features do not depend on the model, so build them once instead of
# rebuilding the whole list for every model that is loaded.
# NOTE(review): the trailing [:-1] drops the last vocabulary column --
# presumably to match the feature count the saved models were fitted with;
# confirm, as a vocabulary/model mismatch would be hidden by it.
our_features = [
    generate_bow_feature(conversation)[:-1]
    for conversation in our_df["conversation"]
]

# NOTE(review): the range starts at 1, so ./models/0_bow_svm_model.pkl is
# never evaluated here -- confirm this is intentional.
for i in range(1, 8):
    model = joblib.load(f"./models/{i}_bow_svm_model.pkl")
    predictions = model.predict(our_features)
    print(f"[{get_accuracy(predictions):.2f}%] {predictions}")

[55.56%] [1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1.]
[55.56%] [1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1.]
[22.22%] [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[77.78%] [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.]
[77.78%] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]
[83.33%] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[72.22%] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]


In [52]:
# Same evaluation loop as the cell before it, re-run against the `our_df` /
# `our_labels` that were bound in the kernel at the time.
# The features do not depend on the model, so build them once instead of
# rebuilding the whole list for every model that is loaded.
# NOTE(review): the trailing [:-1] drops the last vocabulary column --
# presumably to match the feature count the saved models were fitted with;
# confirm, as a vocabulary/model mismatch would be hidden by it.
our_features = [
    generate_bow_feature(conversation)[:-1]
    for conversation in our_df["conversation"]
]

# NOTE(review): the range starts at 1, so ./models/0_bow_svm_model.pkl is
# never evaluated here -- confirm this is intentional.
for i in range(1, 8):
    model = joblib.load(f"./models/{i}_bow_svm_model.pkl")
    predictions = model.predict(our_features)
    print(f"[{get_accuracy(predictions):.2f}%] {predictions}")

[75.00%] [1. 0. 0. 0. 1. 0. 0. 0.]
[75.00%] [1. 0. 0. 0. 1. 1. 0. 1.]
[25.00%] [1. 1. 1. 1. 1. 1. 1. 1.]
[75.00%] [0. 0. 0. 0. 1. 1. 0. 0.]
[87.50%] [0. 0. 0. 0. 1. 0. 0. 0.]
[87.50%] [0. 0. 0. 0. 1. 0. 0. 0.]
[87.50%] [0. 0. 0. 0. 1. 0. 0. 0.]


In [67]:
# Same evaluation loop again, re-run against the `our_df` / `our_labels` that
# were bound in the kernel at the time.
# NOTE(review): the captured output below also shows a bare "30" per model
# that this code does not print -- the output looks stale; re-run to refresh.
# The features do not depend on the model, so build them once instead of
# rebuilding the whole list for every model that is loaded.
# NOTE(review): the trailing [:-1] drops the last vocabulary column --
# presumably to match the feature count the saved models were fitted with;
# confirm, as a vocabulary/model mismatch would be hidden by it.
our_features = [
    generate_bow_feature(conversation)[:-1]
    for conversation in our_df["conversation"]
]

# NOTE(review): the range starts at 1, so ./models/0_bow_svm_model.pkl is
# never evaluated here -- confirm this is intentional.
for i in range(1, 8):
    model = joblib.load(f"./models/{i}_bow_svm_model.pkl")
    predictions = model.predict(our_features)
    print(f"[{get_accuracy(predictions):.2f}%] {predictions}")

30
[53.33%] [1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 0. 1. 0.]
30
[43.33%] [1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 1. 1.]
30
[13.33%] [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
30
[76.67%] [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0.]
30
[80.00%] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0.]
30
[80.00%] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0.]
30
[76.67%] [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0.]


In [34]:
# Predict with whichever `model` and `our_features` are currently bound in the
# kernel; neither is defined in this cell.
# NOTE(review): the execution count (34) is lower than that of the loops above,
# so this probably ran against the single model loaded earlier -- confirm.
our_predictions = model.predict(our_features)

In [35]:
# Display the predictions computed in the previous cell.
our_predictions

array([1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.])

In [31]:
# Show the last rows of our_df to inspect the conversations being scored.
our_df.tail()

Unnamed: 0,conversation
12,hello today well thank plan day really live ch...
13,want thank much welcome hope great day thank m...
14,wow plant grow quickly know excited see look l...
15,wu wu tang clan american hip hop group formed ...
16,awesome angry yesterday go back house working ...
