In [None]:
import os
from Dataset import SpeechDataset
from DataLoader import DataLoader
from Speech import Speech
from preprocessors import *
from utils import scrape_speeches
# Order matters: the module import must come first so that the name `tqdm`
# ends up bound to the progress-bar class, which is what the cells below call.
import tqdm
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline

In [None]:
# Every path used by the notebook is rooted at the kernel's working directory.
cwd = os.getcwd()

# Folder holding auxiliary files and the cached feature table.
resources_dir = f"{cwd}/resources"
# NOTE: despite the name, this is the path of the cached CSV file, not a directory.
saving_dir = f"{resources_dir}/dataset_all.csv"

# The two speech categories; each one names a sub-folder of `{cwd}/dataset`.
dataset_types = ["important", "typical"]

In [None]:
# Build the per-speech feature table once and cache it as a CSV file; later
# runs skip the expensive feature extraction and just load the cache.
if not os.path.exists(saving_dir):
    # Columns whose value comes straight from the matching `Speech.get_<name>()`
    # accessor. They have no other source, so a missing or failing accessor is
    # raised immediately instead of silently producing columns of unequal length.
    getter_columns = (
        "speaker", "title", "year", "content",
        "polarity", "subjectivity", "complexity", "lexical_richness",
        "mean_sentence_length",
    )

    # Key order defines the column order of the cached CSV:
    # four metadata columns first, the numeric features next, the label last.
    df_dict = {
        "speaker": [], "title": [], "year": [], "content": [],
        # One column per emotion label returned by `get_emotion_scores`; a label
        # without a key here raises KeyError below.
        # Assumes the classifier emits exactly these seven labels -- TODO confirm.
        "anger": [], "disgust": [], "fear": [], "joy": [], "neutral": [], "sadness": [], "surprise": [],
        "polarity": [], "subjectivity": [], "complexity": [], "lexical_richness": [],
        "entities_proportion_in_speech": [], "imagery_proportion_in_speech": [],
        "stopwords_proportion_in_speech": [], "mean_sentence_length": [],
        "label": []
    }

    # Word lists used for the "proportion in speech" features.
    imagery_words = pd.read_csv("resources/visual_words.csv", header=None)
    imagery_words = imagery_words[0].tolist()
    # NOTE(review): `spacy` is not imported explicitly in this notebook; it is
    # presumably re-exported by `from preprocessors import *` -- confirm.
    stop_words = list(spacy.load("en_core_web_md").Defaults.stop_words)

    for dataset_type in dataset_types:
        # "important" speeches are the positive class.
        label = 1.0 if dataset_type == "important" else 0.0
        path = f"{cwd}/dataset/{dataset_type}"
        dataset = SpeechDataset(path)
        dataloader = DataLoader(dataset)
        for speech in tqdm(dataloader, total=len(dataloader.dataset)):
            for key in getter_columns:
                df_dict[key].append(getattr(speech, f"get_{key}")())

            # `[0]` takes the first entry of the returned scores; each entry is
            # a dict with a "label" and a "score".
            emotions = speech.get_emotion_scores(return_all_scores=True)[0]
            for emotion in emotions:
                df_dict[emotion["label"]].append(emotion["score"])

            df_dict["entities_proportion_in_speech"].append(
                speech.get_proportion_in_speech(speech.get_entities()))
            df_dict["imagery_proportion_in_speech"].append(
                speech.get_proportion_in_speech(imagery_words))
            df_dict["stopwords_proportion_in_speech"].append(
                speech.get_proportion_in_speech(stop_words))
            df_dict["label"].append(label)

    # Every column must have received exactly one value per speech.
    column_lengths = {name: len(values) for name, values in df_dict.items()}
    if len(set(column_lengths.values())) > 1:
        raise ValueError(f"Feature columns have inconsistent lengths: {column_lengths}")

    os.makedirs(resources_dir, exist_ok=True)
    # The index is written on purpose: existing caches contain it, and the
    # positional feature slices used later in the notebook account for it.
    pd.DataFrame(df_dict).to_csv(saving_dir)

# Always load from the cache so that a fresh build and a cached run produce a
# frame with exactly the same column layout (including the CSV index column).
df = pd.read_csv(saving_dir)

### EDA

##### More speeches with high fear (> 0.2) in important than typical.
##### Fewer speeches with high joy (> 0.2) in important than typical.
##### Fewer speeches with high neutrality (> 0.2) in important than typical.
##### Fewer speeches with low surprise (< 0.2) in important than typical.
##### Fewer speeches with high polarity (> 0.15) in important than typical.
##### More speeches with high subjectivity (>0.4) in important than typical.

##### Fewer speeches with high proportion of entities (> 0.02) in important than typical.
##### Fewer speeches with high proportion of imagery words (>0.075) in important than in typical.
##### Fewer speeches with high proportion (> 0.05) of stopwords in important than in typical.
##### Fewer speeches with high complexity (>60) in important than in typical.
##### Fewer speeches with high lexical richness (>0.3) in important than in typical.
##### More speeches with high (>20) mean sentence length in important than in typical.

In [None]:
# Side-by-side bar charts of every numeric feature: important speeches on the
# left, typical speeches on the right.

# Feature columns are selected by name rather than by position, so the plots
# cover the same columns whether or not the frame carries the extra index
# column ("Unnamed: 0") that appears after a round trip through the CSV cache.
non_feature_columns = {"Unnamed: 0", "speaker", "title", "year", "content", "label"}
feature_columns = [name for name in df.columns if name not in non_feature_columns]

# Split once instead of re-filtering the frame for every plotted column.
speech_groups = (
    ("Important", df[df["label"] == 1]),
    ("Typical", df[df["label"] == 0]),
)

for colname in feature_columns:
    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    for ax, (group_name, group) in zip(axes, speech_groups):
        scores = group[colname].to_numpy()
        # The x axis is just the position of the speech within its group.
        ax.bar(range(len(scores)), scores)
        ax.set_xlabel("Speech index")
        ax.set_ylabel(f"{colname} score")
        ax.set_title(f"{group_name} Speeches vs {colname} Score")
    plt.show()
    # Release the figure so a long feature list does not accumulate open figures.
    plt.close(fig)

### Classification using SVM with RBF Kernel

##### Accuracy on test: 78.72%
##### Precision on test: 0.74
##### Sensitivity on test: 0.86
##### AUC on test: 0.78

In [None]:
# Train an RBF-kernel SVM on the numeric features with a cross-validated grid
# search, then evaluate the refitted best model on a held-out test split.

# Feature columns are selected by name rather than by position, so the model
# sees the same inputs whether or not the frame carries the extra index column
# ("Unnamed: 0") that appears after a round trip through the CSV cache.
non_feature_columns = {"Unnamed: 0", "speaker", "title", "year", "content", "label"}
feature_columns = [name for name in df.columns if name not in non_feature_columns]
data = df[feature_columns].to_numpy()
target = df["label"].to_numpy()

# Stratified 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    stratify=target,
    random_state=109,
)

# The `svm__` prefix routes each parameter to the pipeline step named "svm".
# The order of the candidate values is kept as-is because it decides which
# candidate wins when several tie on the cross-validation score.
param_grid = {'svm__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'svm__gamma': [100, 10, 1, 0.1, 0.2, 0.02, 0.002, 0.0002, 0.01, 0.001, 0.0001],
              'svm__kernel': ['rbf']}
# Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
pipeline = Pipeline(steps=[("StandardScaler", StandardScaler()), ("svm", svm.SVC())])
search = GridSearchCV(pipeline, param_grid, scoring="accuracy", cv=5, refit=True, verbose=0, n_jobs=5)
search.fit(X_train, y_train)

y_pred = search.predict(X_test)
# Continuous decision scores are needed for a meaningful ROC curve / AUC.
# Computing them from hard 0/1 predictions yields a single operating point, so
# the AUC printed here can differ from a value obtained that way.
y_score = search.decision_function(X_test)
auc = metrics.roc_auc_score(y_test, y_score)

# These are scores on the single held-out split, not averages.
print("Accuracy on test:", metrics.accuracy_score(y_test, y_pred))
print("Precision on test:", metrics.precision_score(y_test, y_pred))
print("Sensitivity on test:", metrics.recall_score(y_test, y_pred))
print("AUC on test:", auc)

cm = metrics.confusion_matrix(y_test, y_pred, labels=search.classes_)
cm_display = metrics.ConfusionMatrixDisplay(cm, display_labels=search.classes_).plot()

fpr, tpr, _ = metrics.roc_curve(y_test, y_score, pos_label=search.classes_[1])
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc).plot()