This notebook cleans up the raw debates JSON: it arranges the debate transcripts, identifies the speakers and removes HTML. It covers only the subset of debates (92 out of 148) whose transcripts clearly indicate the participants. The remaining debates are handled in part 2, where the speakers have to be identified manually.

**TO DO**: 

Break each statement down into sentences and write them to a tabular CSV.

In [1]:
import json
import re

import numpy as np
import pandas as pd

In [2]:
# Load the raw scraped debates. Later cells index the result as
# debates[str(i)] with "text", "date", "desc" and "url" entries.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of depending on the platform's default text encoding.
with open("full_debates_raw.json", "r", encoding="utf-8") as f:
    debates = json.load(f)

In [3]:
def remove_html(text):
    """
    Return `text` with every HTML tag stripped out, keeping only the plain text.

    A tag is taken to be a "<", followed by one or more characters that are
    not "<", up to the nearest ">". Everything outside such spans is kept
    unchanged.
    """
    return re.sub(r"<[^<]+?>", "", text)


In [4]:
# Matches a name inside a participant description: either a word followed by
# one whitespace character and then "(" or "$" (e.g. "Sanders (VT)"), or a
# word followed directly by ";" (e.g. "Clinton;").
# NOTE(review): inside a character class "$" is a literal dollar sign, not an
# end-of-string anchor -- confirm that this is what was intended.
_SPEAKER_NAME_PATTERN = re.compile(r"\w+\s[\($]|\w+;")


def _extract_speaker_names(description):
    """Return the upper-cased names matched in a participant description."""
    # Every match ends with one delimiter character ("(", "$" or ";"): drop it,
    # then strip the whitespace that preceded a "(" or "$".
    return [
        match[:-1].upper().rstrip()
        for match in _SPEAKER_NAME_PATTERN.findall(description)
    ]


def get_speaker_list(debate_text):
    """
    Function to obtain list of candidates and moderators for a given debate text

    The transcript is split on "<b>". The text following ":</b>" in the first
    bold-labelled segment is used as the candidates description and the text
    following ":</b>" in the second one as the moderators description.

    Parameters
    ----------
    debate_text : str
        Raw debate transcript, still containing its HTML markup.

    Returns
    -------
    tuple
        (candidates, moderators, candidates_desc): two lists of upper-cased
        names and the candidates description with HTML tags removed.

    Raises
    ------
    IndexError
        If the transcript has fewer than two "<b>" segments, or if either of
        the first two segments does not contain ":</b>".
    """
    # Only the first two bold-labelled segments are ever read, so there is no
    # need to split the remainder of the transcript.
    header_parts = [
        segment.split(":</b>") for segment in debate_text.split("<b>")[1:3]
    ]
    candidates_desc = header_parts[0][1]
    moderators_desc = header_parts[1][1]
    candidates = _extract_speaker_names(candidates_desc)
    moderators = _extract_speaker_names(moderators_desc)
    return candidates, moderators, remove_html(candidates_desc)
    

In [5]:
def get_candidate_speech(debate_text):
    """
    Select only text of speech by candidates and ignore moderators. Cleanup, annotate and arrange.

    Parameters
    ----------
    debate_text : str
        Raw debate transcript, still containing its HTML markup.

    Returns
    -------
    tuple
        (candidate_speech, candidates_desc). `candidate_speech` is a list of
        [speaker, speech] pairs in transcript order, restricted to speakers
        that appear in the candidate list and with HTML tags removed from the
        speech. `candidates_desc` is the cleaned candidates description.

    Raises
    ------
    IndexError
        Propagated from `get_speaker_list` when the transcript header cannot
        be parsed.
    """
    candidate_list, _mod_list, candidates_desc = get_speaker_list(debate_text)
    # A set gives a single membership test per segment, so a statement is
    # recorded once even if a name appears twice in the candidate list.
    candidate_names = set(candidate_list)
    candidate_speech = []
    for segment in debate_text.split("<b>")[1:]:
        # Split on the FIRST colon only: the speaker label is in front of it
        # and everything after it is speech. Splitting on every colon and
        # re-joining would silently delete colons from the speech itself
        # (e.g. "at 3:00" would become "at 300").
        speaker, _sep, speech = segment.partition(":")
        if speaker in candidate_names:
            candidate_speech.append([speaker, remove_html(speech)])
    return candidate_speech, remove_html(candidates_desc)
                
                

In [6]:
# Spot-check the speaker parsing on a single debate (id "5").
get_speaker_list(debates["5"]["text"])

(['CLINTON', 'SANDERS'],
 ['BLITZER'],
 'Former Secretary of State Hillary Clinton;Senator Bernie Sanders (VT); ')

In [None]:
# Try the header-based speaker identification on every debate (ids 1..148)
# and print what it finds, to see for which debates the scheme works.
for i in range(1, 149):
    debate = debates[str(i)]
    try:
        candidates, moderators, candidates_desc = get_speaker_list(debate["text"])
    except Exception:
        # Parsing failed for this debate: print an empty speaker list.
        # `Exception` rather than a bare `except` so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        print(i, [], debate["url"])
    else:
        print(i, candidates, moderators, debate["url"])


### Select debates where this speaker identification scheme works

We'll devise a different scheme for the other debates. Some of them definitely require manual annotation.

In [8]:
# Debate ids handled in this notebook, written as half-open [start, stop)
# ranges; the single id 95 is the range (95, 96).
part1 = np.concatenate([
    np.arange(start, stop)
    for start, stop in [
        (1, 34), (38, 58), (62, 75), (76, 79),
        (81, 87), (88, 94), (95, 96), (115, 125),
    ]
])

In [9]:
# Display the selected debate ids.
part1

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  38,  39,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
        57,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
        74,  76,  77,  78,  81,  82,  83,  84,  85,  86,  88,  89,  90,
        91,  92,  93,  95, 115, 116, 117, 118, 119, 120, 121, 122, 123,
       124])

In [10]:
# Number of debates covered by this notebook.
len(part1)

92

In [11]:
# Cleaned debates, keyed by debate id as a string; filled in by the loop below.
debates_clean = {}

In [12]:
# Build one cleaned record per selected debate: the candidate-only speech and
# description, plus the metadata fields copied unchanged from the raw entry.
for debate_id in part1:
    key = str(debate_id)
    record = debates_clean[key] = {}
    speech, description = get_candidate_speech(debates[key]["text"])
    record["candidates_desc"] = description
    record["candidates_speech"] = speech
    for field in ("date", "desc", "url"):
        record[field] = debates[key][field]
    

In [13]:
# Persist the cleaned debates as JSON, indented by four spaces.
with open("debates_clean_part1.json", "w") as out_file:
    json.dump(debates_clean, out_file, indent=4)

In [114]:
def split_sentences(long_text, len_thresh=20):
    """
    Split a statement into rough sentences.

    Every "..." is first replaced by a single space, then the text is cut at
    each ".". Only fragments longer than `len_thresh` characters are kept.
    Fragments are returned as-is (surrounding whitespace is not trimmed), and
    "." is the only separator, so abbreviations and decimals are cut as well.

    Parameters
    ----------
    long_text : str
        The statement to split.
    len_thresh : int, optional
        Fragments of this length or shorter are discarded (default 20).

    Returns
    -------
    list of str
        The retained fragments, in their original order.
    """
    without_ellipses = long_text.replace("...", " ")
    return [
        fragment
        for fragment in without_ellipses.split(".")
        if len(fragment) > len_thresh
    ]
    
    

In [117]:
# Flatten every candidate statement into [speaker, year, sentence] rows.
speech_list = []
for debate_key in map(str, part1):
    record = debates_clean[debate_key]
    statements = record["candidates_speech"]
    debate_date = record["date"]
    for entry in statements:
        speaker, statement = entry[0], entry[1]
        # The year is read from the last four characters of the date string;
        # spaces are stripped on the left and all whitespace on the right.
        speech_list.extend(
            [speaker, int(debate_date[-4:]), sentence.lstrip(" ").rstrip()]
            for sentence in split_sentences(statement)
        )
        
    

In [118]:
# Tabulate the sentence rows and save them as CSV (without the index column).
speech_list_df = pd.DataFrame(speech_list, columns = ["Speaker", "Year", "Sentence"])
speech_list_df.to_csv("debate_sentences_part1.csv", index=False)