# BurstMyBubble (BMB) exploration of training data

The data used here is the [convote](http://www.cs.cornell.edu/home/llee/data/convote.html) dataset from Cornell University. For the purpose of this app, we used the `data_stage_one` set in convote:

>"data_stage_one" was used to identify by-name references to
  train our agreement classifier, which acts on such references.  All
  references in this dataset are annotated with a special set of
  characters of the form "xz1111111", where 1111111 is replaced by a
  seven-digit code indicating the House Member who we determined to be
  the target of the reference.  The first six digits of the code
  matches the index used to label the target Member's speech segments,
  (see description of our individual-file-naming convention, below).  The
  seventh digit is a relic from early experiments and was not used in
  our final study.
  
This notebook will explore the development set to test the implementation of **BMB**. For this notebook to work, you first have to extract the archive by running `tar -xzvf convote_v1.1.tar.gz` in the data folder.

In [None]:
%matplotlib inline
import os
import nltk
import numpy as np
import pandas as pd
from glob import glob
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Use the 'ggplot' matplotlib style for every figure drawn in this notebook.
plt.style.use('ggplot')

# Shared text-processing helpers used by the functions and cells below.
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters
lemmatize = WordNetLemmatizer()
stop_words = stopwords.words("english")

# Every .txt file found directly inside the data directory.
# NOTE(review): the notebook introduction mentions the development set, but
# this path points at `test_set` -- confirm which split is intended.
data_path = '../data/convote_v1.1/data_stage_one/test_set'
files = glob(os.path.join(data_path, '*.txt'))


def frequency_explorer(tokens, title):
    """
    Lemmatize a sequence of tokens and plot how often each lemma occurs.

    tokens: iterable of already-tokenized words.
    title: text printed right before the plot is drawn.

    Prints the frequency distribution and the title, then plots the counts
    of the 30 most frequent lemmas (non-cumulative). Returns nothing.
    """
    lemmas = list(map(lemmatize.lemmatize, tokens))
    distribution = FreqDist(lemmas)
    print(distribution)
    print(title)
    distribution.plot(30, cumulative=False)
    plt.show()

def opennfilter(filename):
    """
    Reads a file, tokenizes it, lemmatizes the tokens and filters stopwords.

    filename: path to a speech file whose name follows the structure
        ###_@@@@@@_%%%%$$$_PMV.txt, where P is the party.

    Returns a tuple (party, filtered_sent): the party letter taken from the
    file name, and the cleaned text as one string of space-separated tokens.
    """
    # The party is the first character of the last '_'-separated field of the
    # file name, once the '.txt' extension has been cut off.
    party = filename[:filename.rfind('.txt')].split('_')[-1][0]
    with open(filename) as text:
        tokens = tokenizer.tokenize(text.read())
    tokens = [lemmatize.lemmatize(x) for x in tokens]
    # Filter the tokens of THIS file. The previous version iterated over the
    # global `words`, which is either undefined or left over from another cell.
    # NOTE(review): the membership test is case-sensitive; if the stop-word
    # list is all lowercase, capitalised tokens are kept -- confirm intended.
    excluded = set(stop_words)  # built once so each lookup is O(1)
    filtered_sent = ' '.join([w for w in tokens if w not in excluded])
    return party, filtered_sent
        
        

In [None]:
# Token frequencies (stop words included) for the file at index 0 of `files`.
# The context manager closes the file handle as soon as the text is read.
with open(files[0]) as handle:
    text = handle.read()
words = tokenizer.tokenize(text)
frequency_explorer(words, 'Words from %s' % files[0])

In [None]:
# Token frequencies (stop words included) for the file at index 3 of `files`.
# The context manager closes the file handle as soon as the text is read.
with open(files[3]) as handle:
    text = handle.read()
words = tokenizer.tokenize(text)
frequency_explorer(words, 'Words from %s' % files[3])

In [None]:
# Same file as the previous cell (index 3), but with the tokens listed in
# `stop_words` removed before the frequencies are plotted.
# The context manager closes the file handle as soon as the text is read.
with open(files[3]) as handle:
    text = handle.read()
words = tokenizer.tokenize(text)
filtered_sent3 = [w for w in words if w not in stop_words]
frequency_explorer(filtered_sent3, 'Filtered Words from %s' % files[3])

In [None]:
# File at index 0 again, with the tokens listed in `stop_words` removed
# before the frequencies are plotted.
# The context manager closes the file handle as soon as the text is read.
with open(files[0]) as handle:
    text = handle.read()
words = tokenizer.tokenize(text)
filtered_sent0 = [w for w in words if w not in stop_words]
frequency_explorer(filtered_sent0, 'Filtered Words from %s' % files[0])

In [None]:
# Build one (party, text) pair per file. The party is the first character of
# the last '_'-separated field of the file name (extension removed).
docs = []
for filename in files:
    party = filename[:filename.rfind('.txt')].split('_')[-1][0]
    # Close each file right after reading it instead of leaking the handle.
    with open(filename) as handle:
        docs.append((party, handle.read().strip()))

labels, sentences = zip(*docs)
# An array (rather than a tuple) so that comparisons such as labels == 'D'
# yield boolean masks.
labels = np.array(labels)

# Count lowercased 1- to 5-grams, using the notebook's regexp tokenizer and
# scikit-learn's built-in English stop-word list.
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 5), tokenizer=tokenizer.tokenize)
text_counts = cv.fit_transform(sentences)
# NOTE(review): toarray() builds a dense matrix; with up to 5-grams this can
# be large -- confirm memory use is acceptable for the chosen data set.
df1 = pd.DataFrame(text_counts.toarray())

In [None]:
# Preview the first rows of the n-gram count table (one row per document).
df1.head()

In [None]:
# Project the n-gram count table onto its first two principal components so
# the documents can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df1)

In [None]:
# Scatter plot of the two PCA components, one colour per party.
colors = ['navy', 'crimson']
parties = ['D', 'R']
plt.figure(figsize=(8, 8))
for color, party in zip(colors, parties):
    # Boolean mask selecting the documents whose label is this party.
    mask = labels == party
    # Label each series with the party it shows. The previous version took the
    # label from the first entries of `labels`, which are unrelated to `party`.
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], color=color, lw=1, label=party)
# Without a legend the `label` values above are never displayed.
# NOTE(review): documents with a label other than 'D' or 'R' are not drawn --
# confirm that is intended.
plt.legend()
plt.show()