In [1]:
import glob #for accessing files on our local system
import os #for portable path handling
import re #a package for doing regex

import pandas as pd

### We'll be using data from http://www.cs.cornell.edu/home/llee/data/convote.html to explore k-means clustering.

In [2]:
# Download the convote v1.1 archive into the current working directory
# (-O saves it under its remote file name, convote_v1.1.tar.gz).
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9607k  100 9607k    0     0  2424k      0  0:00:03  0:00:03 --:--:-- 2424k


In [3]:
# Unpack the downloaded archive next to the notebook
# (-z gunzip, -x extract, -v list every extracted path, -f archive file).
!tar -zxvf convote_v1.1.tar.gz

x convote_v1.1/README.v1.1.txt
x convote_v1.1/data_stage_one/
x convote_v1.1/data_stage_one/development_set/
x convote_v1.1/data_stage_one/development_set/553_400381_2672007_ROY.txt
x convote_v1.1/data_stage_one/development_set/493_400091_2346044_ROY.txt
x convote_v1.1/data_stage_one/development_set/493_400070_2346085_ROY.txt
x convote_v1.1/data_stage_one/development_set/199_400145_1134100_ROY.txt
x convote_v1.1/data_stage_one/development_set/052_400199_0327038_DON.txt
x convote_v1.1/data_stage_one/development_set/199_400071_1134007_ROY.txt
x convote_v1.1/data_stage_one/development_set/052_400343_1479026_RON.txt
x convote_v1.1/data_stage_one/development_set/052_400228_1479083_ROY.txt
x convote_v1.1/data_stage_one/development_set/421_400263_2053004_DON.txt
x convote_v1.1/data_stage_one/development_set/199_400170_1134036_DON.txt
x convote_v1.1/data_stage_one/development_set/421_400328_2053028_ROY.txt
x convote_v1.1/data_stage_one/development_set/199_400421_2013019_ROY.txt
x convote_v1.1/

In [4]:
def count_words(text):
    """Return the number of words in ``text``.

    Punctuation characters are removed before counting, so a stand-alone
    "," or "." is not a word and "45-day" counts once. Any whitespace
    (spaces, tabs, newlines) separates words, and empty or
    punctuation-only text yields 0.
    """
    without_punctuation = re.sub(r"[^\s\w]", '', text)
    # split() with no argument collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate strip/squeeze pass is needed.
    return len(without_punctuation.split())


def load_speeches(paths):
    """Read each file in ``paths`` and return a list of speech dicts.

    Every dict carries the keys ``filename``, ``bill_no``, ``speaker_no``,
    ``bill_vote``, ``party``, ``contents`` and ``word_count``. The metadata
    fields are sliced out of the file name by position, e.g. for
    ``052_400199_0327038_DON.txt``: characters 0-2 are the bill number,
    characters 4-9 the speaker number, and the last two of the three letters
    before ``.txt`` are read as party (first) and vote (last).
    """
    records = []
    for path in paths:
        # Use the real base name instead of a fixed 26-character slice so the
        # parsing does not depend on the directory or on the name length.
        filename = os.path.basename(path)

        # The context manager guarantees the handle is closed after reading.
        with open(path, 'r') as speech_file:
            contents = speech_file.read()

        records.append({
            'filename': filename,
            'bill_no': filename[:3],
            'speaker_no': filename[4:10],
            'bill_vote': filename[-5],
            'party': filename[-7],
            'contents': contents,
            'word_count': count_words(contents),
        })
    return records


# Sort the matches so the row order does not depend on the filesystem.
paths = sorted(glob.glob("convote_v1.1/data_stage_one/development_set/*"))
speeches = load_speeches(paths)

In [5]:
# Peek at the first five parsed records to sanity-check the parsing
# (each record includes the full speech text, so the output is long).
speeches[:5]

[{'bill_no': '052',
  'bill_vote': 'N',
  'contents': "mr. chairman , i thank the gentlewoman for yielding me this time . \nmy good colleague from california raised the exact and critical point . \nthe question is , what happens during those 45 days ? \nwe will need to support elections . \nthere is not a single member of this house who has not supported some form of general election , a special election , to replace the members at some point . \nbut during that 45 days , what happens ? \nthe chair of the constitution subcommittee says this is what happens : martial law . \nwe do not know who would fill the vacancy of the presidency , but we do know that the succession act most likely suggests it would be an unelected person . \nthe sponsors of the bill before us today insist , and i think rightfully so , on the importance of elections . \nbut to then say that during a 45-day period we would have none of the checks and balances so fundamental to our constitution , none of the separatio

In [6]:
# Turn the list of per-speech dicts into a table: one row per speech,
# one column per dict key.
speeches_df = pd.DataFrame(speeches)
speeches_df.head()

Unnamed: 0,bill_no,bill_vote,contents,filename,party,speaker_no,word_count
0,52,N,"mr. chairman , i thank the gentlewoman for yie...",052_400011_0327014_DON.txt,D,400011,974
1,52,N,"mr. chairman , i want to thank my good friend ...",052_400011_0327025_DON.txt,D,400011,556
2,52,N,"mr. chairman , i rise to make two fundamental ...",052_400011_0327044_DON.txt,D,400011,282
3,52,N,"mr. chairman , reclaiming my time , let me mak...",052_400011_0327046_DON.txt,D,400011,261
4,52,N,"mr. chairman , i thank my distinguished collea...",052_400011_1479036_DON.txt,D,400011,189


In [7]:
# Summary statistics of speech length (in words) across all loaded speeches.
speeches_df["word_count"].describe()

count      702.000000
mean       273.216524
std        698.807057
min          3.000000
25%         17.000000
50%         92.500000
75%        368.750000
max      15402.000000
Name: word_count, dtype: float64

### Notice that we have a lot of speeches that are relatively short. They probably aren't the best for clustering because of their brevity.

### Time to bring in the TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features capped at the 10,000 most frequent terms, with
# scikit-learn's built-in English stop-word list removed.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

# Keep only speeches longer than the median length. The median is computed
# from the data instead of being hard-coded, so the filter stays correct if
# the corpus or the word-count logic changes.
median_word_count = speeches_df["word_count"].median()
# .copy() makes longer_speeches an independent frame, so adding columns to it
# later does not write into a slice of speeches_df.
longer_speeches = speeches_df[speeches_df["word_count"] > median_word_count].copy()
X = vectorizer.fit_transform(longer_speeches['contents'])

In [None]:
from sklearn.cluster import KMeans

In [None]:
number_of_clusters = 7
# k-means starts from random centroids. Fixing random_state makes the cluster
# assignments (and the integer id each cluster gets) repeatable across runs;
# n_init is pinned because its default differs between scikit-learn versions.
km = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
km.fit(X)

In [None]:
print("Top terms per cluster:")
# Column indices of each centroid sorted from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed from recent scikit-learn releases; prefer
# its replacement when available and fall back for older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :15]]
    # A single-argument print call is valid on both Python 2 and Python 3,
    # unlike the trailing-comma / bare `print ''` statements.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))

In [None]:
# Extra words to filter out on top of the stock English stop-word list.
additional_stopwords = [
    'mr',
    'congress',
    'chairman',
    'madam',
    'amendment',
    'legislation',
    'speaker',
]

In [None]:
import nltk

# The stop-word corpus is not bundled with nltk. On a fresh environment the
# lookup raises LookupError, so download the corpus once and retry.
try:
    english_stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = nltk.corpus.stopwords.words('english')

# Combined list: the hand-picked extra words plus nltk's English stop words.
new_stopwords = additional_stopwords + english_stopwords

In [None]:
# TF-IDF vectorizer capped at 10,000 features that filters the combined
# stop-word list (new_stopwords) instead of the built-in 'english' list.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)

In [None]:
# Keep only speeches longer than the median length, computed from the data
# rather than hard-coded. .copy() makes longer_speeches an independent frame,
# so adding a column to it later does not write into a slice of speeches_df.
median_word_count = speeches_df["word_count"].median()
longer_speeches = speeches_df[speeches_df["word_count"] > median_word_count].copy()
X = vectorizer.fit_transform(longer_speeches['contents'])

In [None]:
number_of_clusters = 7
# k-means starts from random centroids. Fixing random_state makes the cluster
# assignments (and the integer id each cluster gets) repeatable across runs;
# n_init is pinned because its default differs between scikit-learn versions.
km = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
km.fit(X)

In [None]:
print("Top terms per cluster:")
# Column indices of each centroid sorted from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed from recent scikit-learn releases; prefer
# its replacement when available and fall back for older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :15]]
    # A single-argument print call is valid on both Python 2 and Python 3,
    # unlike the trailing-comma / bare `print ''` statements.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))

In [None]:
# Attach each speech's cluster id. assign() returns a new frame, so this never
# writes into a slice of speeches_df (avoiding SettingWithCopyWarning) and the
# cell can be re-run safely.
longer_speeches = longer_speeches.assign(**{"k-means label": km.labels_})

In [None]:
# Preview the first rows of the filtered speeches.
longer_speeches.head()

In [None]:
# The integer id k-means gives a cluster is arbitrary and can change between
# runs, so locate the cluster by content: pick the one whose centroid puts the
# most weight on the term "china".
# NOTE(review): assumes `vectorizer` and `km` are the ones fitted on
# longer_speeches - confirm the cells were run in order.
china_term_index = vectorizer.vocabulary_.get("china")
if china_term_index is None:
    # Term not in the vocabulary: fall back to the originally hand-picked id.
    china_label = 1
else:
    china_label = int(km.cluster_centers_[:, china_term_index].argmax())
# .copy() gives an independent frame rather than a slice of longer_speeches.
china_speeches = longer_speeches[longer_speeches["k-means label"] == china_label].copy()

In [None]:
# Preview the first rows of the selected cluster's speeches.
china_speeches.head()

In [None]:
# Re-vectorize only the selected cluster's speeches, then cluster them again
# to look for sub-topics.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(china_speeches['contents'])

number_of_clusters = 5
# Fixed random_state makes the result repeatable across runs; n_init is pinned
# because its default differs between scikit-learn versions.
km = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
km.fit(X)

print("Top terms per cluster:")
# Column indices of each centroid sorted from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed from recent scikit-learn releases; prefer
# its replacement when available and fall back for older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    # A single-argument print call is valid on both Python 2 and Python 3,
    # unlike the trailing-comma / bare `print ''` statements.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))

In [None]:
# Show the hyper-parameters of the most recently created KMeans estimator.
km.get_params()

In [None]:
# Per the scikit-learn docs, KMeans.score returns the opposite (negative) of
# the k-means objective evaluated on X, so values closer to 0 mean tighter clusters.
km.score(X)