In [13]:
from htrc_features import FeatureReader
import numpy as np
import pandas as pd

In [14]:
def get_htrc_page_data(document):
    """Expand an HTRC Extracted Features volume into per-page token lists.

    The volume's page-level token counts are read with part-of-speech and
    case folded away, and every purely alphabetic token is repeated as many
    times as its count says it occurs on that page.

    Parameters
    ----------
    document : str
        Identifier or path handed to ``FeatureReader`` (a single volume).

    Returns
    -------
    list[list[str]]
        One list of tokens per page, ordered by ascending page number.
        A page with no alphabetic tokens contributes an empty list.

    Raises
    ------
    StopIteration
        If the reader yields no volume for ``document``.
    """
    fr = FeatureReader([document])
    vol = next(fr.volumes())
    ptc = vol.tokenlist(pos=False, case=False).reset_index().drop(['section'], axis=1)

    # After dropping 'section' the frame is read positionally:
    # column 0 = page, column 1 = token text, column 2 = token count.
    tokens = list()

    # A single sorted groupby visits the pages in a deterministic order and
    # avoids re-filtering the whole frame once per page.
    for _, page_rows in ptc.groupby('page', sort=True):
        page_tokens = list()
        for token, count in zip(page_rows.iloc[:, 1], page_rows.iloc[:, 2]):
            # keep words only: numbers, punctuation and non-string cells are skipped
            if isinstance(token, str) and token.isalpha():
                # deal with frequency count by creating correct number of tokens
                page_tokens.extend([token] * int(count))
        tokens.append(page_tokens)
    return tokens

In [15]:
# HathiTrust volume ids (htids), ten per class.
# "hard" science-fiction volumes
hard_sf = [
    "mdp.39015038888775",
    "pst.000027847633",
    "mdp.39015013517985",
    "mdp.39015020680461",
    "mdp.39015020690858",
    "pst.000029273768",
    "mdp.39015013433738",
    "mdp.39015013534014",
    "mdp.39015012435791",
    "pst.000059688501",
]
# "soft" science-fiction volumes
soft_sf = [
    "mdp.39015020645456",
    "mdp.39015003922005",
    "mdp.39015000244775",
    "mdp.39015047597136",
    "ppt.ssfcbz201710000391",
    "mdp.49015000529041",
    "uiug.30112077272364",
    "ppt.ssfcbz201710000347",
    "mdp.49015003071447",
    "inu.30000004080028",
]

In [16]:
# Class label for every volume, in the same order as hard_sf + soft_sf,
# so that labels[i] describes the i-th document given to the classifier.
labels = ["hard" for _ in hard_sf] + ["soft" for _ in soft_sf]

In [None]:
# Build one whitespace-joined string per volume (hard volumes first, then
# soft) for the document-term matrix; page boundaries are flattened away.
raw_data = []
for doc in hard_sf + soft_sf:
    page_data = get_htrc_page_data(doc)
    tokens = ' '.join(word for page in page_data for word in page)
    raw_data.append(tokens)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer over in-memory strings: accents are stripped via
# unicode normalisation and the built-in English stop-word list is removed.
vec = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    input='content',
)

In [None]:
# Fit the vectorizer on raw_data and, in the same call, turn raw_data into
# the document-term matrix used by the classifier and the clustering below.
dtm = vec.fit_transform(raw_data)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# Linear classifier trained on the full document-term matrix.
# tol=None switches off the tolerance-based stopping rule, so training is
# bounded by max_iter; random_state pins the shuffling for repeatable weights.
clf = SGDClassifier(
    random_state=42,
    max_iter=1000,
    tol=None,
).fit(dtm, labels)

In [None]:
# Plot the 40 features with the most negative classifier weights.
# NOTE(review): in a binary linear model negative weights vote for
# clf.classes_[0]; with the labels used here that should be "hard" --
# confirm against clf.classes_ if the label set changes.
coefs = clf.coef_[0]
# Look up the vocabulary and the ranking once instead of once per feature.
feature_names = vec.get_feature_names_out()
top_hard = np.argsort(coefs)[:40]

df = pd.DataFrame(coefs[top_hard], index=feature_names[top_hard])
df.plot(figsize=(20, 5),kind='bar',title='Key Features: Hard',legend=False)

In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
cosine_dist_matrix = 1 - cosine_similarity(dtm)

%timeit
affprop = AffinityPropagation(random_state=None,affinity="precomputed", max_iter=1000, damping=0.99)
affprop.fit(cosine_dist_matrix)

In [None]:
# Print each cluster id followed by the true class labels of its members,
# which shows at a glance how well clusters line up with hard/soft.
label_array = np.array(labels)
for cluster in np.unique(affprop.labels_):
    members = label_array[affprop.labels_ == cluster]
    print(cluster, " ".join(members))