In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from ctfidf import CTFIDFVectorizer

In [31]:
references_df = pd.read_pickle("pandas_data.pkl")
df=references_df[["id","abstract"]]

In [32]:
dt=pd.read_pickle("clustered_embeddings_df_test.pkl")

In [38]:
docs=dt[["id","cluster_hamed","cluster_d2v"]].merge(df,how='inner', on="id")
docs=docs.dropna()
docs1=docs[docs["cluster_hamed"]!=-1]
docs2=docs[docs["cluster_d2v"]!=-1]

In [41]:
docs2

Unnamed: 0,id,cluster_hamed,cluster_d2v,abstract
0,53e99792b7602d9701f5b074,-1,362,Yalut is a novel user-centric hybrid content s...
1,53e99792b7602d9701f5b0c6,-1,650,Although different kinds of probabilistic π-ca...
3,53e997b2b7602d9701f94f82,-1,611,A main challenge in today's embedded system de...
4,53e997bdb7602d9701fa70a1,-1,582,We show that if a flow network has k input/out...
6,53e997cbb7602d9701fbd35a,-1,494,- The aim of this paper is to propose a novel ...
...,...,...,...,...
167306,603768b2d3485cfff1dd9724,250,232,We study the pricing and hedging of derivative...
167308,6037695fd3485cfff1de8505,337,275,This paper studies the ability of competing re...
167310,603769ebd3485cfff1df496d,-1,638,The ongoing fragmentation of work. has resulte...
167311,60376a9fd3485cfff1e04286,250,232,We consider dynamic asset allocation problems ...


In [42]:
docs_per_class1 = docs1.groupby(['cluster_hamed'], as_index=False).agg({'abstract': ' '.join})
docs_per_class2 = docs2.groupby(['cluster_d2v'], as_index=False).agg({'abstract': ' '.join})

In [45]:
# Create bag of words
count_vectorizer1 = CountVectorizer().fit(docs_per_class1.abstract)
words1 = count_vectorizer1.get_feature_names()
# Create c-TF-IDF
count1 = count_vectorizer1.transform(docs_per_class1.abstract)
ctfidf1 = CTFIDFVectorizer().fit_transform(count1, n_samples=len(docs1)).toarray()

In [46]:
# Create bag of words
count_vectorizer2 = CountVectorizer().fit(docs_per_class2.abstract)
words2 = count_vectorizer2.get_feature_names()
# Create c-TF-IDF
count2 = count_vectorizer2.transform(docs_per_class2.abstract)
ctfidf2 = CTFIDFVectorizer().fit_transform(count2, n_samples=len(docs2)).toarray()

In [48]:
topics1=[]
for label in docs_per_class1.cluster_hamed:
    topic=[]
    for index in ctfidf1[label].argsort()[-10:]:
        topic.append(words1[index])
    topics1.append(topic)
    #print(label,topic)

In [49]:
topics2=[]
for label in docs_per_class2.cluster_d2v:
    topic=[]
    for index in ctfidf2[label].argsort()[-10:]:
        topic.append(words2[index])
    topics2.append(topic)
    #print(label,topic)

In [58]:
topics1

[['col',
  'first',
  'monograph',
  '1561',
  'economic',
  '1300000011',
  'h4',
  'div',
  'article',
  'page'],
 ['für', 'den', 'auf', 'eine', 'zu', 'werden', 'von', 'die', 'der', 'und'],
 ['chinese',
  'highlights',
  'developments',
  'japan',
  'rec',
  'kanji',
  'character',
  'ognition',
  'decade',
  'technical'],
 ['un', 'du', 'dans', 'en', 'le', 'une', 'des', 'les', 'la', 'de'],
 ['former',
  'release',
  '2007',
  'extensible',
  'rewrite',
  'gt',
  'fabrics',
  'sep',
  'ogsa',
  'dai'],
 ['crisis',
  'ss',
  'demonstration',
  'backend',
  'l3np',
  'granularity',
  'iag',
  'wimax',
  'article',
  'page'],
 ['printing',
  'covers',
  'geography',
  'guards',
  'ivt',
  'available',
  'no',
  'aer',
  'abstract',
  'gallery'],
 ['algorithmic',
  'quic',
  'scalability',
  'farms',
  'identifiers',
  'internet',
  'una',
  'pwlan',
  'article',
  'page'],
 ['specialized',
  'transport',
  'port',
  'skill',
  'rulesets',
  'incidents',
  'itt',
  'nids',
  'article',
  

In [56]:
import pickle
with open('topics_d2v.pkl', 'wb') as f:
    pickle.dump(topics2, f)



In [23]:
# Get similar categories
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

distances = cosine_similarity(ctfidf, ctfidf)
np.fill_diagonal(distances, 0)
#distances[0].argmax()
distances[0]
print(topics[1])
print(topics[121])



['für', 'den', 'eine', 'auf', 'zu', 'werden', 'von', 'die', 'der', 'und']
['log2', 'collision', 'stations', 'gossiping', 'radio', 'frac', 'log', 'rounds', 'broadcast', 'broadcasting']


In [28]:
ctfidf[2]

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
result = pd.DataFrame([(index, distances[index].argmax()) for index in range(len(docs_per_class))],columns=["From", "To"])

In [25]:
result

Unnamed: 0,From,To
0,0,5
1,1,121
2,2,266
3,3,638
4,4,460
...,...,...
724,724,718
725,725,728
726,726,728
727,727,360


In [None]:
def _reduce_to_n_topics(documents: pd.DataFrame) -> pd.DataFrame:

        # Track which topics where originally merged
        if not self._merged_topics:
            self._merged_topics = []

        # Create topic similarity matrix
        similarities = cosine_similarity(self.c_tf_idf_)
        np.fill_diagonal(similarities, 0)

        # Find most similar topic to least common topic
        topics = documents.Topic.tolist().copy()
        mapped_topics = {}
        while len(self.get_topic_freq()) > self.nr_topics + self._outliers:
            topic_to_merge = self.get_topic_freq().iloc[-1].Topic
            topic_to_merge_into = np.argmax(similarities[topic_to_merge + self._outliers]) - self._outliers
            similarities[:, topic_to_merge + self._outliers] = -self._outliers
            self._merged_topics.append(topic_to_merge)

            # Update Topic labels
            documents.loc[documents.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
            mapped_topics[topic_to_merge] = topic_to_merge_into
            self._update_topic_size(documents)

        # Map topics
        mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}
        self.topic_mapper_.add_mappings(mapped_topics)

        # Update representations
        documents = self._sort_mappings_by_frequency(documents)
        self._extract_topics(documents)
        self._update_topic_size(documents)
        return documents

In [None]:
def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
        
        check_is_fitted(self)
        if isinstance(topic, int):
            return self.topic_sizes_[topic]
        else:
            return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values("Count",
                                                                                                   ascending=False)