In [1]:
# Importing the required libraries.
import numpy as np
import pickle, zlib
from random import sample
import scipy.cluster.hierarchy as sch
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# For TF-IDF keywords and scores.
def display_scores(vectorizer, tfidf_result):
    """Rank vocabulary terms by their total TF-IDF weight, skipping noise stems.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (newer scikit-learn) or the
        legacy ``get_feature_names()``; the returned names must line up with
        the columns of ``tfidf_result``.
    tfidf_result : matrix-like
        Documents x terms matrix supporting ``.sum(axis=0)``.

    Returns
    -------
    (tf_idf_words, tf_idf_scores) : tuple of two parallel lists
        Terms ordered by descending column sum, and those sums rounded to two
        decimals. Terms found in ``useless_words`` are left out of both lists.
    """
    # get_feature_names() was removed from scikit-learn in 1.2; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    # Column-wise sum = total weight of each term over all documents.
    column_totals = np.asarray(tfidf_result.sum(axis=0)).ravel()
    sorted_scores = sorted(zip(feature_names, column_totals), key=lambda x: x[1], reverse=True)

    # Stems considered uninformative for keyword extraction. The original list
    # only had the misspelling 'thrusday', so 'thursday' slipped through while
    # every other weekday was filtered; both spellings are kept here.
    useless_words = set(['offici','said','govern','near','irregular','special','say','ad','minist','chief','clash','bodi','local','time','work','vigil','mla','region', 'get','start','member','mahatma','congress','state','gram','depart', 'rs', 'crore', 'also', 'card', 'district', 'tuesday', 'offic', 'year', 'meet', 'day', 'would', 'peopl', 'nation', 'lakh', 'plan', 'union', 'alleg', 'provid', 'two', 'km', 'taken', 'guarante', 'take', 'complet', 'report', 'case', 'found', 'per', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'monday', 'tuesday', 'wednesday', 'thrusday', 'thursday', 'friday', 'saturday', 'sunday', 'issu', 'ask', 'level', 'order', 'parti', 'director', 'releas', 'bjp', 'sourc', 'cm', 'injur', 'mr', 'ramesh', 'visit', 'act', 'famili', 'secretari', 'first', 'last', 'includ', 'probe', 'direct', 'month'])

    tf_idf_words, tf_idf_scores = [], []
    for word, total in sorted_scores:
        if word not in useless_words:
            tf_idf_words.append(word)
            tf_idf_scores.append(np.round(total,2))
    return tf_idf_words,tf_idf_scores

# Resolve article_ids to their corresponding titles and tf_idf_keywords.
def resolve_articles(ids, dataset):
    """Map article ids to titles and extract TF-IDF keywords from their text.

    Parameters
    ----------
    ids : iterable
        Article ids to resolve; each must occur as ``record[0]`` of some
        record in ``dataset`` (a missing id raises ``KeyError``).
    dataset : iterable of indexable records
        ``record[0]`` is used as the id, ``record[1]`` is returned as the
        title and ``record[3]`` is iterated as string tokens.
        # NOTE(review): field meanings are inferred from usage - confirm.

    Returns
    -------
    (titles, words, scores)
        Titles in the order of ``ids``, followed by the two lists produced by
        ``display_scores`` for the TF-IDF matrix of the selected articles.
    """
    # Only the first record encountered for an id is kept.
    first_seen = {}
    for record in dataset:
        key = record[0]
        if key not in first_seen:
            first_seen[key] = [record[1], record[3]]

    titles = [first_seen[article_id][0] for article_id in ids]
    token_lists = [first_seen[article_id][1] for article_id in ids]

    # Rebuild each document as its tokens, each followed by a single space.
    documents = [''.join(token + ' ' for token in tokens) for tokens in token_lists]

    vectorizer = TfidfVectorizer()
    tfidf_result = vectorizer.fit_transform(documents)
    ranked = display_scores(vectorizer, tfidf_result)
    return titles, ranked[0], ranked[1]
        

In [3]:
# Function for hierarchical clustering.
def hierarchial_clustering(data1, data2, method='cosine', limit=30000,
                           min_clusters=50, max_clusters=100,
                           top_clusters=25, per_cluster=4):
    """Select representative article ids using complete-linkage clustering.

    The vectors are clustered, the clusters are ranked by how close their
    centroid (mean) lies to the global centroid (per-dimension median of all
    vectors), and from each of the best-ranked clusters the articles closest
    to that cluster's centroid are returned.

    Parameters
    ----------
    data1 : sequence of vectors
        One vector per article.
    data2 : sequence of [article_id, vector]
        Same articles as ``data1``, in the same order, paired with their ids.
    method : {'cosine', 'euclidean'}
        Metric passed to ``sch.linkage`` and used for all centroid
        comparisons. With 'cosine' a higher similarity is better; with
        'euclidean' a smaller distance is better.
    limit : int
        If more than ``limit`` articles are given, ``limit`` of them are
        randomly sampled (unseeded, so not reproducible between runs).
    min_clusters, max_clusters : int
        Inclusive range of acceptable cluster counts for the threshold search.
    top_clusters : int
        Number of best-ranked clusters to draw articles from.
    per_cluster : int
        Maximum number of articles taken from each selected cluster.

    Returns
    -------
    list
        Article ids, grouped by cluster in ranking order.

    Raises
    ------
    ValueError
        If ``method`` is neither 'cosine' nor 'euclidean'. (Previously such a
        call ran the full linkage and then silently returned an empty list.)
    """
    if method not in ('cosine', 'euclidean'):
        raise ValueError("method must be 'cosine' or 'euclidean', got %r" % (method,))

    # If number of articles are greater than the limit, randomly sample `limit` of them.
    if len(data1) > limit:
        data2 = sample(data2, limit)
        data1 = [pair[1] for pair in data2]

    linkage_matrix = sch.linkage(data1, 'complete', metric=method)

    # Lower the cut distance until the cluster count falls inside the wanted
    # range. If that never happens, the labels of the last (smallest)
    # threshold are used as they are.
    for threshold in np.arange(1.5, 0, -0.05):
        labels = sch.fcluster(linkage_matrix, threshold, criterion='distance')
        n_clusters = len(set(labels))
        if min_clusters <= n_clusters <= max_clusters:
            break

    print('Number of articles:',len(data1),'\tNumber of clusters:',n_clusters)

    higher_is_closer = (method == 'cosine')

    def closeness(u, v):
        # Cosine similarity, or euclidean distance computed with numpy (the
        # `distance` module the original code referenced was never imported).
        if higher_is_closer:
            return cosine_similarity([u, v])[0][1]
        return np.linalg.norm(np.asarray(u) - np.asarray(v))

    global_centroid = np.median(data1, axis=0)

    # Group (id, vector) pairs by cluster label. Pairing rows with labels
    # directly avoids building a ragged object array via np.array(data2),
    # which recent NumPy versions reject.
    clusters = {}
    for pair, label in zip(data2, labels):
        clusters.setdefault(int(label), []).append((pair[0], pair[1]))

    # Rank clusters by closeness of their centroid to the global centroid.
    # Sorting happens once and only on the score, so tied scores never cause
    # a comparison between centroid arrays.
    ranked = []
    for label, members in clusters.items():
        cluster_centroid = np.mean([vector for _, vector in members], axis=0)
        ranked.append((closeness(global_centroid, cluster_centroid), cluster_centroid, label))
    ranked.sort(key=lambda entry: entry[0], reverse=higher_is_closer)

    result = []
    for _, cluster_centroid, label in ranked[:top_clusters]:
        scored = [(closeness(cluster_centroid, vector), article_id)
                  for article_id, vector in clusters[label]]
        # Ties on the score are broken by article id, as before.
        scored.sort(reverse=higher_is_closer)
        result.extend(article_id for _, article_id in scored[:per_cluster])

    return result

In [4]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

# The nine article classes. Each class is identified by the pair of values
# found in fields 6 and 7 of a dataset record; `names` holds the matching
# file-name fragment at the same position.
class_keys = [('Unemp','Slow'), ('Unemp','Average'), ('Unemp','Fast'),
              ('Agri','Slow'), ('Agri','Average'), ('Agri','Fast'),
              ('Non Agri','Slow'), ('Non Agri','Average'), ('Non Agri','Fast')]
names = ['unemp_slow','unemp_avg','unemp_fast','agri_slow','agri_avg','agri_fast','non_agri_slow','non_agri_avg','non_agri_fast']
class_index = {key: position for position, key in enumerate(class_keys)}

for dataset, model in zip(datasets,models):

    # Printing the collection name (file name without the 'dataset_' prefix).
    collection_name = dataset[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive.
    # NOTE(review): unpickling executes arbitrary code embedded in the file;
    # only load dataset files from a trusted source.
    with open('./DT2V_Datasets/'+dataset, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('./Models/'+model)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [set() for _ in names]
    temp_vectors = [[] for _ in names]
    temp_datasets = [[] for _ in names]
    for i in dataset:
        class_id = class_index.get((i[6], i[7]))
        # Skip records outside the nine classes and ids already collected
        # for this class.
        if class_id is None or i[0] in temp_ids[class_id]:
            continue
        vector = model.docvecs[i[0]]
        temp_ids[class_id].add(i[0])
        temp_vectors[class_id].append(vector)
        temp_datasets[class_id].append([i[0], vector])

    # Finding the top titles and keywords for each class.
    for i in range(len(names)):
        print(names[i].capitalize())
        result = resolve_articles(hierarchial_clustering(temp_vectors[i],temp_datasets[i]),dataset)
        with open('./Results/'+collection_name+'_'+names[i]+'_'+'titles.csv','w',encoding='utf-8') as file:
            for title in result[0]:
                file.write(title+'\n')
        # Written as UTF-8 like the titles file, so non-ASCII keywords do not
        # depend on the platform's default encoding.
        with open('./Results/'+collection_name+'_'+names[i]+'_'+'keywords.csv','w',encoding='utf-8') as file:
            for keyword in result[1][:100]:
                file.write(keyword+'\n')


Collection: Agriculture
Unemp_slow
Number of articles: 7506 	Number of clusters: 55
Unemp_avg
Number of articles: 5155 	Number of clusters: 55
Unemp_fast
Number of articles: 4884 	Number of clusters: 50
Agri_slow
Number of articles: 14513 	Number of clusters: 54
Agri_avg
Number of articles: 5429 	Number of clusters: 60
Agri_fast
Number of articles: 735 	Number of clusters: 60
Non_agri_slow
Number of articles: 30000 	Number of clusters: 54
Non_agri_avg
Number of articles: 8955 	Number of clusters: 59
Non_agri_fast
Number of articles: 443 	Number of clusters: 51
Collection: Development
Unemp_slow
Number of articles: 1426 	Number of clusters: 67
Unemp_avg
Number of articles: 848 	Number of clusters: 57
Unemp_fast
Number of articles: 1034 	Number of clusters: 50
Agri_slow
Number of articles: 2077 	Number of clusters: 65
Agri_avg
Number of articles: 642 	Number of clusters: 58
Agri_fast
Number of articles: 124 	Number of clusters: 57
Non_agri_slow
Number of articles: 11936 	Number of cluste