In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn import metrics
import numpy as np
import pandas as pd
import json
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.cluster import KMeansClusterer, cosine_distance
from sklearn.metrics import precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

In [17]:
def cluster_kmean(train_file, test_file):
    """Cluster articles with cosine K-means and score the clusters on labelled test data.

    A TF-IDF vocabulary (English stop words removed, ``min_df=5``) is fitted
    on the training texts and three clusters are learned with NLTK's
    ``KMeansClusterer`` using cosine distance and 20 repeats. Every test
    article is then assigned a cluster via ``clusterer.classify`` and each
    cluster is mapped to the label that occurs most often among the test
    articles assigned to it (majority vote).

    Prints the cluster/label contingency table, the cluster-to-topic mapping
    and a classification report whose rows are the actual label names.

    Parameters
    ----------
    train_file : str
        Path to a JSON file that loads into a one-column table of texts.
    test_file : str
        Path to a JSON file that loads into a two-column table: the text and
        an indexable label entry whose first element is used as ground truth.

    Returns
    -------
    None
    """
    with open(train_file) as train_fh:
        df_train = pd.DataFrame(json.load(train_fh))
    df_train.columns = ['text']

    with open(test_file) as test_fh:
        df_test = pd.DataFrame(json.load(test_fh))
    df_test.columns = ['text', 'label']

    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)
    # Only the fitted centroids are needed afterwards, so the per-document
    # assignments of the training set are not requested.
    clusterer.cluster(dtm.toarray(), assign_clusters=False)

    test_dtm = tfidf_vect.transform(df_test["text"])
    predicted = [clusterer.classify(vec) for vec in test_dtm.toarray()]

    # Ground truth is the first tag of each article, cast to str so that it
    # has the same type as the crosstab column labels used for the mapping.
    true_labels = [str(tags[0]) for tags in df_test["label"]]

    confusion_df = pd.DataFrame({"label": true_labels, "cluster": predicted})
    crosstab = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.label)
    print(crosstab, "\n")

    # Majority vote: each cluster takes the label with the highest count in
    # its crosstab row. Iterate over the clusters that actually occur, since
    # a cluster with no test articles has no row in the crosstab.
    cluster_dict = crosstab.idxmax(axis=1).to_dict()
    for cluster_id, topic in sorted(cluster_dict.items()):
        print("Cluster", cluster_id, ":", "Topic", topic)

    predicted_target = [cluster_dict[cluster_id] for cluster_id in predicted]
    # No target_names override: the report rows are the sorted label names,
    # not cluster ids, so they are shown under their real names.
    print(metrics.classification_report(true_labels, predicted_target))

    
def cluster_lda(train_file, test_file):
    """Fit a 3-topic LDA model on the training texts and score it on labelled test data.

    Term counts are extracted with ``CountVectorizer`` (``max_df=0.80``,
    ``min_df=30``, English stop words removed), an online LDA model with
    ``random_state=0`` is fitted on the training counts, and the topic
    mixture of every test article is computed. Each article is assigned a
    topic index from the thresholded mixture, and each topic index is mapped
    to the label that occurs most often among the test articles assigned to
    it (majority vote).

    Prints the topic/label contingency table, the topic-to-label mapping and
    a classification report whose rows are the actual label names.

    Parameters
    ----------
    train_file : str
        Path to a JSON file that loads into a one-column table of texts.
    test_file : str
        Path to a JSON file that loads into a two-column table: the text and
        an indexable label entry whose first element is used as ground truth.

    Returns
    -------
    topic_assign : numpy.ndarray
        Output of ``lda.transform`` for the test articles (one row per
        article, one column per topic).
    labels : list
        First label of every test article, in test order.
    """
    with open(train_file) as train_fh:
        df_train = pd.DataFrame(json.load(train_fh))
    df_train.columns = ['text']

    with open(test_file) as test_fh:
        df_test = pd.DataFrame(json.load(test_fh))
    df_test.columns = ['text', 'label']

    tf_vectorizer = CountVectorizer(max_df=0.80, min_df=30, stop_words='english')
    tf = tf_vectorizer.fit_transform(df_train['text'])
    tf_test = tf_vectorizer.transform(df_test['text'])

    n_topics = 3
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=25, verbose=1,
                                    evaluate_every=1, learning_method='online',
                                    n_jobs=1, random_state=0).fit(tf)
    topic_assign = lda.transform(tf_test)

    # argmax over the boolean mask returns the lowest-numbered topic whose
    # probability reaches the threshold (0 when none does).
    # NOTE(review): that is not necessarily the most probable topic;
    # topic_assign.argmax(axis=1) would give the latter -- confirm intent.
    prob_threshold = 0.25
    predicted_lda = (topic_assign >= prob_threshold).argmax(axis=1)

    labels = [tags[0] for tags in df_test["label"]]
    # str copies so the ground truth has the same type as the crosstab
    # column labels that the predictions are mapped to.
    labels_str = [str(label) for label in labels]

    confusion_lda_df = pd.DataFrame({"label": labels_str, "cluster": predicted_lda})
    crosstab_lda = pd.crosstab(index=confusion_lda_df.cluster, columns=confusion_lda_df.label)
    print(crosstab_lda)

    # Majority vote per topic index. Iterate over the indices that actually
    # occur, since a topic with no test articles has no row in the crosstab.
    cluster_lda_dict = crosstab_lda.idxmax(axis=1).to_dict()
    for cluster_id, topic in sorted(cluster_lda_dict.items()):
        print("Cluster", cluster_id, ":", "Topic", topic)

    predicted_target_lda = [cluster_lda_dict[cluster_id] for cluster_id in predicted_lda]
    # No target_names override: the report rows are the sorted label names,
    # not topic indices, so they are shown under their real names.
    print(metrics.classification_report(labels_str, predicted_target_lda))
    return topic_assign, labels


def overlapping_cluster(topic_assign, labels):
    """Sweep probability thresholds and report the best F1 per label.

    For every threshold in ``np.arange(0.05, 1.00, 0.05)`` each document is
    assigned the lowest-numbered topic whose probability reaches the
    threshold (topic 0 when none does), topics are mapped to labels by
    majority vote against ``labels``, and the per-label F1 score is recorded.

    Parameters
    ----------
    topic_assign : array-like
        Topic probabilities, one row per document and one column per topic.
    labels : sequence
        Ground-truth label of every document, aligned with ``topic_assign``.
        Labels are compared by their ``str`` form.

    Returns
    -------
    final_thresh : pandas.Series
        Indexed by label name (sorted); the threshold at which that label's
        F1 is highest.
    f1 : pandas.Series
        Indexed by label name (sorted); the highest F1 reached for the label.
    """
    topic_probs = np.asarray(topic_assign)
    true_labels = [str(label) for label in labels]
    # Column names come from the data rather than a fixed list; passing them
    # as `labels=` below pins the order of the returned F1 array to match.
    class_names = sorted(set(true_labels))
    thresholds = np.arange(0.05, 1.00, 0.05)

    fscores = []
    for thresh in thresholds:
        predicted = (topic_probs >= thresh).argmax(axis=1)
        confusion = pd.DataFrame({"label": true_labels, "cluster": predicted})
        crosstab = pd.crosstab(index=confusion.cluster, columns=confusion.label)

        # Majority vote: each topic takes the label with the highest count in its row.
        cluster_to_label = crosstab.idxmax(axis=1).to_dict()
        predicted_labels = [cluster_to_label[cluster_id] for cluster_id in predicted]

        _, _, fscore, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, labels=class_names)
        fscores.append(fscore)

    fscore_data = pd.DataFrame(np.vstack(fscores), columns=class_names,
                               index=pd.Index(thresholds, name='Threshold'))

    final_thresh = fscore_data.idxmax(axis=0)
    f1 = fscore_data.max()
    return final_thresh, f1

In [18]:
if __name__ == "__main__":
    # Due to randomness, you won't get the exact result
    # as shown here, but your result should be close
    # if you tune the parameters carefully.

    # Both experiments read the same files, so the paths are named once.
    train_path = 'C:/Users/HP/Downloads/train_text.json'
    test_path = 'C:/Users/HP/Downloads/test_text.json'

    # Q1: K-means clustering
    cluster_kmean(train_path, test_path)

    # Q2: LDA topic clustering
    topic_assign, labels = cluster_lda(train_path, test_path)

    # Q3: threshold sweep over the LDA topic probabilities from Q2
    threshold, f1 = overlapping_cluster(topic_assign, labels)
    print(threshold, "\n")
    print(f1)

label    Disaster and Accident  News and Economy  Travel & Transportation
cluster                                                                  
0                          114                 2                      125
1                           93                11                       36
2                            3               193                       23 

Cluster 0 : Topic Travel & Transportation
Cluster 1 : Topic Disaster and Accident
Cluster 2 : Topic News and Economy
             precision    recall  f1-score   support

          0       0.66      0.44      0.53       210
          1       0.88      0.94      0.91       206
          2       0.52      0.68      0.59       184

avg / total       0.69      0.69      0.68       600

iteration: 1 of max_iter: 25, perplexity: 1906.9540
iteration: 2 of max_iter: 25, perplexity: 1874.3130
iteration: 3 of max_iter: 25, perplexity: 1861.5840
iteration: 4 of max_iter: 25, perplexity: 1854.4301
iteration: 5 of max_iter: 25, perpl