In [1]:
## Packages needed for data pre-processing
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from scipy import sparse
from collections import Counter

import nltk
from numpy import savetxt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import itertools

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# For evaluation
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

# Running time
from timeit import default_timer as timer

# Visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import figure

## 1. Jobs

In [18]:
# Load the pre-processed jobs dataset. The 'Unnamed: 0' column (presumably the
# row index written out when the CSV was saved) is dropped right after reading.
jobs_after = pd.read_csv('PW_files/sub_jobs_main_meta_doc_stemming.csv').drop(columns='Unnamed: 0')

# Ground-truth labels as a plain Python list, one entry per row of jobs_after.
True_Label = jobs_after["label"].tolist()

In [20]:
# Read the Monocle3 clustering results for the jobs dataset.
Monocle3 = pd.read_csv("PW_files/jobs_estimated_monocle3PW_HDSCAN_stemming.csv")

# Keep only the cluster-assignment columns: the two bookkeeping columns
# ("Unnamed: 0" and "idx") are removed, every remaining column is evaluated.
Monocle3_results = Monocle3.drop(columns=["Unnamed: 0", "idx"])
col_name = list(Monocle3_results.columns)

#### (1) Purity

In [22]:
# Purity
# For every clustering column: assign each cluster to its most frequent true
# label, sum the sizes of those majority groups, and divide by the number of
# documents.
Correct_target = jobs_after[["category", "label"]]
n_documents = len(Correct_target)  # denominator taken from the same frame as the labels

Monocle3_purity = []
for cluster_col in col_name:
    df_compare = pd.concat([Monocle3_results[cluster_col], Correct_target["label"]], axis=1)
    # .size() counts every row of each (cluster, label) pair. Counting the
    # non-null values of an unrelated column ("category") would silently skip
    # rows where that column is missing and under-estimate purity.
    pair_sizes = df_compare.groupby([cluster_col, "label"]).size()
    # Largest true-label group inside each cluster.
    majority_sizes = pair_sizes.groupby(level=0).max()
    Monocle3_purity.append(majority_sizes.sum() / n_documents)

Monocle3_purity

[0.6886842571276784,
 0.6695590579068532,
 0.6799185408181335,
 0.6793872852842218,
 0.6788560297503099,
 0.6792102001062511,
 0.6664600672923676,
 0.6704444837967062,
 0.6283867540286878,
 0.6692934301398973,
 0.6263502744820258,
 0.6696476004958385,
 0.6839029573224721,
 0.6703559412077209,
 0.6695590579068532,
 0.6816008500088543,
 0.6260846467150699,
 0.6366212148043209,
 0.5610943863998583,
 0.5610943863998583,
 0.5610943863998583,
 0.5030989906144856,
 0.5030989906144856,
 0.5030989906144856,
 0.5030989906144856,
 0.5030989906144856,
 0.5030989906144856,
 0.5030989906144856]

#### (2) AMI

In [23]:
# Adjusted mutual information between the ground-truth labels and each
# clustering column, in the order of col_name.
Monocle3_AMI = [
    adjusted_mutual_info_score(True_Label, Monocle3_results[cluster_col])
    for cluster_col in col_name
]

Monocle3_AMI

[0.2381220833037949,
 0.24378686133797337,
 0.2652247599108734,
 0.2661958369228608,
 0.2759972422676459,
 0.27897380732589294,
 0.285122076004215,
 0.30183053046302494,
 0.28371762316187843,
 0.302092343845058,
 0.2863299123238054,
 0.30832301053713795,
 0.3221151070822316,
 0.31206458106076884,
 0.3135759945782241,
 0.3373762467634165,
 0.30617508207273136,
 0.3162180305764528,
 0.30908628220628587,
 0.30908628220628587,
 0.3092022476946366,
 0.22984990596767538,
 0.22984990596767538,
 0.22984990596767538,
 0.22984990596767538,
 0.22984990596767538,
 0.22984990596767538,
 0.22984990596767538]

#### (3) ARI

In [24]:
# Adjusted Rand index between the ground-truth labels and each clustering
# column, in the order of col_name.
Monocle3_ARI = [
    adjusted_rand_score(True_Label, Monocle3_results[cluster_col])
    for cluster_col in col_name
]

Monocle3_ARI

[0.09360703553700857,
 0.10768281462841851,
 0.1494908581504823,
 0.1493089630068951,
 0.1662136826859724,
 0.15970867450513604,
 0.2097682808616285,
 0.2770523578117443,
 0.21349320541983896,
 0.273784418411545,
 0.22282597416113767,
 0.28897917099587866,
 0.317282649967526,
 0.29414647393542875,
 0.2967955156704084,
 0.31976507938352916,
 0.2662236555349271,
 0.2835639570305535,
 0.23314717986228808,
 0.23314717986228808,
 0.23310661421259565,
 0.1522479750980312,
 0.1522479750980312,
 0.1522479750980312,
 0.1522479750980312,
 0.1522479750980312,
 0.1522479750980312,
 0.1522479750980312]

#### Summary

In [25]:
# Gather the three per-column score lists into one table: one row per
# clustering column, one column per metric.
Monocle3_evaluation_metrics = pd.DataFrame(
    {'Purity': Monocle3_purity, 'AMI': Monocle3_AMI, 'ARI': Monocle3_ARI}
)
Monocle3_evaluation_metrics

Unnamed: 0,Purity,AMI,ARI
0,0.688684,0.238122,0.093607
1,0.669559,0.243787,0.107683
2,0.679919,0.265225,0.149491
3,0.679387,0.266196,0.149309
4,0.678856,0.275997,0.166214
5,0.67921,0.278974,0.159709
6,0.66646,0.285122,0.209768
7,0.670444,0.301831,0.277052
8,0.628387,0.283718,0.213493
9,0.669293,0.302092,0.273784


In [26]:
# Monocle3_evaluation_metrics.to_csv("PW_files/Monocle3PW_evaluation_metrics_jobs.csv")

## 2. four

In [31]:
# Load the pre-processed "four" dataset. The 'Unnamed: 0' column (presumably the
# row index written out when the CSV was saved) is dropped right after reading.
four_after = pd.read_csv('PW_files/sub_four_main_meta_doc_stemming.csv').drop(columns='Unnamed: 0')

# Ground-truth labels as a plain Python list, one entry per row of four_after.
True_Label = four_after["user_id_new"].tolist()

In [32]:
# Display the loaded frame to inspect its columns before running the evaluation.
four_after

Unnamed: 0,user_id,user_id_new,screen_name,text,tweets_processed,Size_Factor
0,27902825,2,UMichFootball,Leave it all on the field! @UMichFootball! Bes...,leav field umichfootbal best rivalri colleg fo...,1.178521
1,27902825,2,UMichFootball,There’s no time to look backwards… only ahead!...,time look ahead watch umichfootbal ball goblu ...,1.047575
2,27902825,2,UMichFootball,It’s called “The Game’ for a reason. \n\n#GoBl...,call game reason goblu beatosu,0.654734
3,27902825,2,UMichFootball,"On Saturday, our seniors will play their final...",saturday senior play final game big hous senio...,1.309468
4,27902825,2,UMichFootball,The Glasgow Decade with @UMichFootball is just...,glasgow decad umichfootbal complet stori goblu...,0.916628
...,...,...,...,...,...,...
12132,19071682,3,breakingweather,A flash flood emergency is in effect for south...,flash flood emerg effect southwestern arkansa ...,2.226096
12133,19071682,3,breakingweather,"Now that Barry, the first hurricane to make U....",barri first hurrican make landfal year come go...,1.440415
12134,19071682,3,breakingweather,"Showers and locally heavy, drenching thunderst...",shower local heavi drench thunderstorm associ ...,2.095149
12135,19071682,3,breakingweather,While Monday felt like a typical summer day in...,monday felt like typic summer day northeast te...,2.226096


In [33]:
# Read the Monocle3 clustering results for the "four" dataset.
Monocle3 = pd.read_csv("PW_files/four_estimated_monocle3PW_HDSCAN_stemming.csv")

# Keep only the cluster-assignment columns: the two bookkeeping columns
# ("Unnamed: 0" and "idx") are removed, every remaining column is evaluated.
Monocle3_results = Monocle3.drop(columns=["Unnamed: 0", "idx"])
col_name = list(Monocle3_results.columns)

#### (1) Purity

In [35]:
# Purity
# For every clustering column: assign each cluster to its most frequent true
# label, sum the sizes of those majority groups, and divide by the number of
# documents.
Correct_target = four_after[["screen_name", "user_id_new"]]
n_documents = len(Correct_target)  # denominator taken from the same frame as the labels

Monocle3_purity = []
for cluster_col in col_name:
    df_compare = pd.concat([Monocle3_results[cluster_col], Correct_target["user_id_new"]], axis=1)
    # .size() counts every row of each (cluster, label) pair. Counting the
    # non-null values of an unrelated column ("screen_name") would silently
    # skip rows where that column is missing and under-estimate purity.
    pair_sizes = df_compare.groupby([cluster_col, "user_id_new"]).size()
    # Largest true-label group inside each cluster.
    majority_sizes = pair_sizes.groupby(level=0).max()
    Monocle3_purity.append(majority_sizes.sum() / n_documents)

Monocle3_purity

[0.9634176485128121,
 0.9621817582598665,
 0.9621817582598665,
 0.9621817582598665,
 0.9621817582598665,
 0.9615226167916289,
 0.9593804070198566,
 0.9627585070445744,
 0.9616050094751586,
 0.9616050094751586,
 0.9616050094751586,
 0.9616050094751586,
 0.9616050094751586,
 0.9616050094751586,
 0.9616050094751586,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.9546016313751339,
 0.75348109087913]

#### (2) AMI

In [36]:
# Adjusted mutual information between the ground-truth labels and each
# clustering column, in the order of col_name.
Monocle3_AMI = [
    adjusted_mutual_info_score(True_Label, Monocle3_results[cluster_col])
    for cluster_col in col_name
]

Monocle3_AMI

[0.49680567933090386,
 0.5242628321308623,
 0.5459936229547961,
 0.5777470841889292,
 0.5935299510599215,
 0.6201793316469265,
 0.616478579596033,
 0.6462368313877198,
 0.681019413584287,
 0.6904219545384845,
 0.6905116406775983,
 0.6886994735843728,
 0.7287249326439859,
 0.7287372606190091,
 0.7594749117583902,
 0.7436300183275132,
 0.7814833052112686,
 0.7916172426311444,
 0.8324214684421994,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.8425218150391218,
 0.7443518873076833]

#### (3) ARI

In [37]:
# Adjusted Rand index between the ground-truth labels and each clustering
# column, in the order of col_name.
Monocle3_ARI = [
    adjusted_rand_score(True_Label, Monocle3_results[cluster_col])
    for cluster_col in col_name
]

Monocle3_ARI

[0.18286808679982472,
 0.24878648560636288,
 0.29112495739521194,
 0.36277786859099664,
 0.41446191425569134,
 0.47933511014500196,
 0.4504659075815632,
 0.5404757555556963,
 0.6407650468193086,
 0.649897116956325,
 0.6437811307916844,
 0.6416008806975882,
 0.7258255301819074,
 0.7357033449959302,
 0.7794868988096484,
 0.7516167834983117,
 0.821051491849822,
 0.8315398273352363,
 0.8759580957170252,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.8862085166001062,
 0.6683418979805587]

#### Summary

In [38]:
# Gather the three per-column score lists into one table: one row per
# clustering column, one column per metric.
Monocle3_evaluation_metrics = pd.DataFrame(
    {'Purity': Monocle3_purity, 'AMI': Monocle3_AMI, 'ARI': Monocle3_ARI}
)
Monocle3_evaluation_metrics

Unnamed: 0,Purity,AMI,ARI
0,0.963418,0.496806,0.182868
1,0.962182,0.524263,0.248786
2,0.962182,0.545994,0.291125
3,0.962182,0.577747,0.362778
4,0.962182,0.59353,0.414462
5,0.961523,0.620179,0.479335
6,0.95938,0.616479,0.450466
7,0.962759,0.646237,0.540476
8,0.961605,0.681019,0.640765
9,0.961605,0.690422,0.649897


In [39]:
# Monocle3_evaluation_metrics.to_csv("PW_files/Monocle3PW_evaluation_metrics_four.csv")