In [70]:
## Packages need for data pre-process
import re
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import cohen_kappa_score
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from scipy import sparse
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist

# word normalization (remove repeated)
nltk.download('wordnet')
import re
from nltk.corpus import wordnet
from repeatedReplacer import RepeatReplacer 
replacer = RepeatReplacer()

import itertools
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Evaluation
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

# read in functions from 'preprocessingFunctions.py'
import preprocessingFunctions 

# count time 
import time

[nltk_data] Downloading package wordnet to /home/rep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Tweets from 4 distinct users

In [71]:
# Import Dataset
import os 
os.chdir("/home/rep/scRNA-seq_clustering_to_Twitter/P1_preprocessing")
os.getcwd()
four = pd.read_csv('four_users.csv')
del four['Unnamed: 0']

print(four.shape)
four.head

(12780, 4)


<bound method NDFrame.head of         user_id  user_id_new      screen_name  \
0      27902825            2    UMichFootball   
1      27902825            2    UMichFootball   
2      27902825            2    UMichFootball   
3      27902825            2    UMichFootball   
4      27902825            2    UMichFootball   
...         ...          ...              ...   
12775  19071682            3  breakingweather   
12776  19071682            3  breakingweather   
12777  19071682            3  breakingweather   
12778  19071682            3  breakingweather   
12779  19071682            3  breakingweather   

                                                    text  
0                              👇 https://t.co/swtsZWWaJe  
1      Leave it all on the field! @UMichFootball! Bes...  
2      There’s no time to look backwards… only ahead!...  
3         2️⃣4️⃣:0️⃣0️⃣:0️⃣0️⃣ ⏳ https://t.co/eM3yUXJXaq  
4      It’s called “The Game’ for a reason. \r\n\r\n#...  
...                        

In [72]:
# Convert to lowercase and convert to list
data = four.text.str.lower().values.tolist()
data = [preprocessingFunctions.preProcessingFcn(tweet) for tweet in data]

In [73]:
# tokenize the tweets and remove punctuations
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [74]:
# Remove Stop Words
stop_words = stopwords.words('english')
data_words_unigrams = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]
# Stemming

In [75]:
# Stemming
data = []
for i in data_words_unigrams:
    tweet = ' '.join(i)
    data.append(tweet)

data_stemming = [preprocessingFunctions.stemming(tweet) for tweet in data]

data_stemming_temp = []
for i in data_stemming:
    alist = i.split()
    data_stemming_temp.append(alist)
    
data_stemming = data_stemming_temp

In [76]:
# Remove 80% of the least frequent words
words_dict, data_stemming1, empty_idx = preprocessingFunctions.trim_noise(data_stemming, 80)

Proportion of remaining tweets w.r.t. original tweets: 97.86%
Proportion of removed tweets w.r.t. original tweets: 2.14%


In [77]:
# The lowest word frequency in the remaining tweets 
min(words_dict.values())

10

In [78]:
#######################################
##### Create document-term matrix #####
#######################################

# Create Dictionary
id2word = corpora.Dictionary(data_stemming1)

# Create Corpus
texts = data_stemming1

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

a_s = gensim.matutils.corpus2dense(corpus, num_terms = len(words_dict))
b_s = a_s.T.astype(np.float64)

# Extract Document index
selected_idex = [x for x in list(four.index) if x not in empty_idx]

# Obtain remaining terms
words = [] 
for i,j in enumerate(id2word):
    words.append(id2word[i])

# Create a dataframe for the document-term matrix
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)
print(b_ss.shape)
print(b_ss)

(12507, 2043)
       beatosu  best  colleg  field  footbal  goblu  leav  rivalri  \
1          1.0   1.0     1.0    1.0      1.0    1.0   1.0      1.0   
2          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
4          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
5          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
7          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
...        ...   ...     ...    ...      ...    ...   ...      ...   
12775      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12776      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12777      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12778      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12779      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   

       umichfootbal  ahead  ...  antil  lesser  uptick  dorian  \
1               1.0    0.0  ...    0.0     0.0     0.0     0.0   
2            

In [79]:
# obtain attributes for the remaining tweet 
four_after = four.drop(empty_idx, axis=0)

tweets_processed = []
for i in data_stemming1:
    tweet = ' '.join(i)
    tweets_processed.append(tweet)

four_after['tweets_processed'] = list(tweets_processed)

## four_LDA

#### - num of cluster = 4

In [82]:
start_time = time.time()
lda_model = gensim.models.ldamodel.LdaModel(passes=10,corpus=corpus,id2word=id2word,num_topics = 4, random_state = 44)
print("--- %s seconds ---" % (time. time() - start_time))

df_doc_topic = pd.DataFrame(columns = ['Dominant_Topic'])
for i in range(len(corpus)):
    a = lda_model.get_document_topics(corpus[i])
    a.sort(key = lambda x: x[1], reverse=True) #highest prob topic first
    row = [a[0][0]]
    df_doc_topic.loc[i] = row
pred_LDA = list(df_doc_topic["Dominant_Topic"])

--- 10.922403335571289 seconds ---


In [83]:
LDA_four_pred = pd.DataFrame({'LDA_four_pred': pred_LDA})
# LDA_four_pred.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/LDA_four_pred_contingency.csv")

#### - Evaluation

In [14]:
True_Label = list(four_after["user_id_new"])
Correct_target = pd.DataFrame(four_after[["screen_name", "user_id_new"]])

In [17]:
LDA_purity = []
LDA_AMI = []
LDA_ARI = []

for i in range(0,68):
    num_cluster = i+3
    lda_model = gensim.models.ldamodel.LdaModel(passes=10,corpus=corpus,id2word=id2word,num_topics = num_cluster, random_state = 44)
    df_doc_topic = pd.DataFrame(columns = ['Dominant_Topic'])
    for i in range(len(corpus)):
        a = lda_model.get_document_topics(corpus[i])
        a.sort(key = lambda x: x[1], reverse=True) #highest prob topic first
        row = [a[0][0]]
        df_doc_topic.loc[i] = row
    pred_LDA = list(df_doc_topic["Dominant_Topic"])
    
    # purity
    df_compare = pd.concat([df_doc_topic, Correct_target.reindex(df_doc_topic.index)], axis=1)
    nominator = df_compare.groupby(["Dominant_Topic", "user_id_new"], as_index=False)['screen_name'].count().sort_values('screen_name', ascending=False).drop_duplicates('Dominant_Topic')["screen_name"].sum()
    purity = nominator/len(True_Label)
    LDA_purity.append(purity)
    
    # AMI
    AMI = adjusted_mutual_info_score(True_Label, pred_LDA)
    LDA_AMI.append(AMI)
    
    # ARI
    ARI = adjusted_rand_score(True_Label, pred_LDA)
    LDA_ARI.append(ARI)

In [18]:
num_cluster = []
for i in range(0,68):
    num_cluster.append(i+3)

In [19]:
LDA_evaluation_metrics = {'num_cluster': num_cluster,'Purity': LDA_purity, 'AMI': LDA_AMI, 'ARI': LDA_ARI}
LDA_evaluation_metrics = pd.DataFrame(data = LDA_evaluation_metrics)
LDA_evaluation_metrics 

Unnamed: 0,num_cluster,Purity,AMI,ARI
0,3,0.587271,0.536204,0.484470
1,4,0.880307,0.840129,0.880648
2,5,0.854242,0.737149,0.754559
3,6,0.840010,0.717206,0.766720
4,7,0.794915,0.634953,0.676926
...,...,...,...,...
63,66,0.629168,0.212606,0.099605
64,67,0.635164,0.217742,0.127718
65,68,0.638842,0.212062,0.103063
66,69,0.644279,0.225408,0.123529


In [20]:
# LDA_evaluation_metrics.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/LDA_evaluation_metrics_four.csv")

## four_NMF

#### - num of cluster = 4

In [90]:
texts = four_after['tweets_processed']
start_time = time.time()
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(texts)
nmf = NMF(n_components=4, random_state=44).fit(tfidf)
nmf_output = nmf.fit_transform(tfidf)
print("--- %s seconds ---" % (time. time() - start_time))

--- 0.3957340717315674 seconds ---


In [91]:
def show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20)        

In [92]:
estimated_labels = []
for i in nmf_output:
    i = list(i)
    index = i.index(max(i))
    estimated_labels.append(index)

In [93]:
NMF_four_pred = pd.DataFrame({'NMF_four_pred': estimated_labels})
# NMF_four_pred.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/NMF_four_pred_contingency.csv")

#### - Evaluation

In [25]:
NMF_purity = []
NMF_AMI = []
NMF_ARI = []

for i in range(0,68):
    num_cluster = i+3
    
    nmf = NMF(n_components=num_cluster, random_state=44).fit(tfidf)
    nmf_output = nmf.fit_transform(tfidf)
    
    estimate_NMF = []
    for j in nmf_output:
        j = list(j)
        index = j.index(max(j))
        estimate_NMF.append(index)
    
    # purity
    estimate_NMF_matrix = pd.DataFrame({'estimate_NMF': estimate_NMF})
    df_compare = pd.concat([estimate_NMF_matrix, Correct_target], axis=1)
    numerator = df_compare.groupby(['estimate_NMF', "user_id_new"], as_index=False)['screen_name'].count().sort_values('screen_name', ascending=False).drop_duplicates('estimate_NMF')["screen_name"].sum()
    purity = numerator/len(True_Label)
    NMF_purity.append(purity)
    
    # AMI
    AMI = adjusted_mutual_info_score(True_Label, estimate_NMF)
    NMF_AMI.append(AMI)
    
    # ARI
    ARI = adjusted_rand_score(True_Label, estimate_NMF)
    NMF_ARI.append(ARI)

In [26]:
num_cluster = []
for i in range(0,68):
    num_cluster.append(i+3)

In [27]:
NMF_evaluation_metrics = {'num_cluster': num_cluster,'Purity': NMF_purity, 'AMI': NMF_AMI, 'ARI': NMF_ARI}
NMF_evaluation_metrics = pd.DataFrame(data = NMF_evaluation_metrics)
NMF_evaluation_metrics 

Unnamed: 0,num_cluster,Purity,AMI,ARI
0,3,0.660590,0.686622,0.549704
1,4,0.856960,0.795673,0.817598
2,5,0.855361,0.757928,0.770512
3,6,0.863117,0.733888,0.709325
4,7,0.842088,0.673998,0.637351
...,...,...,...,...
63,66,0.764612,0.339966,0.079186
64,67,0.769649,0.344987,0.079101
65,68,0.755977,0.335434,0.075754
66,69,0.757736,0.333756,0.074803


In [28]:
# NMF_evaluation_metrics.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/NMF_evaluation_metrics_four.csv")

# 2. "jobs" Tweets

In [29]:
# Import Dataset
jobs = pd.read_csv("jobs_tweets_sampled_three_month.csv")
del jobs['Unnamed: 0']

print(jobs.shape)
jobs.head

(27900, 4)


<bound method NDFrame.head of                       time                                               text  \
0      2009-08-01 10:25:36  Now Hiring:  Storage Architect II http://bit.l...   
1      2009-08-01 22:57:06  "The Steve Jobs method" discussion on Hacker N...   
2      2009-08-01 23:27:08  AZ Jobs | Taco Bell Restaurant General Manager...   
3      2009-08-01 09:55:12  TN Jobs | SLP Travel Job in Knoxville Area, TN...   
4      2009-08-01 05:58:39  NJ Jobs | New Jersey Travel or Perm job- OT at...   
...                    ...                                                ...   
27895  2009-11-01 02:15:14  these guys have to wake up. make him work alre...   
27896  2009-11-01 03:04:26  Therapy Jobs at HCR! Physical Therapist / PT -...   
27897  2009-11-01 00:21:24              hospitality jobs http://bit.ly/3XvUT1   
27898  2009-11-01 03:26:41  Obama Tempers Economic News With Caution On Jo...   
27899  2009-11-01 03:21:23  EXCITING, getting ready for my 1st job test =D...  

In [30]:
# Convert to lowercase and convert to list
data = jobs.text.str.lower().values.tolist()
data = [preprocessingFunctions.preProcessingFcn(tweet) for tweet in data]

In [31]:
# tokenize the tweets and remove punctuations
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [32]:
# Remove Stop Words
stop_words = stopwords.words('english')
data_words_unigrams = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]

In [33]:
# Stemming
data = []
for i in data_words_unigrams:
    tweet = ' '.join(i)
    data.append(tweet)

data_stemming = [preprocessingFunctions.stemming(tweet) for tweet in data]

data_stemming_temp = []
for i in data_stemming:
    alist = i.split()
    data_stemming_temp.append(alist)
    
data_stemming = data_stemming_temp

In [34]:
# Remove 90% of the least frequent words
words_dict, data_stemming1, empty_idx1 = preprocessingFunctions.trim_noise(data_stemming, 90)

Proportion of remaining tweets w.r.t. original tweets: 99.99%
Proportion of removed tweets w.r.t. original tweets: 0.01%


In [35]:
# The lowest word frequency in the remaining tweets 
min(words_dict.values())

16

In [36]:
# print the removed tweets 
for i in empty_idx1:
    print(jobs.iloc[[i]].text)

282    http://bit.ly/rXYm5 :: e_jobs: &#10148;Concurs...
Name: text, dtype: object
13865    legitimate_telecommute_jobs  http://bit.ly/16tkOq
Name: text, dtype: object


In [37]:
#######################################
##### Create document-term matrix #####
#######################################

# Create Dictionary
id2word = corpora.Dictionary(data_stemming1)

# Create Corpus
texts = data_stemming1

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

a_s = gensim.matutils.corpus2dense(corpus, num_terms = len(words_dict))
b_s = a_s.T.astype(np.float64)

# Extract Document index
selected_idex = [x for x in list(jobs.index) if x not in empty_idx1]

# Obtain remaining terms
words = [] 
for i,j in enumerate(id2word):
    words.append(id2word[i])

# Create a dataframe
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)
print(b_ss.shape)
print(b_ss)

(27898, 2136)
       architect  hire   ii  job  discuss  news  steve  via   az  azjob  ...  \
0            1.0   1.0  1.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
1            0.0   0.0  0.0  1.0      1.0   1.0    1.0  1.0  0.0    0.0  ...   
2            0.0   1.0  0.0  2.0      0.0   0.0    0.0  0.0  2.0    1.0  ...   
3            0.0   1.0  0.0  3.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
4            0.0   1.0  0.0  3.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
...          ...   ...  ...  ...      ...   ...    ...  ...  ...    ...  ...   
27895        0.0   0.0  0.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27896        0.0   0.0  0.0  2.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27897        0.0   0.0  0.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27898        0.0   0.0  0.0  1.0      0.0   2.0    0.0  0.0  0.0    0.0  ...   
27899        0.0   0.0  0.0  2.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   

       pogu  decis  airwa

In [39]:
# Remove the words that appear in >= 80% of the tweets
word_dict, data_stemming2, b_ss_f, empty_idx2 = preprocessingFunctions.trim_common(b_ss, 80, data_stemming1)
print(b_ss_f.shape)

Proportion of remaining tweets w.r.t. original tweets: 99.45%
Proportion of removed tweets w.r.t. original tweets: 0.55%
(27744, 2135)


In [40]:
# Obtain the idex of all empty tweets after pre-processing
empty_idx = empty_idx1 + empty_idx2

In [42]:
jobs_after = jobs.drop(empty_idx, axis=0)

tweets_processed = []
for i in data_stemming2:
    tweet = ' '.join(i)
    tweets_processed.append(tweet)

jobs_after['tweets_processed'] = list(tweets_processed)

## jobs_LDA 

#### - num of cluster = 5

In [16]:
# Create new Dictionary
id2word = corpora.Dictionary(data_stemming2)

# Create new Corpus
texts = data_stemming2

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [17]:
lda_model = gensim.models.ldamodel.LdaModel(passes=10,corpus=corpus,id2word=id2word,num_topics = 5, random_state = 44)
df_doc_topic = pd.DataFrame(columns = ['Dominant_Topic'])
for i in range(len(corpus)):
    a = lda_model.get_document_topics(corpus[i])
    a.sort(key = lambda x: x[1], reverse=True) #highest prob topic first
    row = [a[0][0]]
    df_doc_topic.loc[i] = row
pred_LDA = list(df_doc_topic["Dominant_Topic"])

In [18]:
LDA_jobs_pred = pd.DataFrame({'LDA_jobs_pred': pred_LDA})
# LDA_jobs_pred.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/LDA_jobs_pred_contingency.csv")

#### - Evaluation

In [18]:
jobs_after = pd.read_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P3_Proposed_workflow/PW_files/doc_metadata_stemming_jobs_hclabeled.csv")
del jobs_after['Unnamed: 0']

print(jobs_after.shape)
jobs_after.head

(27744, 7)


<bound method NDFrame.head of                       time                                               text  \
0      2009-08-01 10:25:36  Now Hiring:  Storage Architect II http://bit.l...   
1      2009-08-01 22:57:06  "The Steve Jobs method" discussion on Hacker N...   
2      2009-08-01 23:27:08  AZ Jobs | Taco Bell Restaurant General Manager...   
3      2009-08-01 09:55:12  TN Jobs | SLP Travel Job in Knoxville Area, TN...   
4      2009-08-01 05:58:39  NJ Jobs | New Jersey Travel or Perm job- OT at...   
...                    ...                                                ...   
27739  2009-11-01 02:15:14  these guys have to wake up. make him work alre...   
27740  2009-11-01 03:04:26  Therapy Jobs at HCR! Physical Therapist / PT -...   
27741  2009-11-01 00:21:24              hospitality jobs http://bit.ly/3XvUT1   
27742  2009-11-01 03:26:41  Obama Tempers Economic News With Caution On Jo...   
27743  2009-11-01 03:21:23  EXCITING, getting ready for my 1st job test =D...  

In [19]:
True_Label = list(jobs_after["category"])
Correct_target = pd.DataFrame(jobs_after[["category", "label"]])

In [34]:
LDA_purity = []
LDA_AMI = []
LDA_ARI = []

for i in range(0,68):
    num_cluster = i+3
    lda_model = gensim.models.ldamodel.LdaModel(passes=10,corpus=corpus,id2word=id2word,num_topics = num_cluster, random_state = 44)
    df_doc_topic = pd.DataFrame(columns = ['Dominant_Topic'])
    for i in range(len(corpus)):
        a = lda_model.get_document_topics(corpus[i])
        a.sort(key = lambda x: x[1], reverse=True) #highest prob topic first
        row = [a[0][0]]
        df_doc_topic.loc[i] = row
    pred_LDA = list(df_doc_topic["Dominant_Topic"])
    
    # purity
    df_compare = pd.concat([df_doc_topic, Correct_target.reindex(df_doc_topic.index)], axis=1)
    nominator = df_compare.groupby(["Dominant_Topic", "label"], as_index=False)['category'].count().sort_values('category', ascending=False).drop_duplicates('Dominant_Topic')["category"].sum()
    purity = nominator/len(True_Label)
    LDA_purity.append(purity)
    
    # AMI
    AMI = adjusted_mutual_info_score(True_Label, pred_LDA)
    LDA_AMI.append(AMI)
    
    # ARI
    ARI = adjusted_rand_score(True_Label, pred_LDA)
    LDA_ARI.append(ARI)

In [35]:
num_cluster = []
for i in range(0,68):
    num_cluster.append(i+3)

In [36]:
LDA_evaluation_metrics = {'num_cluster': num_cluster,'Purity': LDA_purity, 'AMI': LDA_AMI, 'ARI': LDA_ARI}
LDA_evaluation_metrics = pd.DataFrame(data = LDA_evaluation_metrics)
LDA_evaluation_metrics 

Unnamed: 0,num_cluster,Purity,AMI,ARI
0,3,0.593137,0.273284,0.175242
1,4,0.593137,0.224182,0.176947
2,5,0.559436,0.166230,0.123447
3,6,0.554606,0.160758,0.129027
4,7,0.588091,0.166305,0.110241
...,...,...,...,...
63,66,0.582829,0.100485,0.016178
64,67,0.582108,0.101770,0.017655
65,68,0.589028,0.107685,0.019478
66,69,0.604924,0.112639,0.018889


In [38]:
# LDA_evaluation_metrics.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/LDA_jobs_evaluation_metrics_raw_stemming.csv")

## jobs_NMF

#### - num of cluster = 5

In [43]:
texts = jobs_after['tweets_processed']
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(texts)
nmf = NMF(n_components=5, random_state=44).fit(tfidf)
nmf_output = nmf.fit_transform(tfidf)

In [20]:
def show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20)        

In [21]:
estimated_labels = []
for i in nmf_output:
    i = list(i)
    index = i.index(max(i))
    estimated_labels.append(index)

In [22]:
NMF_jobs_pred = pd.DataFrame({'NMF_jobs_pred': estimated_labels})
# NMF_jobs_pred.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/NMF_jobs_pred_contingency.csv")

#### - Evaluation

In [41]:
NMF_purity = []
NMF_AMI = []
NMF_ARI = []

for i in range(0,68):
    num_cluster = i+3
    
    nmf = NMF(n_components=num_cluster, random_state=44).fit(tfidf)
    nmf_output = nmf.fit_transform(tfidf)
    
    estimate_NMF = []
    for j in nmf_output:
        j = list(j)
        index = j.index(max(j))
        estimate_NMF.append(index)
    
    # purity
    estimate_NMF_matrix = pd.DataFrame({'estimate_NMF': estimate_NMF})
    df_compare = pd.concat([estimate_NMF_matrix, Correct_target], axis=1)
    numerator = df_compare.groupby(['estimate_NMF', "label"], as_index=False)['category'].count().sort_values('category', ascending=False).drop_duplicates('estimate_NMF')["category"].sum()
    purity = numerator/len(True_Label)
    NMF_purity.append(purity)
    
    # AMI
    AMI = adjusted_mutual_info_score(True_Label, estimate_NMF)
    NMF_AMI.append(AMI)
    
    # ARI
    ARI = adjusted_rand_score(True_Label, estimate_NMF)
    NMF_ARI.append(ARI)

In [44]:
num_cluster = []
for i in range(0,68):
    num_cluster.append(i+3)

In [45]:
NMF_evaluation_metrics = {'num_cluster': num_cluster,'Purity': NMF_purity, 'AMI': NMF_AMI, 'ARI': NMF_ARI}
NMF_evaluation_metrics = pd.DataFrame(data = NMF_evaluation_metrics)
NMF_evaluation_metrics 

Unnamed: 0,num_cluster,Purity,AMI,ARI
0,3,0.506416,0.078822,0.110822
1,4,0.546785,0.192524,0.193621
2,5,0.563365,0.213600,0.259313
3,6,0.550029,0.201867,0.243048
4,7,0.563437,0.195759,0.176873
...,...,...,...,...
63,66,0.668325,0.161217,0.028121
64,67,0.683499,0.172003,0.033676
65,68,0.671749,0.165080,0.032254
66,69,0.678417,0.165137,0.029846


In [48]:
# NMF_evaluation_metrics.to_csv("/home/rep/scRNA-seq_clustering_to_Twitter/P2_scRNAseq_LDA_NMF/LDA_NMF/LDA_NMF_files/NMF_jobs_evaluation_metrics_raw_stemming.csv")