# MAIN NOTEBOOK

#### Josh Gen (jdg9vr@virginia.edu) DS 5001 Spring 2023

In [1]:
import pandas as pd
import seaborn as sns
import nltk
nltk.download("stopwords")
import numpy as np
import re
from numpy.linalg import norm
from scipy.spatial.distance import pdist, squareform
from scipy.linalg import eigh
import plotly.express as px
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim.models import word2vec

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshgen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
OHCO = ['company_num', 'link_num', 'sent_num', 'token_num']

### F0

#### Source Format. The initial source format of a text, which varies by collection, e.g. XML (e.g. TEI and RSS), HTML, plain text (e.g. Gutenberg), JSON, and CSV.

In [3]:
df = pd.read_csv('./data/old_data/CORPUS.tar.gz', compression='gzip', lineterminator='\n')
df

Unnamed: 0,company_num,Text,characters
0,0,"Ahresty, with more than 60 years of experienc...",1709
1,0,"PRODUCTS Ahresty, with more than 60 years of e...",754
2,0,ENVIRONMENTAL,16
3,0,CONTACT Address Ahresty Wilmington Corporation...,439
4,1,Manufacturer ofMetal FastenersandGeneral Hardw...,1025
...,...,...,...
90628,1225,"Home•Careers Together, we build the future We...",2524
90629,1225,Privacy The protection of your personal data i...,12706
90630,1225,Signicast acquires European based CIREX 02.15....,5160
90631,1225,Email Protection You are unable to access this...,558


# Subset data

In [4]:
char_per_comp = df.groupby('company_num').sum('characters').sort_values('characters')
filtered_comps = char_per_comp[(char_per_comp['characters']<20000) & (char_per_comp['characters']>1000) & ~(char_per_comp['characters'].isna())]
filtered_comps

Unnamed: 0_level_0,characters
company_num,Unnamed: 1_level_1
138,1033
858,1035
1093,1035
766,1051
276,1061
...,...
882,19599
356,19617
639,19806
615,19918


In [5]:
companies = filtered_comps.sample(150, random_state=12341).index
filtered_data = df[df['company_num'].isin(companies)]
print("Number of total Companies:", len(filtered_data))

Number of total Companies: 1105


In [6]:
# Displays the first link's text of each company.
filtered_data.groupby('company_num').first()

Unnamed: 0_level_0,Text,characters
company_num,Unnamed: 1_level_1,Unnamed: 2_level_1
3,"3D Prototyping With 3D printing technology, pr...",3279
10,"Congress Drives, established in 1915, is the l...",734
33,1 Single Source ProviderFor A Complete Solutio...,904
34,Engineering Excellence in Zinc Die Cast Manu...,1572
49,The Heavy Metal Company Limited has been casti...,10199
...,...,...
1191,Open The Door Products Soft Close Magnetic Cat...,987
1200,Canterbury Aluminium | Exceptional Windows & D...,1907
1201,About Pioneer Venture Our Products Why US? Pro...,59
1216,"Our Company For over 35 years, the Houston An...",1469


In [7]:
filtered_data.to_csv('./data/raw_text.csv', index=False)

### F1

#### Machine Learning Corpus Format (MLCF). Ideally a table of minimum discursive units indexed by document content hierarchy.

In [8]:
df = pd.read_csv('./data/raw_text.csv', lineterminator='\n')

In [9]:
df

Unnamed: 0,company_num,Text,characters
0,3,"3D Prototyping With 3D printing technology, pr...",3279
1,10,"Congress Drives, established in 1915, is the l...",734
2,10,Variable Pitch Pulleys Congress Drives Variabl...,284
3,10,"Custom Die Cast Components Today, we serve Nor...",336
4,10,Manufacturing Capabilities Congress Drives con...,429
...,...,...,...
1100,1216,Quality Control Quality Control Houston state...,1759
1101,1216,"Testimonials Testimonials ""The business ethic...",570
1102,1222,HOME ABOUT PENTACAST SERVICES CONTACT More Pen...,352
1103,1222,HOME ABOUT PENTACAST SERVICES CONTACT More SER...,5105


# Create LIB

In [10]:
LIB = df[['company_num', 'characters']].groupby('company_num').agg(['sum', 'count'])['characters'].reset_index()\
.rename(columns={'sum':'total_characters', 'count':'total_links'})
LIB

Unnamed: 0,company_num,total_characters,total_links
0,3,3279,1
1,10,5594,11
2,33,4808,6
3,34,9668,7
4,49,10199,1
...,...,...,...
145,1191,17474,17
146,1200,18403,9
147,1201,1405,10
148,1216,13623,9


In [290]:
LIB.to_csv('./data/LIB.csv', index=False)

# Create DOCS

In [11]:
# Add link count column
df['link_num'] = df.groupby('company_num').cumcount()

DOCS = df[["company_num", "link_num" ,"Text", "characters"]]
DOCS = DOCS.rename(columns={'company_num': 'company_id'})
DOCS = DOCS.rename(columns={'Text': 'text'})
DOCS = DOCS.set_index(["company_id", "link_num"])
DOCS

Unnamed: 0_level_0,Unnamed: 1_level_0,text,characters
company_id,link_num,Unnamed: 2_level_1,Unnamed: 3_level_1
3,0,"3D Prototyping With 3D printing technology, pr...",3279
10,0,"Congress Drives, established in 1915, is the l...",734
10,1,Variable Pitch Pulleys Congress Drives Variabl...,284
10,2,"Custom Die Cast Components Today, we serve Nor...",336
10,3,Manufacturing Capabilities Congress Drives con...,429
...,...,...,...
1216,7,Quality Control Quality Control Houston state...,1759
1216,8,"Testimonials Testimonials ""The business ethic...",570
1222,0,HOME ABOUT PENTACAST SERVICES CONTACT More Pen...,352
1222,1,HOME ABOUT PENTACAST SERVICES CONTACT More SER...,5105


In [292]:
DOCS.to_csv('./data/DOCS.csv')

### F2
: Convert the collection from their source formats (F0) into a set of tables that conform to the Standard Text Analytic Data Model

#### Standard Text Analytic Data Model (STADM). A normalized set of tables including DOC, TOKEN, and TERM tables. Produced by the tokenization of F1 data.

# Create SENTS

#### 2. SENTS

In [12]:
## SENTS
sent_pat = r'[.?!;:]+'
SENTS = DOCS['text'].str.split(sent_pat, expand=True).stack().to_frame('sent_str')
SENTS.index.names = ["company_id", "link_num", "sent_num"]
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
company_id,link_num,sent_num,Unnamed: 3_level_1
3,0,0,"3D Prototyping With 3D printing technology, pr..."
3,0,1,Utilizing rapid prototyping through 3D printi...
3,0,2,Tooling Equipped with onsite tool room facili...
3,0,3,Production Our unique and custom built Zinc d...
3,0,4,Our highly skilled production staff inspect a...
...,...,...,...
1222,2,6,© 2017 by PentaCast Inc
1222,2,7,Tel
1222,2,8,519
1222,2,9,245


In [294]:
SENTS.to_csv('./data/SENTS.csv')

# Create TOKENS/CORPUS

#### 3. TOKENS

### F3 
: NLP Annotated STADM. STADM with annotations added to token and term records indicating stopwords, parts-of-speech, stems and lemmas, named entities, grammatical dependencies, sentiments, etc.

In [13]:
keep_whitespace = True

In [14]:
%%time
if keep_whitespace:
    TOKENS = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
else:
    TOKENS = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')



CPU times: user 12.3 s, sys: 596 ms, total: 12.9 s
Wall time: 13 s


In [15]:
TOKENS.index.names = ["company_id", "link_num", "sent_num", "token_num"]
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple
company_id,link_num,sent_num,token_num,Unnamed: 4_level_1
3,0,0,0,"(3D, CD)"
3,0,0,1,"(Prototyping, VBG)"
3,0,0,2,"(With, IN)"
3,0,0,3,"(3D, CD)"
3,0,0,4,"(printing, VBG)"
...,...,...,...,...
1222,2,6,4,"(Inc, NNP)"
1222,2,7,0,"(Tel, NN)"
1222,2,8,0,"(519, CD)"
1222,2,9,0,"(245, CD)"


In [16]:
import pandas as pd

def clean_tokens(df):
    # Extract the token and filter out unwanted tokens
    df['token'] = df['pos_tuple'].apply(lambda x: x[0])
    df['pos'] = df['pos_tuple'].apply(lambda x: x[1])
    df = df[(df['token'].str.isalpha()) & (df['pos'].str.startswith('N')) & (~df['token'].str.isnumeric()) & (df['token'].str.len() <= 15)]

    # Reset the index and set the desired multi-level index
    df = df.reset_index().set_index(['company_id', 'link_num', 'sent_num', 'token_num'])
    
    df = df.drop(columns=['token'])
    
    return df

TOKENS = clean_tokens(TOKENS)
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos
company_id,link_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
3,0,0,5,"(technology, NN)",NN
3,0,0,8,"(designs, NNS)",NNS
3,0,0,16,"(tolerance, NN)",NN
3,0,0,18,"(size, NN)",NN
3,0,0,22,"(tool, NN)",NN
...,...,...,...,...,...
1222,2,4,0,"(Success, NN)",NN
1222,2,5,1,"(message, NN)",NN
1222,2,6,3,"(PentaCast, NNP)",NNP
1222,2,6,4,"(Inc, NNP)",NNP


In [17]:
%%time
TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
TOKENS['term_str'] = TOKENS.token_str.str.lower()
TOKENS

CPU times: user 49.1 ms, sys: 2.2 ms, total: 51.3 ms
Wall time: 50.9 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str
company_id,link_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,0,0,5,"(technology, NN)",NN,technology,technology
3,0,0,8,"(designs, NNS)",NNS,designs,designs
3,0,0,16,"(tolerance, NN)",NN,tolerance,tolerance
3,0,0,18,"(size, NN)",NN,size,size
3,0,0,22,"(tool, NN)",NN,tool,tool
...,...,...,...,...,...,...,...
1222,2,4,0,"(Success, NN)",NN,Success,success
1222,2,5,1,"(message, NN)",NN,message,message
1222,2,6,3,"(PentaCast, NNP)",NNP,PentaCast,pentacast
1222,2,6,4,"(Inc, NNP)",NNP,Inc,inc


In [300]:
# SAVE TOKENS TABLE
TOKENS.to_csv("./data/TOKENS.csv")

# Create VOCAB

In [18]:
%%time
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['max_pos'] = TOKENS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB

CPU times: user 74 ms, sys: 7.86 ms, total: 81.9 ms
Wall time: 87.4 ms


Unnamed: 0_level_0,n,p,i,n_chars,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
quality,747,0.009743,6.681354,7,NN
casting,721,0.009404,6.732463,7,NNP
products,668,0.008713,6.842614,8,NNS
castings,510,0.006652,7.231965,8,NNS
aluminum,476,0.006209,7.331500,8,NNP
...,...,...,...,...,...
directement,1,0.000013,16.226318,11,NN
tosemco,1,0.000013,16.226318,7,NN
guests,1,0.000013,16.226318,6,NNS
webwall,1,0.000013,16.226318,7,NNP


In [105]:
# SAVE VOCAB TABLE
VOCAB.to_csv("./data/VOCAB.csv")

### F4 
: STADM with Vector Space models. Vector space representations of TOKEN data and resulting statistical data, such as term frequency and TFIDF.

In [19]:
VOCAB['n_pos'] = TOKENS[['term_str','pos']].value_counts().unstack().count(1)
VOCAB['cat_pos'] = TOKENS[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))

In [20]:
VOCAB

Unnamed: 0_level_0,n,p,i,n_chars,max_pos,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
quality,747,0.009743,6.681354,7,NN,2,"{NN, NNP}"
casting,721,0.009404,6.732463,7,NNP,2,"{NN, NNP}"
products,668,0.008713,6.842614,8,NNS,3,"{NNPS, NNS, NNP}"
castings,510,0.006652,7.231965,8,NNS,3,"{NNPS, NNS, NNP}"
aluminum,476,0.006209,7.331500,8,NNP,2,"{NN, NNP}"
...,...,...,...,...,...,...,...
directement,1,0.000013,16.226318,11,NN,1,{NN}
tosemco,1,0.000013,16.226318,7,NN,1,{NN}
guests,1,0.000013,16.226318,6,NNS,1,{NNS}
webwall,1,0.000013,16.226318,7,NNP,1,{NNP}


In [21]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

# Create BOW

In [22]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

BOW = create_bow(TOKENS, ['company_id'])
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n
company_id,term_str,Unnamed: 2_level_1
3,absence,1
3,art,1
3,auckland,1
3,australia,1
3,back,1
...,...,...
1222,wall,1
1222,works,1
1222,x,3
1222,years,1


In [386]:
BOW.to_csv('./data/BOW.csv')

# Create TFIDF and DFIDF

In [23]:
def get_tfidf_dfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    '''
    The purpose of this function is to calculate TFIDF and DFIDF for a given BOW representation of a CORPUS.
    
    INPUT:
        BOW           dataframe of a bag of words representation of a corpus
        tf_method     method for calculating term frequency, string
        df_method     method for calculating document frequency, string
        item_type     item type
        
    OUTPUT:
        TFIDF         dataframe of term frequency inverse document frequency for the corpus
        DFIDF         dataframe of document frequency inverse document frequency for the corpus
    '''
            
    DTCM = BOW.n.unstack() # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count() # Assumes NULLs 
    N_docs = len(DTCM)
    
    if df_method == 'standard':
        IDF = np.log10(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log10(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log10(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log10((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")
    
    TFIDF = TF * IDF
    
    DFIDF = DF * IDF
    
    TFIDF = TFIDF.fillna(0)

    return TFIDF, DFIDF

In [24]:
TFIDF, DFIDF = get_tfidf_dfidf(BOW)

In [25]:
TFIDF

term_str,a,aacco,aact,aashto,ab,abandonments,abb,abbotsford,abel,abilities,...,équipements,ï,čˢ,ġ,ǀ,ǚ,ǟٽ,δ,ӌ,ӕހi
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.053184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,0.041788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,0.165334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1200,0.054846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1201,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1216,0.007313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [309]:
TFIDF.to_csv('./data/TFIDF.csv')

In [26]:
VOCAB['dfidf'] = DFIDF
VOCAB['mean_tfidf'] = TFIDF.mean()

In [27]:
VOCAB.sort_values('mean_tfidf', ascending=False)

Unnamed: 0_level_0,n,p,i,n_chars,max_pos,n_pos,cat_pos,stop,dfidf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
casting,721,0.009404,6.732463,7,NNP,2,"{NN, NNP}",0,23.363600,0.059843
castings,510,0.006652,7.231965,8,NNS,3,"{NNPS, NNS, NNP}",0,23.789373,0.052503
die,341,0.004448,7.812690,3,NNP,3,"{NNS, NN, NNP}",0,23.095603,0.052346
foundry,281,0.003665,8.091892,7,NNP,3,"{NNS, NN, NNP}",0,23.436098,0.048414
inc,296,0.003861,8.016865,3,NNP,1,{NNP},0,23.894575,0.046943
...,...,...,...,...,...,...,...,...,...,...
friend,1,0.000013,16.226318,6,NN,1,{NN},0,2.176091,0.000107
foe,1,0.000013,16.226318,3,NN,1,{NN},0,2.176091,0.000107
tactic,1,0.000013,16.226318,6,NN,1,{NN},0,2.176091,0.000107
tailored,1,0.000013,16.226318,8,NNP,1,{NNP},0,2.176091,0.000107


In [28]:
# TOP 30 meaningful words based on mean_tfidf
VOCAB.sort_values('mean_tfidf', ascending=False).index[:30]

Index(['casting', 'castings', 'die', 'foundry', 'inc', 'aluminum', 'sand',
       'brass', 'com', 'cookies', 'bronze', 'zinc', 'cnc', 'machining',
       'precision', 'ltd', 'magnets', 'alloys', 'rights', 'site', 'metal',
       'parts', 'machines', 'solutions', 'design', 'bearing', 'services',
       'information', 'machine', 'copyright'],
      dtype='object', name='term_str')

In [393]:
VOCAB.to_csv('./data/VOCAB.csv')

# Create VIDX and MT

In [29]:
VIDX = VOCAB.sort_values('dfidf', ascending=False)\
    .head(1000).index

In [30]:
VIDX

Index(['technology', 'world', 'please', 'today', 'sales', 'components', 'use',
       'materials', 'email', 'delivery',
       ...
       'rod', 'any', 'signage', 'holes', 'itar', 'apply', 'weather', 'behalf',
       'draft', 'yield'],
      dtype='object', name='term_str', length=1000)

In [31]:
MT = TFIDF[VIDX].groupby('company_id').mean().fillna(0) # MUST FILLNA

In [32]:
MT

term_str,technology,world,please,today,sales,components,use,materials,email,delivery,...,rod,any,signage,holes,itar,apply,weather,behalf,draft,yield
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.217864,0.145243,0.000000,0.073950,0.000000,0.206332,0.000000,0.000000,0.000000,0.076681,...,0.0,0.000000,0.23299,0.000000,0.0,0.0,0.232990,0.0,0.000000,0.0
10,0.019806,0.019806,0.000000,0.060504,0.000000,0.075030,0.056272,0.018757,0.000000,0.020913,...,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
33,0.031123,0.000000,0.000000,0.000000,0.000000,0.088428,0.029476,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
34,0.089709,0.000000,0.000000,0.039150,0.013289,0.097097,0.012137,0.024274,0.000000,0.000000,...,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.041116,0.0
49,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.03039,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,0.000000,0.000000,0.023343,0.000000,0.008068,0.000000,0.022107,0.000000,0.007369,0.016432,...,0.0,0.000000,0.00000,0.024963,0.0,0.0,0.000000,0.0,0.000000,0.0
1200,0.000000,0.000000,0.040850,0.083193,0.000000,0.000000,0.000000,0.012896,0.000000,0.000000,...,0.0,0.043686,0.00000,0.000000,0.0,0.0,0.043686,0.0,0.000000,0.0
1201,0.000000,0.000000,0.000000,0.000000,0.000000,0.082533,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
1216,0.016340,0.010893,0.000000,0.000000,0.022591,0.005158,0.020633,0.025791,0.087691,0.000000,...,0.0,0.000000,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0


# Create L0, L1, L2

In [33]:
L0 = MT.astype('bool').astype('int') # Binary (Pseudo L)
L1 = MT.apply(lambda x: x / x.sum(), 1) # Manhattan (Probabilistic)
L2 = MT.apply(lambda x: x / norm(x), 1) # Euclidean

# Create PAIRS and CORR_MATRIX

In [34]:
PAIRS = 1 - MT.T.corr().stack().to_frame('correl')
PAIRS.index.names = ['doc_a','doc_b']
PAIRS = PAIRS.query("doc_a > doc_b") # Remove identities and reverse duplicates

general_method = 'weighted' # single, complete, average, weighted 
euclidean_method = 'ward' # ward, centroid, median
combos  = [
    (L2, 'euclidean', 'euclidean', euclidean_method),
    (MT,  'cosine', 'cosine', euclidean_method),
    (MT,  'cityblock', 'cityblock', general_method),
    (L0, 'jaccard', 'jaccard', general_method),
    (L1, 'jensenshannon', 'js', general_method),
]

for X, metric, label, _ in combos:
    PAIRS[label] = pdist(X, metric)

In [35]:
corr_type = 'kendall'
CORR_MATRIX = MT.T.corr(corr_type)

#LIB['kendall_sum'] = CORR_MATRIX.sum()

In [36]:
np.fill_diagonal(CORR_MATRIX.values, 0)

In [38]:
CORR_MATRIX.head()

company_id,3,10,33,34,49,58,63,66,77,81,...,1129,1142,1172,1180,1181,1191,1200,1201,1216,1222
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.225871,0.14616,0.155014,0.130344,0.095175,0.042894,0.133576,0.160151,0.040139,...,0.066787,0.045791,0.130567,0.127335,0.064808,0.060241,0.116861,0.139103,0.053791,0.156412
10,0.225871,0.0,0.171082,0.271149,0.053168,0.148526,0.047476,0.206881,0.224896,0.143307,...,0.151704,0.166338,0.188457,0.108727,0.096377,0.092509,0.101418,0.131702,0.162273,0.158236
33,0.14616,0.171082,0.0,0.171405,0.054977,0.146462,0.042639,0.104657,0.164025,0.163128,...,0.133221,0.166097,0.171313,0.129596,0.082735,0.050206,0.118617,0.135113,0.157306,0.1397
34,0.155014,0.271149,0.171405,0.0,0.069836,0.19116,0.067412,0.200808,0.203099,0.1402,...,0.159458,0.139171,0.188709,0.145376,0.080394,0.075721,0.115776,0.112152,0.177812,0.178404
49,0.130344,0.053168,0.054977,0.069836,0.0,0.129506,0.098425,0.072137,0.122069,0.065049,...,0.072439,0.096085,0.068235,0.090151,0.032194,0.017922,0.110902,0.089658,0.026574,0.032048


In [40]:
PAIRS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,correl,euclidean,cosine,cityblock,jaccard,js
doc_a,doc_b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,3,0.867608,1.274674,0.812397,19.651571,0.809524,0.720284
33,3,0.954771,1.332003,0.887116,23.193908,0.859551,0.756359
33,10,0.975524,1.267244,0.802954,20.749968,0.849138,0.728112
34,3,0.856885,1.337027,0.893821,20.578519,0.872832,0.762107
34,10,0.667347,1.336153,0.892653,24.535068,0.881818,0.7693


In [41]:
CORR_MATRIX.to_csv('./data/CORR_MATRIX.csv')
PAIRS.to_csv('./data/PAIRS.csv')

# Explore CORR_MATRIX

In [322]:
max_corr = CORR_MATRIX.max(axis=0)

In [323]:
max_corr_idx = CORR_MATRIX.idxmax(axis=0)

In [324]:
corr_pairs = pd.concat([max_corr_idx, max_corr], axis=1).rename(columns={0:'Max_id', 1:'Max_correlation'})
corr_pairs

Unnamed: 0_level_0,Max_id,Max_correlation
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,10,0.225871
10,730,0.281390
33,443,0.269674
34,10,0.271149
49,704,0.197382
...,...,...
1191,896,0.206361
1200,503,0.177177
1201,414,0.231855
1216,1142,0.241941


# PCA

In [325]:
# setup

norm_docs = True # This has the effect of exaggerating variance when False
center_term_vectors = True # This has the effect of demoting authorship when False

colors = "Spectral"

sns.set(style='ticks')

TFIDF_SMALL = TFIDF[VIDX]

In [326]:
# normalize doc vector lengths
TFIDF_L2 = (TFIDF_SMALL.T / norm(TFIDF_SMALL, 2, axis=1)).T

# center term vectors
TFIDF_L2 = TFIDF_L2 - TFIDF_L2.mean()

In [327]:
COV = TFIDF_L2.T.dot(TFIDF_L2) / (TFIDF_L2.shape[0] - 1)

In [328]:
eig_vals, eig_vecs = eigh(COV)

In [329]:
EIG_VEC = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
EIG_VAL = pd.DataFrame(eig_vals, index=COV.index, columns=['eig_val'])
EIG_VAL.index.name = 'term_str'

In [330]:
EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)

In [331]:
COMPS = EIG_PAIRS.sort_values('eig_val', ascending=False).head(10).reset_index(drop=True)
COMPS.index.name = 'comp_id'
COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
COMPS.index.name = 'pc_id'

In [332]:
TFIDF_L2

term_str,technology,world,please,today,sales,components,use,materials,email,delivery,...,rod,any,signage,holes,itar,apply,weather,behalf,draft,yield
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.099879,0.061886,-0.012543,0.025122,-0.020175,0.088139,-0.013366,-0.013573,-0.015839,0.029169,...,-0.001708,-0.001605,0.116552,-0.001762,-0.001866,-0.001529,0.116557,-0.001378,-0.002480,-0.001512
10,0.009435,0.009051,-0.012543,0.055200,-0.020175,0.066074,0.050226,0.007625,-0.015839,0.013091,...,-0.001708,-0.001605,-0.004108,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512
33,0.013407,-0.013332,-0.012543,-0.013174,-0.020175,0.056164,0.011594,-0.013573,-0.015839,-0.010543,...,-0.001708,-0.001605,-0.004108,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512
34,0.074543,-0.013332,-0.012543,0.025007,-0.007215,0.075981,-0.001529,0.010102,-0.015839,-0.010543,...,-0.001708,-0.001605,-0.004108,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,0.037619,-0.001512
49,-0.012948,-0.013332,-0.012543,-0.013174,-0.020175,-0.018715,-0.013366,-0.013573,-0.015839,-0.010543,...,-0.001708,-0.001605,0.031751,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,-0.012948,-0.013332,0.003094,-0.013174,-0.014770,-0.018715,0.001443,-0.013573,-0.010903,0.000465,...,-0.001708,-0.001605,-0.004108,0.014961,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512
1200,-0.012948,-0.013332,0.017242,0.047484,-0.020175,-0.018715,-0.013366,-0.004170,-0.015839,-0.010543,...,-0.001708,0.030247,-0.004108,-0.001762,-0.001866,-0.001529,0.027749,-0.001378,-0.002480,-0.001512
1201,-0.012948,-0.013332,-0.012543,-0.013174,-0.020175,0.045893,-0.013366,-0.013573,-0.015839,-0.010543,...,-0.001708,-0.001605,-0.004108,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512
1216,0.011689,0.003093,-0.012543,-0.013174,0.013886,-0.010938,0.017744,0.025315,0.116378,-0.010543,...,-0.001708,-0.001605,-0.004108,-0.001762,-0.001866,-0.001529,-0.004104,-0.001378,-0.002480,-0.001512


In [333]:
# get Document Component Matrix
DCM = TFIDF_L2.dot(COMPS[COV.index].T)

# add metadata for display purposes
# LIB_COLS = LIB.columns
# DCM = DCM.join(LIB[LIB_COLS], on='company_id')

# # define doc field to name each chapter
# DCM['doc'] = DCM.apply(lambda x: f"{x.title} {str(x.name[1]).zfill(2)}", 1)

In [334]:
DCM = DCM.reset_index()

In [335]:
LOADINGS = COMPS[COV.index].T
LOADINGS.index.name = 'term_str'

In [350]:
DCM.to_csv('./data/DCM.csv', index=False)

In [351]:
LOADINGS.to_csv('./data/LOADINGS.csv')

# LDA

In [42]:
colors = "YlGnBu"

# count vectorizer
ngram_range = (1,2)
max_features = 4000
stop_words = 'english'

# Latent Dirichlet Allocation
n_topics = 20
n_components = 20
max_iter = 5
learning_offset = 50
random_state = 0

# hyperparameters
n_top_terms = 7

BAG = ['company_id']

CORPUS_by_chaps = TOKENS[TOKENS.pos.str.match(r'^NNS?$')]\
    .groupby(BAG).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'doc_str'})

In [43]:
def create_lda_model(DOCS, max_features, ngram_range, stop_words, n_components, max_iter, learning_offset, random_state, n_topics, n_top_terms):
    
    # set the vectorizer engine and run the model
    count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
    count_model = count_engine.fit_transform(DOCS.doc_str)
    TERMS = count_engine.get_feature_names_out()
    
    # set VOCAB
    VOCAB = pd.DataFrame(index=TERMS)
    VOCAB.index.name = 'term_str'
    
    # Create DTM
    DTM = pd.DataFrame(count_model.toarray(), index=DOCS.index, columns=TERMS)
    
    VOCAB['doc_count'] = DTM.astype('bool').astype('int').sum()
    DOCS['term_count'] = DTM.sum(1)
             
    # run the LDA model  
    lda_engine = LDA(n_components=n_components, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
    TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
    
    lda_model = lda_engine.fit_transform(count_model)
    
    # Create THETA and PHI
    THETA = pd.DataFrame(lda_model, index=DOCS.index)
    THETA.columns.name = 'topic_id'
    THETA.columns = TNAMES
    
    PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
    PHI.index.name = 'topic_id'
    PHI.columns.name  = 'term_str'
    
    # Create topics
    TOPICS = PHI.stack().to_frame('topic_weight').groupby('topic_id')\
    .apply(lambda x: x.sort_values('topic_weight', ascending=False)\
        .head(n_top_terms).reset_index().drop('topic_id', axis=1)['term_str'])
    
    return VOCAB, DOCS, TNAMES, THETA, PHI, TOPICS

VOCAB_LDA, DOCS_LDA, TNAMES, THETA, PHI, TOPICS = create_lda_model(CORPUS_by_chaps, max_features, ngram_range, stop_words, n_components, max_iter, learning_offset, random_state, n_topics, n_top_terms)


In [47]:
VOCAB_LDA.to_csv('./data/VOCAB_LDA.csv')
DOCS_LDA.to_csv('./data/DOCS_LDA.csv')
TOPICS.to_csv('./data/TOPICS.csv')
THETA.to_csv('./data/THETA.csv')

# Word2Vec performed in seperate Notebook

# Sentiment Analysis performed in seperate Notebook