## Cleaning the data

### Read in raw files

In [1]:
import pandas as pd

In [2]:
# ensure you're working in the same directory
df = pd.read_csv("cf_partners_descriptions.csv")
df.head(15)

Unnamed: 0,Date,url,code,Company,Description
0,2011/05/19,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi..."
1,2011/10/05,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi..."
2,2012/02/29,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi..."
3,2012/06/26,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi..."
4,2015/11/10,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...
5,2016/03/17,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...
6,2016/09/11,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...
7,2014/07/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co..."
8,2015/02/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co..."
9,2015/11/10,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co..."


### Tokenize company descriptions

In [3]:
def simple_tokenizer(s):
    """Break str `s` into a list of str.
    1. `s` has all of its peripheral whitespace removed.
    1. `s` is downcased with `lower`.
    2. `s` is split on whitespace.
    3. For each token, any peripheral punctuation on it is stripped
       off. Punctuation is here defined by `string.punctuation`.
    Parameters
    ----------
    s : str
        The string to tokenize.
    Returns
    -------
    list of str
    """
    import string
    punct = string.punctuation
    digs = string.digits
    final_tokens = []
    tokens = s.lower().strip().split()
    for w in tokens:
        final_tokens.append(w.strip(punct).strip(digs))
    return final_tokens

In [4]:
df['preprocessed_desc'] = df['Description'].apply(simple_tokenizer)
df.head(15)

Unnamed: 0,Date,url,code,Company,Description,preprocessed_desc
0,2011/05/19,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili..."
1,2011/10/05,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili..."
2,2012/02/29,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili..."
3,2012/06/26,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili..."
4,2015/11/10,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,..."
5,2016/03/17,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,..."
6,2016/09/11,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,..."
7,2014/07/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon..."
8,2015/02/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon..."
9,2015/11/10,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon..."


### Remove stopwords

In [5]:
import nltk
from nltk.corpus import stopwords
stopword_set=set(stopwords.words('english'))
stopword_set.add("com")
stopword_set.add("www")
stopword_set.add("")

In [6]:
def remove_stops(word_list,stopword_set):
    processed = []
    for word in word_list:
        if word not in stopword_set:
            processed.append(word)
    return processed

In [7]:
df['tokenized_wo_stopwords'] = df.apply(lambda row: remove_stops(row['preprocessed_desc'],stopword_set),axis=1)
df.head(15)

Unnamed: 0,Date,url,code,Company,Description,preprocessed_desc,tokenized_wo_stopwords
0,2011/05/19,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui..."
1,2011/10/05,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui..."
2,2012/02/29,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui..."
3,2012/06/26,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui..."
4,2015/11/10,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina..."
5,2016/03/17,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina..."
6,2016/09/11,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina..."
7,2014/07/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high..."
8,2015/02/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high..."
9,2015/11/10,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high..."


### Lemmatize words

In [8]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn

In [9]:
def to_lemmas(tokenized_sentence):
    lemmas = []
    lemmatizer = WordNetLemmatizer()
    tag_dict = {'V':wn.VERB,'J':wn.ADJ,'R':wn.ADV}
    for token, pos in pos_tag(tokenized_sentence):
        lemmas.append(lemmatizer.lemmatize(token,pos=(tag_dict[pos[0]] if pos[0] in tag_dict.keys() else wn.NOUN)))
    return lemmas

In [10]:
df['clean_lemmatized'] = df['tokenized_wo_stopwords'].apply(to_lemmas)
df.head(15)

Unnamed: 0,Date,url,code,Company,Description,preprocessed_desc,tokenized_wo_stopwords,clean_lemmatized
0,2011/05/19,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui...","[lm, stand, three, law, mobility, serve, guide..."
1,2011/10/05,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui...","[lm, stand, three, law, mobility, serve, guide..."
2,2012/02/29,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui...","[lm, stand, three, law, mobility, serve, guide..."
3,2012/06/26,3lm.com,3LM,3LM,"3LM stands for the Three Laws Of Mobility, whi...","[lm, stands, for, the, three, laws, of, mobili...","[lm, stands, three, laws, mobility, serve, gui...","[lm, stand, three, law, mobility, serve, guide..."
4,2015/11/10,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina...","[openstack, service, productor, provider, fina..."
5,2016/03/17,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina...","[openstack, service, productor, provider, fina..."
6,2016/09/11,99cloud.net,99cloud,99cloud,OpenStack service and productor provider to fi...,"[openstack, service, and, productor, provider,...","[openstack, service, productor, provider, fina...","[openstack, service, productor, provider, fina..."
7,2014/07/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high...","[technology, subsidiary, amazon, look, highly,..."
8,2015/02/17,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high...","[technology, subsidiary, amazon, look, highly,..."
9,2015/11/10,a9.com,A9,A9,"A9. com, a technology subsidiary of Amazon. co...","[a, com, a, technology, subsidiary, of, amazon...","[technology, subsidiary, amazon, looking, high...","[technology, subsidiary, amazon, look, highly,..."


## Topic models

### Bag of words

In [19]:
from gensim.corpora import Dictionary
# create unique ids for words
desc_dict = Dictionary(df['clean_lemmatized'])

In [22]:
# a list of (id, word count) for each row
desc_bow = [desc_dict.doc2bow(desc) for desc in df['clean_lemmatized']]

### LDA (Latent Dirchlet Allocation)

In [29]:
from gensim.models.ldamodel import LdaModel
desc_lda = LdaModel(desc_bow, num_topics = 10, id2word = desc_dict, passes=10, random_state=1)

In [33]:
topics = desc_lda.print_topics(num_topics=10,num_words=15)

In [56]:
topics = pd.DataFrame(topics, columns = ['topic_num','words'])
#topics[['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15']] = topics.words.str.split("+",expand=True,)
topics[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]] = topics.words.str.split("+",expand=True,)
topics

Unnamed: 0,topic_num,words,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,"0.016*""company"" + 0.013*""people"" + 0.011*""tech...","0.016*""company""","0.013*""people""","0.011*""technology""","0.010*""mobile""","0.009*""customer""","0.008*""connect""","0.008*""make""","0.008*""work""","0.007*""team""","0.007*""way""","0.007*""cloud""","0.007*""billion""","0.007*""venture""","0.006*""network""","0.006*""build"""
1,1,"0.019*""service"" + 0.016*""company"" + 0.014*""pro...","0.019*""service""","0.016*""company""","0.014*""product""","0.012*""technology""","0.010*""search""","0.010*""division""","0.010*""business""","0.010*""world's""","0.009*""google""","0.008*""device""","0.007*""tv""","0.007*""use""","0.006*""adap""","0.006*""provide""","0.006*""web"""
2,2,"0.034*""technology"" + 0.021*""service"" + 0.016*""...","0.034*""technology""","0.021*""service""","0.016*""company""","0.014*""ibm""","0.013*""consult""","0.013*""world's""","0.012*""solution""","0.012*""lead""","0.011*""computer""","0.010*""new""","0.009*""one""","0.009*""business""","0.009*""information""","0.009*""range""","0.009*""telecom"""
3,3,"0.036*""business"" + 0.019*""market"" + 0.019*""aut...","0.036*""business""","0.019*""market""","0.019*""automation""","0.016*""company""","0.016*""software""","0.015*""technology""","0.015*""yahoo""","0.012*""build""","0.010*""palantir""","0.010*""new""","0.009*""platform""","0.008*""search""","0.008*""life""","0.007*""share""","0.007*""core"""
4,4,"0.013*""global"" + 0.011*""financial"" + 0.010*""te...","0.013*""global""","0.011*""financial""","0.010*""technology""","0.010*""world""","0.010*""investment""","0.010*""customer""","0.009*""firm""","0.009*""new""","0.008*""employee""","0.008*""team""","0.008*""lead""","0.008*""company""","0.008*""market""","0.007*""mobile""","0.006*""billion"""
5,5,"0.014*""build"" + 0.013*""company"" + 0.011*""techn...","0.014*""build""","0.013*""company""","0.011*""technology""","0.010*""system""","0.008*""global""","0.008*""product""","0.007*""provide""","0.006*""large""","0.006*""help""","0.006*""game""","0.006*""trading""","0.005*""problem""","0.005*""people""","0.005*""security""","0.005*""million"""
6,6,"0.020*""data"" + 0.017*""company"" + 0.016*""servic...","0.020*""data""","0.017*""company""","0.016*""service""","0.014*""–""","0.013*""business""","0.012*""hitachi""","0.010*""system""","0.010*""use""","0.010*""software""","0.009*""large""","0.009*""technology""","0.009*""country""","0.008*""help""","0.007*""consumer""","0.007*""product"""
7,7,"0.014*""software"" + 0.014*""product"" + 0.013*""in...","0.014*""software""","0.014*""product""","0.013*""include""","0.012*""technology""","0.011*""business""","0.010*""management""","0.010*""company""","0.009*""enterprise""","0.009*""energy""","0.009*""stage""","0.009*""u""","0.009*""customer""","0.008*""service""","0.008*""new""","0.007*""capital"""
8,8,"0.014*""service"" + 0.013*""business"" + 0.012*""co...","0.014*""service""","0.013*""business""","0.012*""company""","0.010*""data""","0.010*""large""","0.008*"",""","0.008*""world""","0.008*""cloud""","0.008*""vmware""","0.008*""size""","0.008*""enable""","0.007*""america""","0.007*""one""","0.007*""customer""","0.007*""deliver"""
9,9,"0.019*""company"" + 0.015*""business"" + 0.013*""cu...","0.019*""company""","0.015*""business""","0.013*""customer""","0.013*""technology""","0.012*""service""","0.012*""product""","0.010*""market""","0.009*""worldwide""","0.009*""large""","0.007*""global""","0.006*""million""","0.006*""include""","0.006*""mobile""","0.006*""solution""","0.006*""platform"""


In [58]:
topics.melt(id_vars=['topic_num'], value_vars=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]).sort_values(by=['topic_num','variable'])

Unnamed: 0,topic_num,variable,value
0,0,1,"0.016*""company"""
10,0,2,"0.013*""people"""
20,0,3,"0.011*""technology"""
30,0,4,"0.010*""mobile"""
40,0,5,"0.009*""customer"""
50,0,6,"0.008*""connect"""
60,0,7,"0.008*""make"""
70,0,8,"0.008*""work"""
80,0,9,"0.007*""team"""
90,0,10,"0.007*""way"""


We find that there's significant overlap between a lot of these words, so we'll filter out the top 10 most common words to see if that makes the results more differentiable (as mentioned here https://nlp.stanford.edu/software/tmt/tmt-0.4/, although they do 30 -- this is a small corpus so we should start with 10). We also notice that many of the descriptions repeated over time, so running topic modeling on the unique descriptions might be a better way to go