In [25]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
import textblob
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother. It helps you with heart problems."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice. Baseball if one of my brothers favorite activities"
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure. Also, driving may cause skin cancer."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health. It can also counter the negatives of stress."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

In [3]:
blob  = textblob.blob.TextBlob("".join(doc_set)) # tokenize the blob

In [4]:
blob.tokenize()

WordList(['Brocolli', 'is', 'good', 'to', 'eat', '.', 'My', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', ',', 'but', 'not', 'my', 'mother', '.', 'It', 'helps', 'you', 'with', 'heart', 'problems.My', 'mother', 'spends', 'a', 'lot', 'of', 'time', 'driving', 'my', 'brother', 'around', 'to', 'baseball', 'practice', '.', 'Baseball', 'if', 'one', 'of', 'my', 'brothers', 'favorite', 'activitiesSome', 'health', 'experts', 'suggest', 'that', 'driving', 'may', 'cause', 'increased', 'tension', 'and', 'blood', 'pressure', '.', 'Also', ',', 'driving', 'may', 'cause', 'skin', 'cancer.I', 'often', 'feel', 'pressure', 'to', 'perform', 'well', 'at', 'school', ',', 'but', 'my', 'mother', 'never', 'seems', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better.Health', 'professionals', 'say', 'that', 'brocolli', 'is', 'good', 'for', 'your', 'health', '.', 'It', 'can', 'also', 'counter', 'the', 'negatives', 'of', 'stress', '.'])

In [5]:
no_features=1000

In [6]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(doc_set)
tf_feature_names = tf_vectorizer.get_feature_names()

In [7]:
tf

<5x38 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>

In [8]:
tf_feature_names

['activities',
 'baseball',
 'better',
 'blood',
 'brocolli',
 'brother',
 'brothers',
 'cancer',
 'cause',
 'counter',
 'drive',
 'driving',
 'eat',
 'experts',
 'favorite',
 'feel',
 'good',
 'health',
 'heart',
 'helps',
 'increased',
 'likes',
 'lot',
 'mother',
 'negatives',
 'perform',
 'practice',
 'pressure',
 'problems',
 'professionals',
 'say',
 'school',
 'skin',
 'spends',
 'stress',
 'suggest',
 'tension',
 'time']

In [9]:
lda_model = LatentDirichletAllocation(n_topics=3, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)


In [10]:
lda_W = lda_model.transform(tf) # documents as the rows, topics as the columns
lda_H = lda_model.components_ # topics as the rows, words as the columns

In [11]:
lda_W

array([[ 0.02908081,  0.94264018,  0.02827901],
       [ 0.02893521,  0.94213988,  0.02892491],
       [ 0.02695629,  0.02646135,  0.94658236],
       [ 0.0416116 ,  0.91698643,  0.04140197],
       [ 0.92633112,  0.03686845,  0.03680043]])

In [12]:
lda_H[0]

array([ 0.95211094,  0.84706215,  0.96155505,  0.74732936,  1.08121608,
        0.8494759 ,  0.8734277 ,  0.82473556,  0.85012361,  1.11074376,
        0.93095234,  0.80411069,  0.64691762,  0.86434555,  0.99410168,
        0.71602261,  1.20676853,  1.4730662 ,  0.75393777,  0.68147528,
        0.9095761 ,  0.90751579,  0.74243208,  0.71969034,  1.04923259,
        0.78538113,  0.70509772,  0.80305647,  0.753064  ,  1.11442345,
        1.08382812,  0.84827887,  0.77094035,  0.78994215,  1.02729989,
        0.69760789,  0.70406667,  0.85035124])

In [13]:

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [14]:
no_top_words = 10

display_topics(lda_model, tf_feature_names, no_top_words)

Topic 0:
health good professionals counter say brocolli negatives stress favorite better
Topic 1:
mother brother good baseball brocolli eat lot spends school pressure
Topic 2:
cause driving experts cancer suggest blood skin tension increased stress


In [15]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print (f'DOC {doc_index}:',documents[doc_index])

In [16]:
no_top_words = 5
no_top_documents = 2
display_topics(lda_H, lda_W, tf_feature_names, doc_set, no_top_words, no_top_documents)


Topic 0:
health good professionals counter say
DOC 4: Health professionals say that brocolli is good for your health. It can also counter the negatives of stress.
DOC 3: I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.
Topic 1:
mother brother good baseball brocolli
DOC 0: Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother. It helps you with heart problems.
DOC 1: My mother spends a lot of time driving my brother around to baseball practice. Baseball if one of my brothers favorite activities
Topic 2:
cause driving experts cancer suggest
DOC 2: Some health experts suggest that driving may cause increased tension and blood pressure. Also, driving may cause skin cancer.
DOC 3: I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.


In [17]:
def display_percent_of_each_topci(lda_W, model, feature_names, no_top_words):
    """Print out the percent of each topic, along with the top five words from each topic"""
    for topic_idx, topic in enumerate(lda_W):
        print(f"""Document {topic_idx} has the following percent of each topic 1 =  {topic[0]:.2%},\
        topic 2 = {topic[1]:.2%}, topic 3 = {topic[2]:.2%}""")
        
    print()
    print('Topics')
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    

In [18]:
display_percent_of_each_topci(lda_W,lda_model,tf_feature_names, 10)

Document 0 has the following percent of each topic 1 =  2.91%,        topic 2 = 94.26%, topic 3 = 2.83%
Document 1 has the following percent of each topic 1 =  2.89%,        topic 2 = 94.21%, topic 3 = 2.89%
Document 2 has the following percent of each topic 1 =  2.70%,        topic 2 = 2.65%, topic 3 = 94.66%
Document 3 has the following percent of each topic 1 =  4.16%,        topic 2 = 91.70%, topic 3 = 4.14%
Document 4 has the following percent of each topic 1 =  92.63%,        topic 2 = 3.69%, topic 3 = 3.68%

Topics
Topic 0:
health good professionals counter say brocolli negatives stress favorite better
Topic 1:
mother brother good baseball brocolli eat lot spends school pressure
Topic 2:
cause driving experts cancer suggest blood skin tension increased stress


In [19]:
doc_set

['Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother. It helps you with heart problems.',
 'My mother spends a lot of time driving my brother around to baseball practice. Baseball if one of my brothers favorite activities',
 'Some health experts suggest that driving may cause increased tension and blood pressure. Also, driving may cause skin cancer.',
 'I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.',
 'Health professionals say that brocolli is good for your health. It can also counter the negatives of stress.']

In [65]:
# Test random forest with text - need to conver to categorical representation

In [37]:
df_train = pd.DataFrame({'niche':['software','hardware','software','software'],
                  'employees':[1,4,3,4],'service_level':['gold','silver','gold','bronze'],
                        'revenue':[123,321,14,234],})

In [59]:
df_train = pd.get_dummies( df_train,drop_first=True)

In [60]:
df_train

Unnamed: 0,employees,revenue,niche_hardware,niche_software,service_level_bronze,service_level_gold,service_level_silver
0,1,123,0,1,0,1,0
1,4,321,1,0,0,0,1
2,3,14,0,1,0,1,0
3,4,234,0,1,1,0,0


In [56]:
rf=  RandomForestRegressor(n_estimators=100)

In [62]:
rf.fit(X = df_train.loc[:,('employees', 'niche_hardware','niche_software','service_level_bronze',
                          'service_level_gold','service_level_silver')], y= df_train.revenue)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [64]:
rf.predict(np.array([[1,10,1,0,0,1]]))

array([ 206.58])