# Topic Modeling on User Bios

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk import word_tokenize, tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
# from sklearn.cluster import KMeans
import string

In [2]:
df_peeps = pd.read_json('radiate_peeps.json')

In [20]:
df_peeps.set_index('pk', inplace=True)

In [21]:
df_peeps.head()

Unnamed: 0_level_0,android_api_level,birthday,country,crush_ratio,gender,has_instagram,ios_system_version,last_location,last_login,matches,peep_bubbles__count,tagline,bio_topic_0,bio_topic_1,bio_topic_2,bio_topic_3,bio_topic_4
pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
263742,23.0,1997-09-21,US,0.772937,f,False,10.3.2,"[-117.1880506, 33.7205858]",2017-12-25T21:22:11.543180+00:00,[{u'created_on': u'2018-01-24T09:47:08.339733+...,19,🤷🏼‍♀️,8e-05,0.0,0.0,0.000267,7.5e-05
292558,,1993-03-30,US,0.674425,f,False,10.3.1,"[-84.2233889662, 30.5663066025]",2018-01-24T21:23:44.066424+00:00,[{u'created_on': u'2018-01-12T04:58:52.210761+...,61,I have discount tickets for most festivals:\n\...,0.004704,0.00015,0.001322,0.000498,0.001114
312457,,1990-09-13,US,0.173913,m,False,10.3.3,"[-88.1918462698, 41.5387220045]",2017-10-06T01:28:30.598312+00:00,[{u'created_on': u'2017-09-22T01:11:16.860239+...,4,,0.0,0.0,0.0,0.0,0.0
238027,,1988-05-18,,0.183333,m,False,,"[-122.661078544, 45.5122568272]",2016-07-20T23:57:13.011737+00:00,[{u'created_on': u'2017-05-16T15:52:53.757064+...,3,"Tune in to my radio show ""Learning to Grow"" wi...",0.009298,0.000851,0.00022,0.0,0.000813
326086,,1997-03-02,,0.75,f,False,10.3.3,,2017-08-25T16:06:45.288857+00:00,[{u'created_on': u'2017-08-25T16:03:49.042583+...,3,,0.0,0.0,0.0,0.0,0.0


In [23]:
docs = df_peeps['tagline']

In [25]:
docs.head(10)

pk
263742                                              🤷🏼‍♀️
292558    I have discount tickets for most festivals:\n\...
312457                                                     
238027    Tune in to my radio show "Learning to Grow" wi...
326086                                                     
310246                                                     
260988                                                     
244211                                                 🍄🐢
267361    Always looking for new people to get down with...
292967                       Looking for more rage friends!
Name: tagline, dtype: object

In [5]:
def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for token in tokens:
        if token not in string.punctuation and token[0]!="'":
            stems.append(PorterStemmer().stem(token))
    return stems

In [12]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(1,2))
dtm = vectorizer.fit_transform(docs)
feature_names = vectorizer.get_feature_names()

In [13]:
num_topics = 5

nmf = NMF(n_components=num_topics)
nmf.fit(dtm)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [14]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        results = ", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        print "Topic %d:" % (topic_idx+1), results
        
display_topics(nmf, feature_names, 5)

Topic 1: new, peopl, love, meet, meet new
Topic 2: good, vibe, good vibe, time, good time
Topic 3: let, rage, let rage, weird, let weird
Topic 4: rave, bae, rave bae, look, look rave
Topic 5: basshead, headbang, 👽, ig, basshead 👽


Above are the top 5 words/emojis/bigrams for the top 5 bio topics - we can see that the first is clearly about meeting new people, the second is about good vibes, and so on.

In [15]:
W = nmf.transform(dtm)

In [16]:
for i in range(num_topics):
    col_name = 'bio_topic_'+str(i)
    df_peeps[col_name] = W[:,i]