# Bag of Words and Latent Semantic Analysis
Luther Richardson
4-28-21

This project uses the the Indiegogo Dataset (https://webrobots.io/indiegogo-dataset/) to download at least 5 files. It takes the “title” for each project in the dataset. 

Then, the article titles are put into a “bag of words” format. Finally, Latent Semantic Analysis (LSA) is used to cluser them into related topics.


In [1]:
#import modules
import os.path
import gensim
from gensim import corpora
from gensim import models
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luther/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
files =["/Users/luther/Desktop/indie/2020-11-Indiegogo.csv",
"/Users/luther/Desktop/indie/2020-11-Indiegogo001.csv",
"/Users/luther/Desktop/indie/2020-11-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2020-12-Indiegogo001.csv",
"/Users/luther/Desktop/indie/2020-12-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2021-02-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2020-11-Indiegogo.csv",
"/Users/luther/Desktop/indie/2020-11-Indiegogo001.csv",
"/Users/luther/Desktop/indie/2020-11-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2020-12-Indiegogo.csv",
"/Users/luther/Desktop/indie/2020-12-Indiegogo001.csv",
"/Users/luther/Desktop/indie/2020-12-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2021-01-Indiegogo.csv",
"/Users/luther/Desktop/indie/2021-01-Indiegogo001.csv",
"/Users/luther/Desktop/indie/2021-01-Indiegogo002.csv",
"/Users/luther/Desktop/indie/2021-02-Indiegogo.csv",
"/Users/luther/Desktop/indie/2021-02-Indiegogo001.csv"]

In [3]:
# import the data
def load_data(path, column):
    data = pd.read_csv(path,
                       low_memory=False)
    data = data[column]
    documents = data.values.tolist()
    print("Documents:",len(data))
    return documents

In [4]:
# import the dataset
documents = []

for file in files:
    data = load_data(file,'title')
    for title in data:
        documents.append(title)

Documents: 32876
Documents: 32856
Documents: 1241
Documents: 32845
Documents: 1495
Documents: 1676
Documents: 32876
Documents: 32856
Documents: 1241
Documents: 32848
Documents: 32845
Documents: 1495
Documents: 32859
Documents: 32854
Documents: 1544
Documents: 32855
Documents: 32855


## How many documents?

In [5]:
len(documents)

370117

In [6]:
print(documents[0:10]) # examples

['The Scoutmother', 'Feet on the Ground Scholarship Fund ', 'Friday the 13th: LOST - A fan film', 'Float with Pierre King!', 'Real Spanish', 'Historical Ecology of Onondaga Lake', 'LANGRIA 6-in-1 Astronaut Memory Foam Travel Pillow', 'Beloved Magazine, The Gather Issue', 'Female Fitness and Bodybuilding Photoshootings', "Birdie's Plus Size Thrift Store"]


## Start creating the bag of words

In [7]:
def clean_doc(item):
    item = re.sub("\W|[0-9]", " ", item) # remove non alpha
    item = re.sub("\s+", " ", item) # remove more than one space
    item =  item.lower() # lowercase 
    return str(item)

tokenized_docs = []

In [8]:
# tokenize
stop_words = set(stopwords.words('english'))
for doc in documents:
    # tokenize removing len less than 3 and over 15 char
    doc = clean_doc(str(doc))
    doc = gensim.utils.simple_preprocess(doc, min_len=3, max_len=15)
    removed_stops = []
    for word in doc:
        if (word not in stop_words):
            removed_stops.append(word)
    tokenized_docs.append(removed_stops)

## Example of tokenized docs

In [9]:
print(tokenized_docs[0:10])

[['scoutmother'], ['feet', 'ground', 'scholarship', 'fund'], ['friday', 'lost', 'fan', 'film'], ['float', 'pierre', 'king'], ['real', 'spanish'], ['historical', 'ecology', 'onondaga', 'lake'], ['langria', 'astronaut', 'memory', 'foam', 'travel', 'pillow'], ['beloved', 'magazine', 'gather', 'issue'], ['female', 'fitness', 'bodybuilding', 'photoshootings'], ['birdie', 'plus', 'size', 'thrift', 'store']]


## Creating the term dictionary out of the tokenized docs
### Then, converting list of documents (corpus) into Document Term Matrix using dictionary

In [10]:
dictionary = corpora.Dictionary(tokenized_docs)
doc_term_matrix = [dictionary.doc2bow(document) for document in tokenized_docs]

## Bag of words representation


In [11]:
print(doc_term_matrix[0:10]) # printing 10 rows for example

[[(0, 1)], [(1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1)], [(14, 1), (15, 1), (16, 1), (17, 1)], [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(24, 1), (25, 1), (26, 1), (27, 1)], [(28, 1), (29, 1), (30, 1), (31, 1)], [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1)]]


## Create the LDA model

tfidf stands for "term frequency–inverse document frequency"

In [12]:
tfidf = models.TfidfModel(doc_term_matrix)

In [13]:
corpus_tfidf = tfidf[doc_term_matrix] # convert doc term matrix

In [14]:
# initialize an LSI transformation
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)  

In [15]:
topics = lsi_model.get_topics()

In [16]:
for item in lsi_model.show_topics():
    print(clean_doc(item[1]))
    print(" ")

 film short help feature project series new world horror first 
 
 help new world project film camera first short series album 
 
 series web help project book camera new world season album 
 
 project series web help new world first album art photography 
 
 world help first project smart bike wireless camera smallest one 
 
 new help album world camera debut first series game save 
 
 game card board book world project new camera album video 
 
 book game comic photography help photo project series web children 
 
 camera album debut bike music fund action first world lens 
 
 bike world album smart one electric life power art first 
 
 art bike music festival electric book album project studio one 
 
 one life bike album music new campaign wireless world smart 
 
 life bike art electric app save camera album new wireless 
 
 campaign one smart life season app home wireless photography world 
 
 one photography music album smart life campaign new debut art 
 


## Find what topics are associated with each document

In [17]:
corpus_lsi = lsi_model[corpus_tfidf] # get the LSI model from the corpus

In [18]:
# split the topics and docs out into arrays
topics = []
docs = []

for doc, as_text in zip(corpus_lsi, documents):
    topics.append(doc) 
    docs.append(as_text)

In [23]:
# find the highest scoring topic for each document
topic_vals = []

for topic in topics:
    max_val = -99999999.0
    i = 0
    top_index = 0
    for value in topic:
        if(np.all(value[1] > max_val)):
            max_val = value 
            top_index = i
        i += 1
    topic_vals.append(top_index)
    #print(max_val)

In [20]:
print(topic_vals[0:10])

[1, 4, 0, 2, 2, 0, 13, 6, 2, 11]


In [21]:
data = {'Topic':topic_vals,
        'Doc':docs}
data = pd.DataFrame(data)

In [22]:
print(data)

        Topic                                                Doc
0           1                                    The Scoutmother
1           4               Feet on the Ground Scholarship Fund 
2           0                 Friday the 13th: LOST - A fan film
3           2                            Float with Pierre King!
4           2                                       Real Spanish
...       ...                                                ...
370112      3  Frog Conservation Research at Tarleton State U...
370113     11  Decisions: A Minor Variation - A Special One-Shot
370114      2                          INFERTILITY TO FERTILITY!
370115      2                             Stay Nerdy Productions
370116      2  The Invincible Osiris Jackson: A Gaymer Web Se...

[370117 rows x 2 columns]


In [33]:
data['Topic'].value_counts(normalize=False)

9    68435
6    58433
2    54824
0    48259
5    46191
8    31271
4    30078
3    24996
1     6717
7      913
Name: Topic, dtype: int64