In [16]:
import logging
import glob
import argparse
from gensim import models
from gensim import matutils
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from nltk import bigrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from gensim import corpora
from stop_words import get_stop_words
from collections import Counter
from itertools import product, compress
from collections import defaultdict
import math
import pandas as pd

## Topic Modeling

Preprocess the sample review text data and apply an LDA model to extract topics.

In [15]:
def preprocess(line):
    """Stem each whitespace-separated token of ``line`` with the English
    Snowball stemmer and return the stems joined by single spaces."""
    tokens = line.split()
    snowball = SnowballStemmer("english")
    return ' '.join(snowball.stem(token) for token in tokens)
    

In [83]:
def topics(text, output_file, num_topic, num_display_word, random_state=None):
    """Fit an LDA topic model on TF-IDF features of ``text`` and write the topics to CSV.

    Parameters
    ----------
    text : iterable of str
        Raw documents, one string per document.
    output_file : str
        Path of the CSV file to write.
    num_topic : int
        Number of LDA topics to fit.
    num_display_word : int
        Number of top-weighted terms written per topic.
    random_state : int or None, optional
        Seed forwarded to gensim's ``LdaModel``. Pass an int to make the
        fitted topics reproducible; ``None`` (the default) leaves gensim's
        own seeding in place.

    Output
    ------
    A CSV with columns ``id`` and ``value``: a ``Topic`` header row, one
    ``Topic.Topic<i>`` row per topic (empty ``value``), and one
    ``Topic.Topic<i>.<term>`` row per term with its weight in ``value``.
    """
    vectorizer = TfidfVectorizer(max_df=0.9,
                                 min_df=0.1,
                                 stop_words='english',
                                 preprocessor=preprocess,
                                 use_idf=True)

    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("n_samples: %d, n_features: %d" % X.shape)

    # Mapping from feature id to actual word. `get_feature_names` was removed
    # from recent scikit-learn releases in favour of `get_feature_names_out`;
    # prefer the new name and fall back to the old one on older installs.
    get_names = (getattr(vectorizer, "get_feature_names_out", None)
                 or vectorizer.get_feature_names)
    id2words = dict(enumerate(get_names()))

    print("Applying topic modeling, using LDA")
    print(str(num_topic) + " topics")
    # Rows of X are documents, hence documents_columns=False.
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus,
                                   num_topics=num_topic,
                                   id2word=id2words,
                                   random_state=random_state)

    # Every row is an explicit (id, value) pair; rows without a weight carry None,
    # which is written to the CSV as an empty cell.
    rows = [('Topic', None)]
    shown = lda.show_topics(num_topics=num_topic,
                            num_words=num_display_word,
                            formatted=False)
    for topic_id, terms in shown:
        prefix = "Topic.Topic" + str(topic_id)
        rows.append((prefix, None))
        rows.extend((prefix + "." + term, str(weight)) for term, weight in terms)

    # The file imports pandas as `pd`; the bare name `pandas` is not defined.
    df = pd.DataFrame(rows, columns=['id', 'value'])
    print("writing topics to file:", output_file)
    df.to_csv(output_file, index=False)
    

In [12]:
# `topics` takes an iterable of documents, not a file path, so load the
# file first and treat each line as one document.
with open('../listDes.txt', 'r', encoding='utf-8') as f:
    list_descriptions = f.readlines()
topics(list_descriptions, 'topics.csv', 10, 15)

Extracting features from the training dataset using a sparse vectorizer
n_samples: 3818, n_features: 68
Applying topic modeling, using LDA
10 topics
writing topics to file: topics.csv


In [77]:
# Keyword table; the loop in the next cell reads its `keyword` and `listid` columns.
# NOTE(review): this path is rooted at 'data/' while the listings path below is
# rooted at '../data/' — confirm both resolve from the notebook's working directory.
keywordToListId = pd.read_csv('data/keywordToListId.csv')
# Listings table; the loop in the next cell reads its `space` column.
des = pd.read_csv("../data/listings.csv")

In [84]:
# Fit one topic model per keyword, using the `space` text of the listings
# associated with that keyword, and write each result to its own CSV.
for _, row in keywordToListId.iterrows():
    # `listid` holds a bracketed, comma-separated list of integers in string
    # form; parse it tolerantly so "[]" or irregular spacing does not crash int().
    ids = [int(x) for x in row.listid.strip('[]').split(',') if x.strip()]
    if not ids:
        print("skipping keyword with no listing ids:", row.keyword)
        continue
    # `DataFrame.ix` has been removed from pandas; `.loc` is the label-based
    # equivalent for the integer index that `read_csv` creates by default.
    text = des.loc[ids, 'space']
    topics(text, 'data/topicForKeyword/' + row.keyword + '.csv', 10, 15)

Extracting features from the training dataset using a sparse vectorizer
n_samples: 238, n_features: 125
Applying topic modeling, using LDA
10 topics
writing topics to file: data/topicForKeyword/capitol hill.csv
Extracting features from the training dataset using a sparse vectorizer
n_samples: 209, n_features: 146
Applying topic modeling, using LDA
10 topics
writing topics to file: data/topicForKeyword/downtown seattle.csv
Extracting features from the training dataset using a sparse vectorizer
n_samples: 160, n_features: 159
Applying topic modeling, using LDA
10 topics
writing topics to file: data/topicForKeyword/space needle.csv
Extracting features from the training dataset using a sparse vectorizer
n_samples: 132, n_features: 173
Applying topic modeling, using LDA
10 topics
writing topics to file: data/topicForKeyword/wireless internet.csv
Extracting features from the training dataset using a sparse vectorizer
n_samples: 124, n_features: 170
Applying topic modeling, using LDA
10 topic