### LDA Topic Modeling

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text #adding stopwords
from nltk.tokenize import RegexpTokenizer
from gensim import matutils, models
from gensim.corpora import Dictionary
import scipy.sparse
from project_functions import *

In [2]:
df = pd.read_csv('csv/sqr_&_comments.csv')

Clean the comments

In [3]:
cleanText(df, 'comments')

Remove additional stop words

In [4]:
add_stop_words = [
    'school', 'schools', 'ps', 'teacher', 'teachers', 'student', 'students', 'kid',
    'kids', 'th', 'year', 'years', 'grade', 'like', 'good', 'parent', 'parents', 'ms',
    'child', 'children', 'read', 'new', 'dont', 'don\'t', 'just', 'like', 'great', 'high',
    'im'
]

In [5]:
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

Create the document-term matrix

In [6]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words=stop_words, ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(df['comments'])

  'stop_words.' % sorted(inconsistent))


Visualize the document-term matrix

In [7]:
df = pd.DataFrame(text_counts.todense(),columns = cv.get_feature_names())

Convert to term-document matrix

In [8]:
tdm = df.transpose()

Format term-document matrix for gensim

In [9]:
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

Create dictionary of all terms and locations

In [10]:
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

Run LDA Model

In [11]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=75)
lda.print_topics()

[(0,
  '0.008*"york" + 0.006*"principal" + 0.006*"education" + 0.006*"city" + 0.005*"state" + 0.005*"community" + 0.005*"reportsa" + 0.005*"learning" + 0.004*"list" + 0.004*"program"'),
 (1,
  '0.007*"principal" + 0.004*"quest" + 0.003*"son" + 0.003*"education" + 0.003*"bullying" + 0.003*"staff" + 0.002*"city" + 0.002*"administration" + 0.002*"incidents" + 0.002*"community"'),
 (2,
  '0.006*"class" + 0.006*"principal" + 0.006*"know" + 0.006*"really" + 0.005*"middle" + 0.005*"time" + 0.004*"best" + 0.004*"staff" + 0.004*"say" + 0.004*"help"'),
 (3,
  '0.009*"program" + 0.008*"principal" + 0.005*"class" + 0.005*"learning" + 0.005*"community" + 0.005*"son" + 0.005*"staff" + 0.004*"education" + 0.004*"language" + 0.004*"love"')]