# Topic Modeling

By Amy Weng

Adapted from Heidi Smith's File (Topic_Model_Attempt_1.ipynb)

In [None]:
import os
import re

import gensim
import gensim.corpora as corpora
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [None]:
# Root folder of the project materials. The trailing slash is kept so that
# sub-paths can be appended with plain string concatenation.
directory = '/home/rapiduser/Materials/'
# Folder for saved results, located directly under the materials root.
results = directory + 'results'

In [None]:
# Remove stopwords/preprocess
# Stop-word list: NLTK's English stop words followed by corpus-specific
# additions (archaic function words, very common verbs, numerals, and short
# fragments such as 'thoe', 'ome', 'uch' -- presumably OCR artefacts; confirm).
stop_words = [
    *stopwords.words('english'),
    'thus', 'thereof', 'thence', 'thee', 'therein',
    'wherein', 'whereby', 'whereas', 'also', 'us', 'upon',
    'would', 'within', 'indeed', 'become', 'viz', 'per', 'anno',
    'whilst', 'thoe', 'ome', 'uch', 'said', 'shall', 'hath',
    'may', 'made', 'much', 'one', 'mr', 'how', 'sun', 'like', 'full', 'one',
    'two', 'three', 'four', 'five', 'day', 'say', 'thou', 'make', 'men', 'man',
    'sam', 'tom', 'done', 'do', 'have', 'well', 'know', 'heard', 'hear',
    'saying', 'come', 'never', 'time', 'think', 'came', 'till', 'might',
    'could', 'begin', 'began', 'took', 'went', 'last', 'matter', 'seeing',
    'go', 'many', 'few', 'see', 'take', 'found', 'without', 'little', 'long',
    'put', 'brought', 'bring', 'another', 'th', 'aforesaid', 'old', 'son',
    'tell',
]

def preprocess(data):
    """Lazily tokenise every item of ``data``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``; the resulting token list is yielded. Nothing is
    computed until the returned generator is consumed.
    """
    yield from (gensim.utils.simple_preprocess(str(item)) for item in data)
        
def remove_stopwords(data):
    """Tokenise each document and drop every token found in ``stop_words``.

    Each item of ``data`` is converted with ``str()`` and tokenised with
    ``simple_preprocess`` before filtering, so items may be raw strings or
    already-tokenised lists.

    Returns a list with one list of kept tokens per input document, in the
    original order.
    """
    # Membership tests against a list scan the whole list for every token;
    # a set makes each lookup constant-time. It is rebuilt on every call so
    # that later changes to the module-level ``stop_words`` are honoured.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in data
    ]

In [None]:
# lexicons
# Each lexicon is a '|'-separated alternation of terms for one category:
#   publica - economy, trade and politics
#   religio - religion and morality
#   medica  - body, disease and medicine
publica = "economy|economic|money|monies|bullion|bullionist|mercantile|fiscal|taxation|exchange|corporation|corporate|company|merchant|joint-stock|merchants|currency|consumption|consume|consuming|consumables|trade|traffic|traffique|commerce|commercial|price|prices|spending|east indies|east india company|east-india company|eat-india company|east-india-company|monopoly|monopolization|monopolies|monopolium|monopolion|monopolie|monopolist|monopolizer|monopolize|monopolizes|monopolye|monopolyes|debt|credit|debtor|creditor|decoctor|bank|banks|usury|interest rate|interest|interest rates|importation|exportation|coin|employment|austerity|goods|treasure|commodities|income|commodity|revenue|land|profitable|unprofitable|industry|work|usurer|estate|property|substance|consumer|free trade|tobacco|coffee|chocolate|tea|interet|india|stock|dutch|ships|factory|netherlands|king|parliament|england|english|indian|indies|silk|spices|nation|adventurers|bantam|cinnamon|pepper|ginger|gold|silver|copper|kingdom|commonwealth|politic|political|body politic|government|prince|war|ruler|queen|extortion|oppression|restoration|revolution|constitution|navy|military|army|strength|power|england|britain|ireland|netherlands|scotland|spanish|germany|holy roman empire|mughal|country|parliament|crisis|commons|lords|lord|crisis|national|governor|public|publicke|low countries|controversy|controversies|authority|empire|bill|bills|treatise|treatises|civilian|protection|pirates|pirate|civil war|massacre|indonesia|india|china|argier|tunis|kingdom|provincial|scandalous|scandal|majesty|subject|christiandom|enemy|turkish|ally|charles|james|william|mary|tory|whig|tories|whigs|stuart"
# The literal was previously broken across two physical lines in the middle of
# the word "union", leaving an unterminated string. It is now two adjacent
# literals (joined by the parser) split on a term boundary.
religio = (
    'Christian|charity|luxury|avarice|greed|vanity|pride|sloth|gluttony|simony|fraud|incest|theft|deceit|lucrum|cessans|john|stations|station|velvet|damask|sumptuary|religion|popery|papist|abnegatio|self-denial|mendicancy|sin|root|evil|render|caesar|heresy|prodigality|prodigal|truth|hypocrisy|green|righteous|righteousness|thrive|wise|endure|crown|generation|secure|trust|security|fortune|sun|radiance|moon|splendor|rich|arrogant|tower|pleasant|palace|deceive|tempt|brother|sister|eve|adam|samson|delilah|david|bathsheba|lust|lustful|lustfully|sin|devil|lechery|whore|cuckold|unnatural|natural|obey|disobey|deceive|cheat|cheating|modesty|sermon|christ|heaven|godly|saint|altar|candles|canon|mass|sacrament|cross|pastor|ritual|cleansing|church|salvation|savior|wickedness|depravity|jesus|lord|witness|saved|communion|wine|bread|faith|anathema|anointing|apostle|apocalypse|atonement|baptism|holy|sacred|spirit|bishop|born-again|calvinist|evangelical|protestant|catholic|covenant|conviction|creed|demon|deacon|disciple|disciples|satan|satanic|fellowship|gospel|hallelujah|hell|indulgence|indulgent|justify|justification|lucifer|messiah|manifestation|god|zion|ordained|ordinance|ordinate|congregation|parish|parishioner|prophet|repent|sanctuary|redeem|redeemed|sanctified|sanctify|second|coming|testament|tribulation|trinity|words|bible|word|grace|absolution|adultery|obedience|anoint|antichrist|archangel|armageddon|ascension|atone|vision|biblical|blasphemy|bless|blessing|blessed|chalice|chapel|chaplain|cherub|condemnation|condemn|confession|confess|conscience|consecration|contrite|contrition|damnation|damned|damn|day|divine|doctrine|ecumenical|epistle|eternal|evangelicalism|excommunication|exile|resurrection|forgive|forgiveness|freedom|fundamental|gentile|revelation|heresy|Jehovah|judgment|judaism|supper|liturgy|ministry|missionary|mission|ordination|orthodox|pagan|pagans|paganism|passover|papacy|pope|christianity|abraham|penance|genesis|exodus|leviticus|numbers|deuteronomy|moses|'
    'union|pray|prayer|predestination|prophecy|psalm|psalms|providence|purgatory|rapture|reconciliation|reconcile|redemption|reform|reformed|reincarnation|reincarnate|resurrect|roman|rome|sabbath|sacrifice|sacrifices|satanism|save|saved|sinful|nature|creation|create|death|offering|offerings|tongues|soul|commandments|transgression|universal|moral|venial|virgin|perfect|vulgate|worship|scripture|scriptures|priesthood|ten|twelve|annihilate|proverb|proverbs|matthew|mark|luke|john|peter|john|james|samuel|timothy|isaiah|hebrew|hebrews|job|slave|slaves|human|reap|holiness|parable|parables|knowledge|samaritan|sown|abundance|persecution|deceitful|reject|rejected|integrity|rejoice|rejoiced|wealth|wicked|destruction|grievance|desire|desires|entice|enticed|unfaithful|tenant|tenants|guilt|guilty|splendor|curse|almighty|defile|defiled|falsehood|false|lie|lies|confront|confronts|contempt'
)
medica = 'corruption|consumption|vein|circulation|circulate|body|corrupt|consume|wasting|waste|blood|physician|decay|canker|cancer|disease|illness|remedy|remedies|cure|sickness|hepatitis|fever|spirit|brain|mind|clotting|clots|zodiac|wintergreen|herbs|herb|vital|therapeutic|degeneration|degenerate|plague|smells|surgeon|surgeons|putrid|bad|tetrid|breathing|wholesome|healthy|unhealthy|sane|insane|nervous|death|deceased|dead|dying|sick|languish|faculties|enfeeblement|drinks|tuberculosis|plague|hysteria|corpora|corpus|malaria|medical|medicine|miasma|sanguine|constitution|bile|hot|cold|dry|moist|vita|sana|bezoar|asthma|leech|leeches|phlegm|lung|sores|fog|smoke|diagnosis|prognosis|fiber|atrophy|morbid|mortal|mortality|nerves|inanition|defect|distemper|swelling|upset|stomach|cough|exercise|unwholesome|evacuation|fatal|fatality|vessels|hemorrhage|bleeding|bleed|melancholy|diabetes|asthma|vomit|opiate|opium|ulcer|envy|jealousy|spoil|liver|vein|supple|heart|mouth|cured|pox|putrid|decay|dissolution|contamination|corruptionem|corruptio|spoiling|destroy|perversion|vitiation|vitiare|impairment|vitium|vice|perverto'

# Compile once; the patterns are reused for every text that is classified.
# IGNORECASE is needed because a few entries ('Christian', 'Jehovah') are
# capitalised while the topic words they are matched against come from
# simple_preprocess, which lower-cases its tokens.
# NOTE(review): the patterns are searched without word boundaries, so short
# entries such as 'sin', 'tea' or 'war' also hit inside longer words
# ('business', 'steal', 'toward'). Confirm whether that is intended before
# tightening the match, since it changes how texts are classified.
pub = re.compile(publica, re.IGNORECASE)
rel = re.compile(religio, re.IGNORECASE)
med = re.compile(medica, re.IGNORECASE)

In [None]:
def model(df, num_topics=1, num_words=10, random_state=None):
    """Fit an LDA model on ``df.text`` and return the first topic's top words.

    Parameters
    ----------
    df : DataFrame with a ``text`` column; every row is one document.
    num_topics : number of topics passed to ``LdaMulticore`` (default 1).
    num_words : how many top words of the topic to return (default 10).
    random_state : forwarded to ``LdaMulticore`` as ``random_state``; the
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    str
        The top words of the first topic returned by ``show_topics``, joined
        by single spaces. An empty string is returned when no tokens survive
        preprocessing or when no topic is produced.
    """
    data = df.text.values.tolist()
    data = list(preprocess(data))
    data = remove_stopwords(data)

    id2word = corpora.Dictionary(data)
    # If every token was filtered out there is no vocabulary to train on
    # (gensim is expected to raise on an empty vocabulary -- see the LdaModel
    # docs). Return an empty topic string so the caller can still classify
    # the row instead of aborting the whole run.
    if len(id2word) == 0:
        return ''

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=random_state)

    topics = lda_model.show_topics(formatted=False, num_words=num_words)
    if not topics:
        return ''
    # Only the first topic is used, as before; each entry of its word list is
    # a (word, weight) pair and only the word is kept.
    _, top_words = topics[0]
    return ' '.join(word for word, _ in top_words)

def sortByTopics(topics):
    """Classify a topic-word string by the lexicon that matches it most.

    Counts the matches of the ``pub``, ``rel`` and ``med`` patterns in
    ``topics`` and returns 'publica', 'religio' or 'medica' for the pattern
    with the most matches. Ties are resolved in that order. 'altera' is
    returned when none of the patterns matches at all.
    """
    # Listed in tie-break order: max() keeps the first entry among equals.
    hits = (
        ('publica', len(re.findall(pub, topics))),
        ('religio', len(re.findall(rel, topics))),
        ('medica', len(re.findall(med, topics))),
    )
    label, count = max(hits, key=lambda pair: pair[1])
    return label if count else 'altera'


In [None]:
f_name = 'eic_uncensored.csv'
folder = 'Texts/'
myCSV = directory + folder + f_name

# Read in csv
readFile = pd.read_csv(myCSV)

# Classify each text (row) on its own: fit a topic model on that single row,
# then map its top words to a lexicon category.
labels = pd.Series(
    [sortByTopics(model(readFile.iloc[i:i + 1])) for i in range(len(readFile))],
    index=readFile.index,
    dtype=object,
)

# Split the rows by category with boolean masks. This replaces the repeated
# DataFrame.append calls, which no longer exist in pandas 2.0+. The selected
# rows keep their original index and the CSV's own columns.
df_p = readFile[labels == 'publica']
df_r = readFile[labels == 'religio']
df_m = readFile[labels == 'medica']
# Everything that did not land in one of the three named categories.
df_a = readFile[~labels.isin(['publica', 'religio', 'medica'])]

# Write each non-empty category to its own folder and report its size.
for category, frame in (('publica', df_p),
                        ('religio', df_r),
                        ('medica', df_m),
                        ('altera', df_a)):
    if frame.empty:
        continue
    out_dir = directory + 'topic model/' + category + '/'
    # Create the folder on demand so a missing directory does not discard
    # the classification work at the very last step.
    os.makedirs(out_dir, exist_ok=True)
    frame.to_csv(out_dir + f_name)
    # The 'altera' count previously printed len(df_r) by mistake.
    print(category.capitalize() + ": ", len(frame))


In [None]:
# import pyLDAvis
# import pyLDAvis.gensim_models
# # Visualize the topics
# pyLDAvis.enable_notebook()
# LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# pyLDAvis.save_html(LDAvis_prepared, results+'/post-restoration.html')
# LDAvis_prepared
