<a href="https://colab.research.google.com/github/kelseymour/NEA-Grant-Topic-Modeling/blob/main/NEA_code_scratchpad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


NEA Grants Text Mining and Topic Modeling


Set Up

In [2]:
# !pip install pyldavis
# !pip install lbl2vec
# !pip install bertopic
# !pip install --upgrade pandas
# !python -m spacy download en_core_web_sm

In [3]:
import pandas as pd
pd.options.plotting.backend = "plotly"
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import re
import string
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import strip_tags
from gensim.models.doc2vec import TaggedDocument
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
from transformers import pipeline
from bertopic import BERTopic
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
  if (distutils.version.LooseVersion(tf.__version__) <
  distutils.version.LooseVersion(required_tensorflow_version)):


Cleaning Text Column



In [4]:
def remove_punctuations(text):
    for char in string.punctuation:
        text = text.replace(char, '')
    return text
def lemmatize_it(sent):
    empty = []
    for word, tag in pos_tag(word_tokenize(sent)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
            empty.append(lemma)
        else:
            lemma = lemmatizer.lemmatize(word, wntag)
            empty.append(lemma)
    return ' '.join(empty)

In [75]:
df = pd.read_excel("Orchestra_NEA_Grants.xlsx")
df = df.dropna(how="all")
df = df.reset_index()
df["Grant From Date"] = pd.to_datetime(df["Grant From Date"])
df["Grant To Date"] = pd.to_datetime(df["Grant To Date"])
df["grant_length"] = df["Grant To Date"] - df["Grant From Date"]
df["Fiscal Year"] = df["Fiscal Year"].astype(int)
df = df[df["Fiscal Year"] >= 2013]
cols = df.select_dtypes(['object']).columns
df[cols] = df[cols].apply(lambda x: x.str.strip())
df = df.replace('\n',' ', regex=True)
df = df.loc[:, ~df.columns.isin(["Popular Name", "Application Number",
                                 "Zip", "Congressional District",
                                 "Grant From Date", "Grant To Date"])]

df.tail()

Unnamed: 0,index,Organization Name,Discipline / Field,Category,City,State,Project Description,Fiscal Year,Grant Amount,grant_length
2291,2292,Youth Orchestras of San Antonio,"Coronavirus Aid, Relief, and Economic Security...","Coronavirus Aid, Relief, and Economic Security...",SAN ANTONIO,TX,To support personnel costs in response to the ...,2020,50000.0,61 days
2292,2293,Youth Orchestras of San Antonio,Music,Art Works,SAN ANTONIO,TX,To support the annual YOSA Invitational music ...,2019,10000.0,180 days
2293,2294,Youth Orchestras of San Antonio,Challenge America,Challenge America,SAN ANTONIO,TX,To support Mozart at the Opera. Soprano Shana ...,2017,10000.0,30 days
2294,2295,Youth Orchestras of San Antonio,Challenge America,Challenge America,SAN ANTONIO,TX,"To support the production of Carl Orff's ""Carm...",2015,10000.0,180 days
2295,2296,Youth Orchestras of San Antonio,Challenge America,Challenge America,SAN ANTONIO,TX,To support the commission and premiere of a ne...,2013,10000.0,152 days


First look at some visualizations

In [55]:
df["Grant Amount"].hist(bins=10)

Plots of the Categories already labeled by the NEA. These are either not descriptive enough, or relate to specific programs that can contain a broad range of topics.

In [56]:
fig = df["Discipline / Field"].value_counts().plot(kind="bar",
                                                   title="Grants Awarded by NEA Discipline/Field",
                                                   width=1000, height=700)
fig.write_html("count_of_grants_by_discipine.html")
fig.show()


In [57]:
fig = df["Category"].value_counts().plot(kind="bar",
                                         title="Grants Awarded by NEA Category",
                                         width=1000, height=900)
fig.write_html("count_of_grants_by_category.html")
fig.show()

In [58]:
fig = df.plot(kind="box",
              x=df["Discipline / Field"],
              y=df["Grant Amount"],
              title="Amount Awarded by NEA Discipline/Field",
              width=1100, height=900)
fig.write_html("distribution_of_money_by_discipine.html")
fig.show()

In [59]:
fig = df.plot(kind="box",
              x=df["Category"],
              y=df["Grant Amount"],
              title="Amount Awarded by NEA Category",
              width=1100, height=900)
fig.write_html("distribution_of_money_by_category.html")
fig.show()

LDA: First attempt to get better labels based on project descriptions

In [60]:
data = df["Project Description"].values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
#print(data[0])

In [61]:
stop = stopwords.words('english')
extend = ['symphony', 'orchestra', 'philharmonic', 'philharmonia', 'de', 'el', 'annenberg', 'new', 'york',
          'paso', 'orleans', 'chicago', 'tanglewood', 'sarasota', 'berekely', 'cooper', 'st', 'luke',
          'kalamazoo', 'albuquerque', 'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut',
          'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana',
          'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska',
          'nevada', 'hampshire', 'jersey', 'mexico', 'north', 'carolina', 'dakota', 'ohio', 'oklahoma',
          'oregon', 'pennsylvania', 'rhode', 'island', 'south', 'tennessee', 'texas',
          'utah', 'vermont', 'virginia', 'washington', 'west', 'wisconsin', 'wyoming', 'with', 'berkeley', 'plymouth',
          'tuscon', 'mcgegan', 'long', 'beach', 'van', 'wilkinsburg', 'bend', 'fort', 'grant', 'people', 'anchorage',
          'concert', 'performance', 'purpose', 'support', 'project', 'columbus', 'john', '20042005', 'city', 'seattle',
          'san', 'francisco', 'los', 'angeles', 'charlotte', 'sonia', 'marie', 'ace', 'park', 'musician', 'ludwig', 'jr', 'philadelphia',
          'lido', 'hampton', 'indianapolis', 'tucson', 'minneapolis', 'nicollet', 'elliot']
stop_extended = stop + extend

In [62]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))#deacc=True removes punctuations
data_words = list(sent_to_words(data))
#print(data_words[:1])

In [63]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
#print(trigram_mod[bigram_mod[data_words[0]]])

In [64]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_extended] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [65]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Form Bigrams
#data_words_bigrams = make_bigrams(data_lemmatized)

print(data_lemmatized[:1])



[['purpose', 'support', 'series', 'community', 'outreach', 'concert', 'additional', 'project', 'description', 'provide', 'concert', 'community', 'venue', 'such', 'senior', 'care', 'health', 'care', 'facility', 'programming', 'present', 'partnership', 'community', 'organization', 'such', 'development', 'center', 'intend', 'beneficiary', 'resident', 'aiken', 'access', 'performance', 'include', 'individual', 'disability', 'intend', 'outcome', 'beneficiary', 'engage', 'art']]


In [66]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
#print(corpus[:1])

In [67]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [68]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.026*"age" + 0.021*"low" + 0.019*"quartet" + 0.016*"income" + '
  '0.016*"feature" + 0.014*"performance" + 0.014*"live" + 0.013*"concert" + '
  '0.011*"competition" + 0.010*"soloist"'),
 (1,
  '0.063*"school" + 0.059*"music" + 0.058*"student" + 0.046*"program" + '
  '0.021*"support" + 0.017*"youth" + 0.017*"provide" + 0.014*"high" + '
  '0.013*"musician" + 0.013*"string"'),
 (2,
  '0.052*"work" + 0.041*"composer" + 0.032*"support" + 0.028*"project" + '
  '0.027*"new" + 0.025*"music" + 0.021*"include" + 0.019*"performance" + '
  '0.017*"festival" + 0.017*"feature"'),
 (3,
  '0.109*"covid" + 0.108*"pandemic" + 0.105*"cost" + 0.088*"support" + '
  '0.085*"response" + 0.061*"personnel" + 0.043*"general" + 0.032*"operating" '
  '+ 0.006*"model" + 0.005*"collect"'),
 (4,
  '0.029*"orchestra" + 0.021*"location" + 0.019*"more" + 0.018*"development" + '
  '0.017*"train" + 0.015*"professional" + 0.014*"therapy" + 0.013*"expand" + '
  '0.012*"therapist" + 0.009*"enable"'),
 (5,
  '0.052*

In [69]:
# Compute Perplexity (a measure of how good the model is; lower the better)
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (measure of human readability, higher better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.345036048343259

Coherence Score:  0.4525256643668202


In [70]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [71]:
pyLDAvis.save_html(vis, 'lda_7_topics.html')

In [72]:
 all_topics = lda_model.get_document_topics(corpus, minimum_probability=0.0)
 all_topics_csr = gensim.matutils.corpus2csc(all_topics)
 all_topics_numpy = all_topics_csr.T.toarray()
 all_topics_df = pd.DataFrame(all_topics_numpy)

In [73]:
all_topics_df.iloc[:3]

Unnamed: 0,0,1,2,3,4,5,6
0,0.004777,0.052708,0.06184,0.001876,0.001832,0.872656,0.00431
1,0.003897,0.11116,0.545329,0.020507,0.001413,0.261568,0.056126
2,0.006022,0.1118,0.08486,0.002361,0.059179,0.730367,0.005412


In [76]:
print(len(df))
print(len(all_topics_df))

947
947


In [78]:
df_lda = pd.merge(df, all_topics_df, left_index=True, right_index=True)

In [98]:
#df_lda = pd.concat([df, all_topics_df], axis=1)
df_lda = df_lda.rename(columns={0: 'lda_7', 1: 'lda_1', 2: 'lda_2', 3: 'lda_3', 4: 'lda_4', 5: 'lda_5', 6: 'lda_6'})
df_lda['LDA_label_7_topics'] = df_lda[["lda_7", "lda_1", "lda_2", "lda_3", "lda_4", "lda_5", "lda_6"]].idxmax(axis=1)
df_lda.tail()

Unnamed: 0.1,Unnamed: 0,index,Organization Name,Discipline / Field,Category,City,State,Project Description,Fiscal Year,Grant Amount,...,lda_7,lda_1,lda_2,lda_3,lda_4,lda_5,lda_6,lemma_list,processed_text,LDA_label_7_topics
398,933,934,Grant Park Orchestral Association,Music,Art Works,CHICAGO,IL,To support the Grant Park Music Festival. The ...,2017,30000.0,...,0.336753,0.413541,0.048189,0.005202,0.005081,0.179267,0.011967,"['support', 'young', 'musician', 'program', 'c...",music festival free summer festival take place...,lda_1
399,934,935,Grant Park Orchestral Association,Music,Art Works,CHICAGO,IL,To support the Grant Park Music Festival. The ...,2016,34000.0,...,0.004797,0.430956,0.195608,0.001874,0.001831,0.278761,0.086172,"['support', 'performance', 'music', 'education...",music festival free summer festival take place...,lda_1
400,935,936,Grant Park Orchestral Association,Music,Art Works,CHICAGO,IL,To support the Grant Park Music Festival. The ...,2015,30000.0,...,0.005696,0.563277,0.100505,0.002231,0.002179,0.237702,0.088409,"['support', 'annual', 'foosa', 'summer', 'fest...",music festival free summer festival take place...,lda_1
401,936,937,Grant Park Orchestral Association,Music,Art Works,CHICAGO,IL,To support the Grant Park Music Festival. The ...,2014,30000.0,...,0.006594,0.137145,0.503053,0.034942,0.002446,0.310075,0.005746,"['support', 'commissioning', 'come', 'home', '...",music festival free summer festival take place...,lda_2
402,946,947,Greater Connecticut Youth Orchestras Inc.,Challenge America,Challenge America,FAIRFIELD,CT,Purpose: To support youth orchestra activities...,2023,10000.0,...,0.073683,0.345543,0.199139,0.001793,0.001751,0.060114,0.317977,"['support', 'commission', 'premiere', 'new', '...",youth activity underserved student additional ...,lda_1


In [26]:
#df_lda['LDA_label'] = df_lda[[1, 2, 3, 4, 5, 6, 7]].idxmax(axis=1)

In [99]:
df_lda["lemma_list"] = pd.Series(data_lemmatized)
#df_lda.head()

In [100]:
df_lda.to_csv("df_lda.csv")

In [101]:
df_lda = pd.read_csv("df_lda.csv")

In [102]:
keyword_list = ["concert series", "composer residency", "composer commission",
                "festival", "conference", "operational cost", "competition",
                "job development", "covid relief", "opera", "cultural diversity",
                "community outreach", "student education", "masterclass teaching",
                "chamber music", "tour", "recording", "jazz"]



BART: Second attempt to get better labels with one-shot classification

In [103]:
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [104]:
sequence_to_classify = "Purpose: To support a composer residency by Brian Nabors including commissions, performances, and educational activities. Additional Project Description: Plans include the completion of two works for orchestra and performances of these and other works by the composer. Nabors will lead master classes in local schools and colleges. Intended Beneficiaries: Programs will benefit the composer-in-residence as well as general audiences in Birmingham, Alabama, and area public school students. Intended Outcome: Artistic activities and traditions are supported to strengthen the nation's cultural infrastructure."
candidate_labels = keyword_list
result = classifier(sequence_to_classify, candidate_labels)

In [105]:
#sanity check
print(result['labels'])
print(result['labels'][0])
print(result['labels'][1])
print(result['labels'][2])
print(result['scores'])
print(result['scores'][0])
print(result['scores'][1])
print(result['scores'][2])

['composer residency', 'student education', 'masterclass teaching', 'composer commission', 'community outreach', 'concert series', 'cultural diversity', 'tour', 'recording', 'covid relief', 'operational cost', 'conference', 'festival', 'chamber music', 'job development', 'competition', 'jazz', 'opera']
composer residency
student education
masterclass teaching
[0.5183343887329102, 0.1170012578368187, 0.1134801134467125, 0.0954497903585434, 0.0757998526096344, 0.01959664560854435, 0.01060319971293211, 0.01045725867152214, 0.009680742397904396, 0.008682592771947384, 0.00487194349989295, 0.0037699646782130003, 0.003521531354635954, 0.0020360550843179226, 0.0018643977819010615, 0.0018038754351437092, 0.0015917744021862745, 0.0014546791790053248]
0.5183343887329102
0.1170012578368187
0.1134801134467125


In [106]:
def get_bart_labels(text_sample):
  candidate_labels = ["concert series", "composer residency", "composer commission",
                "festival", "conference", "operational cost", "competition",
                "job development", "covid relief", "opera", "cultural diversity",
                "community outreach", "student education", "masterclass teaching",
                "chamber music", "tour", "recording", "jazz"]
  sequence_to_classify = text_sample
  result = classifier(sequence_to_classify, candidate_labels)
  return result['labels'][0], result['labels'][1], result['labels'][2], result['scores'][0], result['scores'][1], result['scores'][2]

In [37]:
#Label extraction is not fully repoduced in this notebook because
#running this classification takes SO long - saved csv from previous time
#df_lda[["bart_label_1", "bart_label_2", "bart_label_3", "bart_score_1", "bart_score_2", "bart_score_3"]] = df_lda.apply(lambda x: get_bart_labels(x["Project Description"]), axis=1)

In [46]:
#df_all = pd.concat([df_lda, pd.DataFrame(df["Project Description"])], axis=1)

In [89]:
#df_lda["Project Description"].tail(20)

In [107]:

lemmatizer = WordNetLemmatizer()
df_lda["processed_text"] = df_lda["Project Description"].str.replace('\n',' ').str.lower()
df_lda['processed_text'] = df_lda['processed_text'].apply(lemmatize_it)
df_lda["processed_text"] = df_lda['processed_text'].apply(remove_punctuations)
df_lda['processed_text'] = df_lda['processed_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_extended]))

In [108]:
df_lda['processed_text'][1]

'composer residency brian nabors include commission educational activity additional description plan include completion two work work composer nabors lead master class local school college intend beneficiary program benefit composerinresidence well general audience birmingham area public school student intend outcome artistic activity tradition strengthen nation cultural infrastructure'

BERT Topic Modeling: Third attempt to get better labels

In [109]:
docs_processed = df_lda["processed_text"]

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs_processed)
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,366,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...
1,1,21,1_personnel_response_pandemic_covid19,"[personnel, response, pandemic, covid19, cost,...","[personnel cost response covid19 pandemic, per..."
2,2,16,2_operating_general_response_pandemic,"[operating, general, response, pandemic, covid...",[general operating cost response covid19 pande...


In [110]:
topic_model.get_document_info(docs_processed)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,series community outreach additional descripti...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
1,composer residency brian nabors include commis...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
2,outreach activity plan include instrument pet ...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
3,sound investment residency commissioning plan ...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
4,outreach activity feature flutist joseph firec...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
...,...,...,...,...,...,...,...,...
398,music festival free summer festival take place...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
399,music festival free summer festival take place...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
400,music festival free summer festival take place...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
401,music festival free summer festival take place...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False


In [111]:
bertopic_df_processed = pd.DataFrame(topic_model.get_document_info(docs_processed))
data_topics_processed = df_lda.merge(bertopic_df_processed, left_on="processed_text", right_on="Document")
data_topics_processed.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,Organization Name,Discipline / Field,Category,City,State,Project Description,Fiscal Year,...,processed_text,LDA_label_7_topics,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,0,0,1,Aiken Symphony Orchestra Incorporated,Challenge America,Challenge America,Aiken,SC,Purpose: To support a series of community outr...,2022,...,series community outreach additional descripti...,lda_5,series community outreach additional descripti...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
1,1,1,2,"Alabama Symphonic Association, Inc.",Music,Grants for Arts Projects,BIRMINGHAM,AL,Purpose: To support a composer residency by Br...,2022,...,composer residency brian nabors include commis...,lda_2,composer residency brian nabors include commis...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
2,2,2,3,"Alabama Symphonic Association, Inc.",Music,Art Works,BIRMINGHAM,AL,To support concerts and outreach activities by...,2015,...,outreach activity plan include instrument pet ...,lda_5,outreach activity plan include instrument pet ...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
3,3,3,4,"Alabama Symphonic Association, Inc.",Music,Art Works,BIRMINGHAM,AL,"To support Sound Investment, a residency and c...",2013,...,sound investment residency commissioning plan ...,lda_1,sound investment residency commissioning plan ...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False
4,4,14,15,"Albany Symphony Association, Inc.",Challenge America,Challenge America,ALBANY,GA,To support a performance and outreach activiti...,2016,...,outreach activity feature flutist joseph firec...,lda_2,outreach activity feature flutist joseph firec...,0,0_music_include_student_program,"[music, include, student, program, activity, s...",[festival contemporary music music center addi...,music - include - student - program - activity...,1.0,False


In [None]:
#data_topics_processed = df.merge(bertopic_df_processed, left_on="processed_text", right_on="Document")

Bucket grants into categories based in funding amount

In [112]:
interval_range = pd.interval_range(start=0, freq=10000, end=160000)
data_topics_processed['10k_bin'] = pd.cut(data_topics_processed['Grant Amount'], bins=interval_range)
data_topics_processed['decile'] = pd.cut(data_topics_processed['Grant Amount'], bins=10)

In [113]:
data_topics_processed.to_csv("full_data.csv")

In [121]:
df_full = pd.read_csv("df_clean_10_yrs.csv")

Visualizations based on new - better - labels

In [123]:
fig = df_full["LDA_label_6_topics"].value_counts().plot(kind="bar",
                                                   title="Grants Awarded by LDA Topic",
                                                   width=1000, height=700)
fig.write_html("count_of_grants_by_lda_topic.html")
fig.show()

In [124]:
fig = df_full["bart_label_1"].value_counts().plot(kind="bar",
                                                   title="Grants Awarded by LLM First Topic",
                                                   width=1000, height=700)
fig.write_html("count_of_grants_by_llm_topic_1.html")
fig.show()

In [125]:
fig = df_full["bart_label_2"].value_counts().plot(kind="bar",
                                                   title="Grants Awarded by LLM Second Topic",
                                                   width=1000, height=700)
fig.write_html("count_of_grants_by_llm_topic_2.html")
fig.show()

In [126]:
fig = df_full["LDA_label_6_topics"].plot(kind="box",
              x=df_full["LDA_label_6_topics"],
              y=df_full["Grant Amount"],
              title="Amount Awarded by LDA Topic",
              width=1100, height=900)
# fig = df.plot.box(y=df["Grant Amount"],
#               x=df["LDA_topic"])
              #title="Amount Awarded by LDA Topic",
              #width=11000, height=900)
fig.write_html("distribution_of_money_by_lda_topic.html")
fig.show()

In [127]:
fig = df_full.plot(kind="box",
              x=df_full["bart_label_1"],
              y=df_full["Grant Amount"],
              title="Amount Awarded by LLM First Topic",
              width=1100, height=900)
fig.write_html("distribution_of_money_by_llm_topic_1.html")
fig.show()

In [128]:
fig = df_full.plot(kind="box",
              x=df_full["bart_label_2"],
              y=df_full["Grant Amount"],
              title="Amount Awarded by LLM Second Topic",
              width=1100, height=900)
fig.write_html("distribution_of_money_by_llm_topic_2.html")
fig.show()

Top words found in each funding amount bucket

In [130]:
df_full_clean = df_full[df_full['bart_label_1'].notna()]

In [154]:
vectorizer = CountVectorizer()
transformer = make_column_transformer((vectorizer, 'bart_label_1'), (vectorizer, 'bart_label_2'))
features = transformer.fit_transform(df_full_clean[["bart_label_1", "bart_label_2"]])


In [155]:
topwords = pd.DataFrame(features.toarray(), index=pd.MultiIndex.from_frame(pd.DataFrame(df_full_clean['10k_bin'])))
topwords.columns = transformer.get_feature_names_out()
topwords = topwords.reset_index().melt(id_vars=['10k_bin'],
                                                 var_name='WORDS',
                                                 value_name='Value')
topwords = topwords.groupby(['10k_bin', 'WORDS'])['Value'].sum().reset_index()
topwords = topwords.groupby(['10k_bin']).apply(lambda x: x.nlargest(20, 'Value'))
topwords.reset_index(drop=True, inplace=True)

In [156]:
topwords

Unnamed: 0,10k_bin,WORDS,Value
0,"(0, 10000]",countvectorizer-1__community,42
1,"(0, 10000]",countvectorizer-1__outreach,42
2,"(0, 10000]",countvectorizer-2__concert,34
3,"(0, 10000]",countvectorizer-2__series,34
4,"(0, 10000]",countvectorizer-2__community,31
...,...,...,...
215,"(90000, 100000]",countvectorizer-1__concert,0
216,"(90000, 100000]",countvectorizer-1__covid,0
217,"(90000, 100000]",countvectorizer-1__cultural,0
218,"(90000, 100000]",countvectorizer-1__diversity,0
