In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("joined_dataset.csv", usecols=["summary", "title",
                                                "category"])
df.head()

Unnamed: 0,summary,title,category
0,Help us set up a new training classroom to tea...,Cents Ability Classroom Set-up,Strengthening Communities
1,Mentor homeless children between the ages of 2...,Mentor Homeless Youth,Education
2,"Are you interested in journalism, research, wr...","Journalism, Copywriting, Creative Writing Oppo...",
3,"In celebration of Father's Day, we invite fath...",Volunteers Needed for Senior Lunch Program!,Helping Neighbors in Need
4,The Police Athletic League of New York City se...,Special Events Assistant,Strengthening Communities


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1125 entries, 0 to 1124
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   summary   1108 non-null   object
 1   title     1108 non-null   object
 2   category  706 non-null    object
dtypes: object(3)
memory usage: 26.5+ KB


In [4]:
df.describe()

Unnamed: 0,summary,title,category
count,1108,1108,706
unique,1058,1054,10
top,We are New York City’s largest girls-only yout...,Grant Writer,Strengthening Communities
freq,6,9,307


Let's remove the NaN from the "summary" and "category" column:

In [5]:
df = df.dropna(subset=["summary", "category"])

In [6]:
df.describe()

Unnamed: 0,summary,title,category
count,706,706,706
unique,680,679,10
top,We are New York City’s largest girls-only yout...,Teach Adults and Kids how to Ride Bikes!,Strengthening Communities
freq,6,5,307


Getting rid of punctuation marks and move to lowercase:

In [7]:
df["summary"] = df["summary"].str.replace('[,\.!?]', '')
df["summary"] = df["summary"].str.lower()
df["summary"] = df["summary"].apply(lambda x: re.sub(r'\d+(\w+|-\w+| )', '',
                                                     x.lower()))

df["title"] = df["title"].str.replace('[,\.!?]', '')
df["title"] = df["title"].str.lower()

df["category"] = df["category"].str.replace('[,\.!?]', '')
df["category"] = df["category"].str.lower()

In [9]:
df.head()

Unnamed: 0,summary,title,category
0,help us set up a new training classroom to tea...,cents ability classroom set-up,strengthening communities
1,mentor homeless children between the ages of t...,mentor homeless youth,education
3,in celebration of father's day we invite fathe...,volunteers needed for senior lunch program,helping neighbors in need
4,the police athletic league of new york city se...,special events assistant,strengthening communities
5,helping children design a stage for a decembe...,stage design,strengthening communities


In [93]:
df.category.unique()

array(['strengthening communities', 'education',
       'helping neighbors in need', 'environment', 'health',
       'emergency preparedness', 'communities and neighbors',
       'health and well-being', 'economic and workforce development',
       'emergency preparedness and response'], dtype=object)

Remove unnecessary categories:

In [94]:
df["category"] = df["category"].str.replace('emergency preparedness and response',
                                            'emergency preparedness')

df["category"] = df["category"].str.replace('health and well-being',
                                            'health')

In [95]:
df.category.unique()

array(['strengthening communities', 'education',
       'helping neighbors in need', 'environment', 'health',
       'emergency preparedness', 'communities and neighbors',
       'economic and workforce development'], dtype=object)

In [98]:
df.loc[df["category"]=="communities and neighbors", :]

Unnamed: 0,summary,title,category
666,arts and crafts volunteers are needed to assis...,arts and crafts volunteers,communities and neighbors
683,volunteers on new york cares' hunger projects ...,hunger projects with new york cares,communities and neighbors
701,lead seniors in monthly arts and crafts evening,evening art instructor - jasa co-op city,communities and neighbors
703,help serve lunch to older adults,lunch service volunteer - jasa manhattan beach...,communities and neighbors
704,it's always fun to learn a new language,russian language instructor - jasa co-op city ...,communities and neighbors
707,the 9/memorial &amp; museum honors and remembe...,9/11 memorial & museum: visitor services volun...,communities and neighbors
714,the actors theatre workshop (atw) is currently...,it volunteers for non-profit theatre and commu...,communities and neighbors
732,wibo continuously seeks out caring professiona...,discussion leader,communities and neighbors
733,assist a non profit yoga studio in administrat...,admin/front desk volunteers,communities and neighbors
734,hosh kids is making a difference in making enr...,staff development coordinator,communities and neighbors


In [10]:
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import spacy

# TF-IDF

Creating a TF-IDF vectorizer:

In [9]:
tfidf_vec = TfidfVectorizer(stop_words="english", lowercase=True,
                            ngram_range=(1, 2))

In [15]:
tfidf_vec.vocabulary_

{'help': 5032,
 'set': 10008,
 'new': 7351,
 'training': 11200,
 'classroom': 2116,
 'teach': 10919,
 'young': 12316,
 'adults': 615,
 'personal': 8177,
 'finance': 4249,
 'help set': 5150,
 'set new': 10017,
 'new training': 7368,
 'training classroom': 11203,
 'classroom teach': 2121,
 'teach young': 10934,
 'young adults': 12318,
 'adults personal': 629,
 'personal finance': 8182,
 'mentor': 6917,
 'homeless': 5281,
 'children': 1965,
 'ages': 719,
 '12': 78,
 'weekend': 11969,
 'enrichment': 3698,
 'program': 8655,
 'mentor homeless': 6922,
 'homeless children': 5282,
 'children ages': 1969,
 'ages 12': 721,
 '12 weekend': 86,
 'weekend enrichment': 11971,
 'enrichment program': 3704,
 'celebration': 1834,
 'father': 4162,
 'day': 2901,
 'invite': 5852,
 'fathers': 4164,
 'son': 10281,
 'daughter': 2897,
 'volunteer': 11641,
 'senior': 9860,
 'lunch': 6561,
 'sunday': 10743,
 'june': 5993,
 '19th': 157,
 '12pm': 93,
 'celebration father': 1836,
 'father day': 4163,
 'day invite': 2

In [10]:
tfidf_summary = tfidf_vec.fit_transform(df["summary"])

In [11]:
print(tfidf_summary)

  (0, 8182)	0.2737302971058278
  (0, 629)	0.2737302971058278
  (0, 12318)	0.23721008897959234
  (0, 10934)	0.2737302971058278
  (0, 2121)	0.2737302971058278
  (0, 11203)	0.2737302971058278
  (0, 7368)	0.2737302971058278
  (0, 10017)	0.2737302971058278
  (0, 5150)	0.23721008897959234
  (0, 4249)	0.2575698475662482
  (0, 8177)	0.20578488091242172
  (0, 615)	0.1763867150467356
  (0, 12316)	0.19617298787374657
  (0, 10919)	0.18195714705713897
  (0, 2116)	0.2299433766050215
  (0, 11200)	0.19617298787374657
  (0, 7351)	0.11437572035837502
  (0, 10008)	0.19342316847878607
  (0, 5032)	0.09858261510260274
  (1, 3704)	0.2889884172562213
  (1, 11971)	0.3071201309592041
  (1, 86)	0.3071201309592041
  (1, 721)	0.2889884172562213
  (1, 1969)	0.2889884172562213
  (1, 5282)	0.3071201309592041
  :	:
  (705, 4846)	0.15018949470591195
  (705, 9061)	0.1435036346183432
  (705, 10322)	0.15018949470591195
  (705, 3911)	0.1435036346183432
  (705, 9320)	0.12220862801387922
  (705, 3066)	0.1435036346183432
  (7

In [14]:
top_word = np.array(tfidf_summary.argmax(axis=1))
top_word

array([[  629],
       [   86],
       [  157],
       [ 3855],
       [   83],
       [  523],
       [  222],
       [ 3174],
       [  989],
       [  918],
       [10579],
       [ 7424],
       [ 1179],
       [ 9498],
       [ 1128],
       [ 9178],
       [ 2719],
       [ 7182],
       [ 4745],
       [11562],
       [ 9178],
       [ 1677],
       [  449],
       [ 4631],
       [ 2217],
       [ 3301],
       [ 1186],
       [11824],
       [ 9906],
       [ 3844],
       [  639],
       [ 5115],
       [ 8177],
       [ 1859],
       [  397],
       [  932],
       [ 3823],
       [ 5037],
       [  627],
       [ 3837],
       [ 4770],
       [11092],
       [ 1616],
       [ 1366],
       [ 1075],
       [ 2053],
       [ 4623],
       [ 3835],
       [  859],
       [ 1630],
       [  227],
       [ 1517],
       [  772],
       [11864],
       [ 1597],
       [ 2130],
       [  989],
       [ 5698],
       [ 1839],
       [ 4467],
       [ 7567],
       [ 3821],
       [

In [31]:
pred_labels = [[key for key, val in tfidf_vec.vocabulary_.items() if val == x]
               for x in top_word]

In [32]:
pred_labels

[['adults personal'],
 ['12 weekend'],
 ['19th'],
 ['events'],
 ['12 design'],
 ['activiteis'],
 ['21st 22nd'],
 ['dimension'],
 ['arts'],
 ['architecture'],
 ['street games'],
 ['nottage'],
 ['attendance event'],
 ['river'],
 ['assistant hour'],
 ['recycling'],
 ['crafts children'],
 ['national'],
 ['grades learn'],
 ['veterans'],
 ['recycling'],
 ['calls designing'],
 ['accessible manner'],
 ['general'],
 ['coaches'],
 ['documents'],
 ['attention cause'],
 ['walk'],
 ['september'],
 ['event publicity'],
 ['advancing'],
 ['help nutrition'],
 ['personal'],
 ['center jamaica'],
 ['abandoned'],
 ['area queens'],
 ['event forest'],
 ['help american'],
 ['adults new'],
 ['event organization'],
 ['graphics'],
 ['time'],
 ['browsing'],
 ['beds produce'],
 ['assist director'],
 ['city currently'],
 ['ged'],
 ['event new'],
 ['apartment'],
 ['build nurture'],
 ['22 2011'],
 ['boxes'],
 ['aloud'],
 ['wants'],
 ['brooklyn days'],
 ['clean free'],
 ['arts'],
 ['instructor'],
 ['celebration servic

# LDA

In [67]:
count_vec = CountVectorizer(preprocessor=lambda x: 
                            re.sub(r'\d+(\w+|-\w+| )', '', x.lower()),
                            stop_words="english", ngram_range=(1, 2),
                            lowercase=True)

count_summary = count_vec.fit_transform(df['summary'])

In [68]:
count_vec.get_feature_names()

['aaany',
 'aaany looking',
 'abandoned',
 'abandoned cemetery',
 'abilities',
 'abilities capable',
 'ability',
 'ability manage',
 'able',
 'able bodied',
 'able cultivate',
 'able dig',
 'able experience',
 'able grow',
 'able help',
 'able instruct',
 'able medical',
 'able participate',
 'able teach',
 'aboard',
 'aboard lilac',
 'abuse',
 'abuse incest',
 'abuse treatment',
 'abused',
 'abused women',
 'academic',
 'academic achievement',
 'academic activities',
 'academic career',
 'academic coaching',
 'academic emotional',
 'academic enrichment',
 'academic tutors',
 'academic year',
 'academics',
 'academics school',
 'academy',
 'academy harlem',
 'academy looking',
 'accelerate',
 'accelerate academic',
 'accepting',
 'accepting applications',
 'accepting inclusive',
 'access',
 'access books',
 'access health',
 'access medicaid',
 'access music',
 'access truck',
 'accessible',
 'accessible bus',
 'accessible develop',
 'accessible manner',
 'accessible schools',
 'access

In [121]:
lda = LatentDirichletAllocation(n_components=10, n_jobs=-1, random_state=1,
                                verbose=1)

In [127]:
lda_summary = lda.fit_transform(count_summary)
lda_summary

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


array([[0.00500045, 0.00500053, 0.9549913 , ..., 0.00500149, 0.00500041,
        0.00500066],
       [0.00714361, 0.00714423, 0.00714359, ..., 0.00714345, 0.0071438 ,
        0.00714357],
       [0.00384635, 0.00384647, 0.00384643, ..., 0.00384659, 0.00384632,
        0.96538222],
       ...,
       [0.00172424, 0.00172429, 0.00172427, ..., 0.98448123, 0.00172438,
        0.00172437],
       [0.00312516, 0.00312535, 0.00312532, ..., 0.00312527, 0.00312523,
        0.00312535],
       [0.00200021, 0.00200013, 0.00200011, ..., 0.0020003 , 0.00200006,
        0.00200021]])

In [133]:
lda_summary.shape

(706, 10)

In [134]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [137]:
print("Topics found via LDA:")
print_topics(lda, count_vec, 1)

Topics found via LDA:

Topic #0:
volunteers

Topic #1:
help

Topic #2:
help

Topic #3:
program

Topic #4:
volunteers

Topic #5:
help

Topic #6:
help

Topic #7:
volunteers

Topic #8:
recreation

Topic #9:
help


___

In [11]:
summary_list = df["summary"].to_list()

In [12]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [32]:
stop = stopwords.words('english')
stop.extend(["volunteer", "help", "new", "get", "nyc", "york", "city",
             "assist"])

exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [179]:
# stop = set(stopwords.words('english'))
# exclude = set(string.punctuation)
# lemma = WordNetLemmatizer()

In [15]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    
    return normalized

In [16]:
doc_clean = [clean(doc).split() for doc in summary_list] 
doc_clean

[['u',
  'set',
  'training',
  'classroom',
  'teach',
  'young',
  'adult',
  'personal',
  'finance'],
 ['mentor', 'homeless', 'child', 'age', 'weekend', 'enrichment', 'program'],
 ['celebration',
  'father',
  'day',
  'invite',
  'father',
  'son',
  'daughter',
  'senior',
  'lunch',
  'program',
  'sunday',
  'june'],
 ['police',
  'athletic',
  'league',
  'seek',
  'talented',
  'creative',
  'planning',
  'delivery',
  'special',
  'event',
  'hosted',
  'pal',
  'throughout',
  'event',
  'cater',
  'youth',
  'adult',
  'fun',
  'filled'],
 ['helping', 'child', 'design', 'stage', 'december', 'production'],
 ['jewish',
  'home',
  'lifecare',
  'seeking',
  'event',
  'setup',
  'transport',
  'resident',
  'special',
  'event',
  'activiteis',
  'holiday',
  'weekend',
  'evening'],
 ['please', 'earth', 'day', 'event', 'april'],
 ['share',
  'information',
  'nycs',
  'street',
  'vendor',
  'right',
  'responsibility',
  'know',
  'another',
  'dimension'],
 ['art',
  'eas

In [17]:
import gensim
from gensim import corpora

In [18]:
dictionary = corpora.Dictionary(doc_clean)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)],
 [(14, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)],
 [(0, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 2),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1)],
 [(10, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)],
 [(15, 1),
  (30, 2),
  (39, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1)],
 [(18, 1), (30, 1), (58, 1), (59, 1), (60, 1)],
 [(61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1)],
 [(71, 3),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (8

In [19]:
Lda = gensim.models.ldamodel.LdaModel

ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

In [20]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.016*"organization" + 0.011*"needed" + 0.008*"event"'), (1, '0.010*"program" + 0.009*"student" + 0.008*"event"'), (2, '0.012*"program" + 0.008*"school" + 0.007*"community"')]


___

In [75]:
stop2 = stopwords.words('english')
stop2.extend(["volunteer", "help", "new", "get", "nyc", "york", "city",
             "assist"])

lemma2 = WordNetLemmatizer()

In [76]:
df.category.unique()

array(['strengthening communities', 'education',
       'helping neighbors in need', 'environment', 'health',
       'emergency preparedness', 'communities and neighbors',
       'health and well-being', 'economic and workforce development',
       'emergency preparedness and response'], dtype=object)

In [91]:
nlp("well-being")[1]

-

In [92]:
filtered_categories = []

for cat in df.category.unique():
    lemma_cat = list( map( lambda word: word.lemma_ if word.lemma_\
                                                    else word.text, nlp(cat) ) )
#     lemma_cat = list(map(lambda x: lemma2.lemmatize(x), cat.split()))
    print(lemma_cat)
    filtered_categories.append(" ".join(list(filter(lambda x: x not in
                                                    stop2, lemma_cat))))

['strengthen', 'community']
['education']
['help', 'neighbor', 'in', 'need']
['environment']
['health']
['emergency', 'preparedness']
['community', 'and', 'neighbor']
['health', 'and', 'well', '-', 'being']
['economic', 'and', 'workforce', 'development']
['emergency', 'preparedness', 'and', 'response']


In [78]:
filtered_categories

['strengthen community',
 'education',
 'neighbor need',
 'environment',
 'health',
 'emergency preparedness',
 'community neighbor',
 'health well -',
 'economic workforce development',
 'emergency preparedness response']

In [None]:
stop_free = " ".join([i for i in doc.lower().split() if i not in stop])

In [25]:
from nltk.corpus import wordnet

In [30]:
synonyms = [] 

for cat in category.unique():
    for word in cat.split():
        for syn in wordnet.synsets(cat.split()): 
        for l in syn.lemmas(): 
            synonyms.append(l.name()) 
            
print(set(synonyms)) 

{'enduringness', 'speciality', 'potency', 'persuasiveness', 'strength', 'long_suit', 'specialty', 'durability', 'metier', 'forte', 'force', 'lastingness', 'military_posture', 'military_strength', 'strong_point', 'intensity', 'posture', 'forcefulness', 'effectiveness', 'military_capability', 'strong_suit', 'intensity_level'}
