In [1]:
import pandas as pd

from nltk.util import ngrams
from textblob import TextBlob

from collections import defaultdict
from operator import itemgetter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

from scipy.cluster.hierarchy import ward, dendrogram, linkage
#from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation


In [3]:
non_profit_dict = {'V. Human Services': 'human_services', 'IV. Health':'health', 'II. Education':'education',
                  'III. Environment and Animals':'env_animals', 'VII. Public, Societal Benefit':'public_benefit',
                  'I. Arts, Culture, and Humanities':'arts',
                   'X. Unknown, Unclassified or Other': 'other', 'IX. Mutual/Membership Benefit':'other',
                 'VI. International, Foreign Affairs' : 'foreign_affairs', 'VIII. Religion Related':'religion'}

In [4]:
df = pd.read_csv('All_Data_Combined_v3.csv', encoding = 'latin1')
df.columns

Index(['year', 'rowkey', 'story', 'storyclean', 'ipaddress', 'charityname',
       'ein', 'nonprofit_focus', 'ntee_short', 'ntee_desc', 'age',
       'genderclean', 'gender', 'zip', 'state', 'votes_lower', 'votes_higher',
       'rankpercentile_lower', 'rankpercentile_higher', 'domain', 'extention',
       'username'],
      dtype='object')

In [5]:
df['nonprofit_cat'] = [non_profit_dict[focus.strip()] for focus in df.nonprofit_focus]

In [6]:
df.groupby('nonprofit_cat').describe().iloc[:,2:].ix['arts'] #choose any nonprofit_cat to look at stats per category

Unnamed: 0,rankpercentile_lower,votes_higher,votes_lower,year
count,81.0,81.0,81.0,81.0
mean,0.429557,211.209877,30.382716,2016.444444
std,0.330079,1251.005892,69.204691,0.5
min,0.0,0.0,0.0,2016.0
25%,0.100861,1.0,1.0,2016.0
50%,0.437884,4.0,3.0,2016.0
75%,0.693727,42.0,23.0,2017.0
max,0.97294,11200.0,506.0,2017.0


In [7]:
df.genderclean.value_counts()

Female              1041
Male                 280
Decline to State       2
Name: genderclean, dtype: int64

In [8]:
df.genderclean.isnull().sum()

94

### Unsupervised Clustering using Kmeans.

#### Females.

In [9]:
females = df[df.genderclean == 'Female']
stories = females.storyclean
len(stories)

1041

In [10]:
def mytokenizer(doc):
    
    stop = stopwords.words('english')
    stop += ['.', ',', '(', ')', "'", '"']
    
    tokenized = []
    words = word_tokenize(doc)
    for w in words:
        if re.match("([a-zA-Z]+'[a-zA-Z]+)$|([a-zA-Z]+')$|([a-zA-Z]+)$",w):
            if w not in stop:
                tokenized.append(w)
    return(tokenized)

In [13]:
n_clusters =5
tf_idf = TfidfVectorizer(encoding='latin-1',max_df=0.9, min_df=2, tokenizer= mytokenizer, ngram_range=(1,1), max_features=100000)
tf_idf.fit(stories)

X = tf_idf.fit_transform(stories).toarray()
terms = tf_idf.get_feature_names()

km = KMeans(n_clusters).fit(X)
km_result = km.fit_predict(X)

In [14]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(n_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print(' %s,' % terms[ind],end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: student, school, education, child, program, art, wa, college, girl, music,

Cluster 1 words: cancer, wa, family, diagnosed, treatment, research, year, life, patient, disease,

Cluster 2 words: dog, animal, rescue, pet, shelter, wa, home, help, foster, cat,

Cluster 3 words: wa, child, life, help, family, year, give, community, people, time,

Cluster 4 words: horse, riding, equine, carrot, wa, golden, casey, ranch, ride, care,



#### Counts of documents per each cluster.

In [15]:
pd.Series(km.labels_).value_counts()

3    735
0    117
1     98
2     63
4     28
dtype: int64

#### Males.

In [16]:
males = df[df.gender == 'Male']
stories = males.storyclean
len(stories)

114

In [17]:
def mytokenizer(doc):
    
    stop = stopwords.words('english')
    stop += ['.', ',', '(', ')', "'", '"', 'wa']
    
    tokenized = []
    words = word_tokenize(doc)
    for w in words:
        if re.match("([a-zA-Z]+'[a-zA-Z]+)$|([a-zA-Z]+')$|([a-zA-Z]+)$",w):
            if w not in stop:
                tokenized.append(w)
    return(tokenized)

In [18]:
n_clusters = 3
tf_idf = TfidfVectorizer(encoding='latin-1',max_df=0.5, min_df=0.01, tokenizer= mytokenizer, ngram_range=(1,1), max_features=10000)
tf_idf.fit(stories)

X = tf_idf.fit_transform(stories).toarray()
terms = tf_idf.get_feature_names()

km = KMeans(n_clusters).fit(X)
km_result = km.fit_predict(X)

In [19]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(n_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :20]: #replace with n words per cluster
        print(' %s,' % terms[ind],end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: food, meal, shelter, need, donated, city, benefit, woman, support, scholarship, dignity, pet, time, large, technology, obstacle, baby, mission, card, cause,

Cluster 1 words: community, child, world, one, giving, give, people, u, school, program, need, student, day, ha, project, every, work, time, could, new,

Cluster 2 words: cancer, kid, cure, time, disease, one, breast, day, truck, research, many, mom, camp, raise, im, people, child, money, friend, want,



#### Counts of documents per each cluster.

In [20]:
pd.Series(km.labels_).value_counts()

1    74
2    31
0     9
dtype: int64

### Hierarchichal clustering using stories did not work. Use cosine metric for distance as we do not want the length of the story to influence the clustering.
I tried to use age, and nonprofit_cat as labels. 

In [21]:
def mytokenizerHC(doc):

    stop = []
    #stopwords.words('english')
    stop += ['.', ',', '(', ')', "'", '"']
    
    tokenized = []
    words = word_tokenize(doc)
    for w in words:
        if re.match("([a-zA-Z]+'[a-zA-Z]+)|([a-zA-Z]+')|([a-zA-Z]+)",w):
            if w not in stop:
                tokenized.append(w)
    return(tokenized)

In [None]:
data = df.storyclean

vectorizer = CountVectorizer(stop_words= None,ngram_range=(3, 3), tokenizer = mytokenizerHC,
                             encoding='latin-1', max_features=1000, min_df=0.01, max_df=0.85)
tf = vectorizer.fit(data)
tf_matrix = tf.fit_transform(data).toarray()

linkage_matrix = linkage(tf_matrix, method='weighted', metric='cosine')

sns.set_style("white")
fig, ax = plt.subplots(figsize=(20, 8)) # set size
ax = dendrogram(linkage_matrix, orientation="top", labels = df.nonprofit_cat);
plt.title('Dendrogram of GivingTuesday Stories.', fontsize = 22)
plt.xticks(rotation=15,fontsize =17)

plt.tight_layout() #show plot with tight layout

### LDA Analysis - topic modeling.

In [23]:
stories = df.storyclean

In [24]:
def mytokenizerLDA(doc):
    
    stop = stopwords.words('english')
    stop.extend([s.capitalize() for s in stop])
    stop += ['.', ',', '(', ')', "'", '"', 'ha', 'u', 'would', 'one']
    
    tokenized = []
    words = word_tokenize(doc)
    for w in words:
        if re.match("([a-zA-Z]+'[a-zA-Z]+)|([a-zA-Z]+')|([a-zA-Z]+)",w):
            if w not in stop:
                tokenized.append(w)
    return(tokenized)

In [25]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {0}:".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    

In [26]:
def sorted_num_of_docs_per_topic(corpus, fitted_model, tf_matrix):
    topics = []
    docs_by_topics_df = pd.DataFrame(fitted_model.transform(tf_matrix))
    for row, val in docs_by_topics_df.iterrows():
        topics.append(val.argmax())
    result = pd.DataFrame(pd.Series(topics).value_counts()).reset_index()
    result.columns = ['topic_num', 'docs_num']
    return result

#### Males.

In [28]:
n_features = 10000
n_topics = 3
n_top_words = 20

In [29]:
males = df[df.genderclean == 'Male']
stories = males.storyclean
len(stories)

280

In [30]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.7, min_df=2, ngram_range=(1, 1), max_features=n_features, tokenizer = mytokenizerLDA)
tf = tf_vectorizer.fit_transform(stories)

Extracting tf features for LDA...


In [31]:
print("Fitting LDA models with tf features ..")
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=200,
                                learning_method='batch', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features ..
Topic 0:
cancer family year life help time people child kid disease many give foundation could friend animal money day support like
Topic 1:
life community school student program give child year organization help world giving time people family opportunity need project many way
Topic 2:
year time day people help many life could get like family back need work home every food volunteer world first



In [32]:
sorted_num_of_docs_per_topic(stories, lda, tf)

Unnamed: 0,topic_num,docs_num
0,1,130
1,2,86
2,0,64


#### Females.

In [33]:
def mytokenizerLDA(doc):
    
    stop = stopwords.words('english')
    stop.extend([s.capitalize() for s in stop])
    stop += ['.', ',', '(', ')', "'", '"', 'wa', 'ha', 'give', 'one', 'could', 'u', 'would']
    
    tokenized = []
    words = word_tokenize(doc)
    for w in words:
        if re.match("([a-zA-Z]+'[a-zA-Z]+)|([a-zA-Z]+')|([a-zA-Z]+)",w):
            if w not in stop:
                tokenized.append(w)
    return(tokenized)

In [34]:
females = df[df.genderclean == 'Female']
stories = females.storyclean
len(stories)

1041

In [35]:
n_features = 10000
n_topics = 8
n_top_words = 10

In [36]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.7, min_df=2, ngram_range=(1, 1), max_features=n_features, tokenizer = mytokenizerLDA)
tf = tf_vectorizer.fit_transform(stories)

print("Fitting LDA models with tf features ..")
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=200,
                                learning_method='batch', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...
Fitting LDA models with tf features ..
Topic 0:
child community school help program organization year family girl support
Topic 1:
year life time family day cancer people help get know
Topic 2:
time life room bird israel take home experience building able
Topic 3:
riding school music firefighter student program camp year kid deaf
Topic 4:
nicu community de day support womb neonatal free world foundation
Topic 5:
school student year child life time like world kid first
Topic 6:
help child life family dog people need many animal home
Topic 7:
test disease wildlife center psp virginia alzheimers eagle cure hope



In [37]:
sorted_num_of_docs_per_topic(stories, lda, tf)

Unnamed: 0,topic_num,docs_num
0,0,274
1,6,271
2,1,264
3,5,195
4,7,12
5,4,12
6,2,7
7,3,6


In [None]:
#readability, textstat packages - readability measures