## Imports

In [1]:
import numpy as np
import pandas as pd
import datetime
import math
import random

pd.options.display.max_columns = 50

In [2]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [44]:
import pprint
from collections import defaultdict
from gensim import corpora, models, similarities

## Functions

In [102]:
def process_corpus(text_corpus):
    '''
    '''
    # Create a set of frequent words
    stoplist = ""
    stoplist = set('for a of the and to in is'.split(' '))
    
    # Lowercase each document, split it by white space and filter out stopwords
    texts = ""
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]
    
    # Count word frequencies
    frequency = defaultdict(int)
    
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Only keep words that appear more than once
    processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
    
    return processed_corpus

In [103]:
def create_index_from_corpus(processed_corpus):
    '''
    '''
    dictionary = corpora.Dictionary(processed_corpus)
    features = (len(dictionary))
    
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
    
    # train the model
    tfidf = models.TfidfModel(bow_corpus)
    
    index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=features)
    
    return index

In [159]:
def find_similar_charities(train_df,test_df):
    '''
    '''
    total = 0
    category_counter = {1:0,2:0,3:0}
    
    print("1. Processing Training Corpus")
    char_desc_trimmed = []
    for doc in train_df['description']:
        if len(doc) >= 200:
            char_desc_trimmed.append(doc)
    char_desc_trimmed = np.array(char_desc_trimmed)
    
    corpus = char_desc_trimmed
    processed_corpus = process_corpus(corpus)
    
    print("2. Creating Index from Training Corpus")
    index = create_index_from_corpus(processed_corpus)
    
    print("3. Starting Test Corpus Similarity Analysis\n")
    for ind, document in enumerate(test_df['description']):
        total +=1 
        
        #print("Top 3 Charities Similar to:", test_df['name'].iloc[ind],'\n')
        
        query_bow = dictionary.doc2bow(document.split())
        sims = index[tfidf[query_bow]]
        
        top_3_sim = dict()
        count = 3
        
        for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
            if count > 0:
                top_3_sim[document_number] = score
            count -= 1
            
        category_count = 1
        for doc, score in top_3_sim.items():
            #print(charity_navigator_df['name'][doc])
            #print(charity_navigator_df['description'][doc])
            #print(doc, score, "\n")
            
            # print (charity_navigator_df['category'][doc])
            # print (test_df['category'].iloc[ind])
            
            if charity_navigator_df['category'][doc] == test_df['category'].iloc[ind]:
                category_counter[category_count] += 1
            category_count +=1
            
    # Print Scores
    
    first_rec_score = round((category_counter[1] / total)*100,2)
    second_rec_score = round((category_counter[2] / total)*100,2)
    third_rec_score = round((category_counter[3] / total)*100,2)
    
    print ("First Recommendation Score:", first_rec_score,"%")
    print ("Second Recommendation Score:", second_rec_score,"%")   
    print ("Third Recommendation Score:", third_rec_score,"%\n")
    
    print ("AVG Recommendation Score:", (math.round((first_rec_score+second_rec_score+third_rec_score)/3),2),"%\n")
    
    pass
    

## Loading Charity Navigator Data

In [4]:
charity_navigator_df = pd.read_csv('../data/CLEAN_charity_data.csv')
charity_navigator_df = charity_navigator_df[['name','ein','category','description','motto','score','state']]
charity_navigator_df['ein'] = charity_navigator_df['ein'].apply(lambda x: int(x.replace("-","")))

In [5]:
charity_navigator_df.head()

Unnamed: 0,name,ein,category,description,motto,score,state
0,1000 Friends of Oregon,930642086,Environment,Working with Oregonians to enhance our quality...,Great communities. Working lands. Iconic Places.,91.94,OR
1,WYPR,311770828,"Arts, Culture, Humanities",Serving the metropolitan Baltimore area and th...,88.1 FM -. Your NPR News Station,85.59,MD
2,VSS Catholic Communications,911857425,Religion,VSS Catholic Communications is dedicated to an...,Spirit Catholic Radio Network,76.8,NE
3,Utah Symphony & Opera,510145980,"Arts, Culture, Humanities",The mission of the Utah Symphony & Opera is to...,"Engaging, educating, and enriching lives",91.95,UT
4,Two Ten Footwear Foundation,222579809,Human Services,"Funded solely by the footwear industry, Two Te...",Shoepeople Helping Shoepeople,90.26,MA


In [119]:
len(charity_navigator_df.groupby('category').size())

11

In [122]:
# There are 11 Categories Total
## A "Random Guess" Baseline is 1/11 = 9.09%

## Modeling

In [144]:
train_df = charity_navigator_df[:6000]
char_desc_trimmed = []
for doc in train_df['description']:
    if len(doc) >= 200:
        char_desc_trimmed.append(doc)
char_desc_trimmed = np.array(char_desc_trimmed)

In [145]:
len(char_desc_trimmed)

5616

### Sequential Train/Test Split

In [147]:
train_df = charity_navigator_df[:6000]
test_df = charity_navigator_df[-2000:]

find_similar_charities(train_df,test_df)

1. Processing Training Corpus
2. Creating Index from Training Corpus
3. Starting Test Corpus Similarity Analysis

First Recommendation Score: 11.75 %
Second Recommendation Score: 12.3 %
Third Recommendation Score: 11.3 %


### Random Train/Test Split

In [None]:
train_df, test_df = train_test_split(charity_navigator_df,test_size = 0.30)

find_similar_charities(train_df,test_df)

In [None]:
train_df, test_df = train_test_split(charity_navigator_df,test_size = 0.25)

find_similar_charities(train_df,test_df)

In [160]:
train_df, test_df = train_test_split(charity_navigator_df,test_size = 0.20)

find_similar_charities(train_df,test_df)

1. Processing Training Corpus
2. Creating Index from Training Corpus
3. Starting Test Corpus Similarity Analysis

First Recommendation Score: 12.31 %
Second Recommendation Score: 13.08 %
Third Recommendation Score: 13.14 %



AttributeError: module 'math' has no attribute 'round'