## Imports

In [2]:
import numpy as np
import pandas as pd
import datetime
import math
import random

pd.options.display.max_columns = 50

In [268]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Functions

In [28]:
#------------------#
# BUILD OUR MODEL  #
#------------------#

def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """ 
    Function
    --------
        Implementation of Alternating Least Squares with implicit data. We iteratively
        compute the user (x_u) and item (y_i) vectors using the following formulas:

        x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
        y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))
 
    Parameters
    ----------
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
            in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
            updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:
    --------
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    # Calculate the confidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I

    # Start main loop. For each iteration we first compute X and then Y
    for i in range(iterations):
        print ('iteration %d of %d' % (i+1, iterations))
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T.dot(CuI).dot(Y)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T.dot(CiI).dot(X)
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y
    

In [375]:
#------------------------------#
# CREATE USER RECOMMENDATIONS  #
#------------------------------#

def recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup, num_items=3):
    """
    Function:
    --------
        Recommend items for a given user given a trained model
    
    Parameters:
    ----------
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map charity_id to charity names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
    -------
        recommendations (pandas.DataFrame): DataFrame with num_items charity names and scores
    
    """
    
    # Get all interactions by the user
    # Return Top 3 from Training Data if user not in DataBase
    user_interactions = data_sparse[user_id,:].toarray()

    
    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the unknowns to 1.
    user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    # user_interactions[user_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_interactions*rec_vector_scaled
   
    # Get all the charity indices in order of recommendations (descending) and
    # select only the top "num_items" items. 
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    charities = []
    scores = []

    # Loop through our recommended charity indicies and look up the actual charity name
    for idx in item_idx:
        charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended charity names and scores
    recommendations = pd.DataFrame({'charity': charities, 'score': scores})
    
    return recommendations

In [317]:
def recommend_similar_charities(item_lookup,item_vec,item_name="",item_id=-1,num_sim_char=10):
    '''
    Function:
    --------
        Recommend similar charities to a given charity based on model
    
    Parameters:
    ----------
        item_lookup (pandas.DataFrame): Used to map charity_id to charity names
        
        item_vec (csr_matrix): The trained item x features vectors
        
        item_name (str): Name of the charity to find similar charities to
        
        item_id (int): Number of the charity to find similar charities to
        
        num_sim_char (int): Number of similar charities to return in DESC order
        
    Returns:
    -------
        similar (pandas.DataFrame): DataFrame with charity names and scores
    
    '''
    if item_id == -1:
        # Charity Name --> Charity_id
        item_id = int(item_lookup['charity_id'].loc[item_lookup['Advised Charity'] == str(item_name)].iloc[0])

    # Get the item row for selected charity.
    item_vec = item_vecs[item_id].T

    # Calculate the similarity score between selected charity and other charities
    # and select the top 10 most similar.
    scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
    top_10 = np.argsort(scores)[::-1][:10]

    charities = []
    charity_scores = []

    print("Charities Similar to:", item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(item_id)].iloc[0],"\n")

    # Get and print the actual charity names and scores
    for idx in top_10:
        charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
        charity_scores.append(scores[idx])

    similar = pd.DataFrame({'charities': charities, 'score': charity_scores})

    print (similar)
    return similar

In [318]:
def create_collab_filtering_model(df):
    '''
    Function:
    --------
        Create the collaborative filtering model using ALS
    
    Parameters:
    ----------
        df (pandas.DataFrame): Dataframe to use to train the model
        
    Returns:
    -------
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        data_sparse (csr_matrix): Sparse matrix with user and item interactions
        
        item_lookup (pandas.DataFrame): Used to map charity_id to charity names 
        
        user_lookup (pandas.DataFrame): Used to map user_id to user names
    '''
    raw_data = df[['User ID','Advised Charity','Amount']]
    # Drop Rows with Missing Values
    data = raw_data.dropna()

    # Convert charity names into numerical IDs
    data['user_id'] = data['User ID'].astype("category").cat.codes
    data['charity_id'] = data['Advised Charity'].astype("category").cat.codes

    # Create a lookup frame so we can get the charity names back in 
    # readable form later.
    user_lookup = data[['user_id', 'User ID']].drop_duplicates()
    user_lookup['user_id'] = user_lookup['user_id'].astype(str)
    
    # Create a lookup frame so we can get the charity names back in 
    # readable form later.
    item_lookup = data[['charity_id', 'Advised Charity']].drop_duplicates()
    item_lookup['charity_id'] = item_lookup['charity_id'].astype(str)
    
    data = data.drop(['User ID', 'Advised Charity'], axis=1)

    # Drop Rows with $0 Donation
    data = data.loc[data.Amount > 0]
    
    # Create lists of all users, charities and amounts
    users = list(np.sort(data['user_id'].unique()))
    charities = list(np.sort(data['charity_id'].unique()))
    amounts = list(data.Amount)

    # Get the rows and columns for our new matrix
    rows = data['user_id'].astype(int)
    cols = data['charity_id'].astype(int)

    # Contruct a sparse matrix for our users and items containing amounts
    data_sparse = sparse.csr_matrix((amounts, (rows, cols)), shape=(len(users), len(charities)))
    
    # Training The Model 
    # (Creating User-Latent Vector and Item-Latent Vector)
    user_vecs, item_vecs = implicit_als(data_sparse, iterations=20, features=20, alpha_val=40)
    
    return user_vecs, item_vecs, data_sparse, item_lookup, user_lookup

In [319]:
def print_scoring_results(pop_score,pop_total,cat_pop_score,
                         rec_score,rec_total,cat_rec_score):
    '''
    Function:
    --------
        Print the Scoring Results of a tested model
    
    Parameters:
    ----------
        pop_score,pop_total,cat_pop_score (int): Scores of Popularity Recommendations
        
        rec_score,rec_total,cat_rec_score (int): Scores of User History Recommendations
        
    Returns:
    -------
        None : Prints Scores
    '''
    print("Popular Recommendation Results:")
    print("Pop Score:", pop_score)
    print("Pop Total:", pop_total)
    print("Pop Accuracy %:", round((pop_score/pop_total)*100,2))    

    print("")

    print("Cat Pop Score:", cat_pop_score)
    print("Cat Pop Total:", pop_total)
    print("Cat Pop Accuracy %:", round((cat_pop_score/pop_total)*100,2))

    print("\n")

    print("Actual Recommendation Results:")
    print("Rec Score:", rec_score)
    print("Rec Total:", rec_total)
    print("Rec Accuracy %:", round((rec_score/rec_total)*100,2))

    print("")

    print("Cat Rec Score:", cat_rec_score)
    print("Cat Rec Total:", rec_total)
    print("Cat Rec Accuracy %:", round((cat_rec_score/rec_total)*100,2))
    
    print("\n")

    print("Total Recommendation Results:")
    print("Total Rec Score:", (rec_score+pop_score))
    print("Total:", (rec_total+pop_total))
    print("Total Accuracy %:", round(((rec_score+pop_score)/(rec_total+pop_total))*100,2))

    print("")

    print("Total Cat Rec Score:", (cat_rec_score+cat_pop_score))
    print("Total:", (rec_total+pop_total))
    print("Total Cat Rec Accuracy %:", round(((cat_rec_score+cat_pop_score)/(rec_total+pop_total))*100,2))
    pass

In [270]:
def get_user_id_from_table(user_lookup,User_ID):
    '''
    Function:
    --------
        Get user_id from User ID in user_lookup table
    
    Parameters:
    ----------
        user_lookup (pandas.DataFrame): Table of user_id and User ID
        
        User_ID (int): User ID to find user_id for
        
    Returns:
    -------
        user_id (int): user_id in user_lookup table. Returns -1 if user has not made a previous donation
    '''
    try:
        user_id = user_lookup[user_lookup['User ID'] == User_ID].user_id.iloc[0]
    except:
        user_id = -1
    
    return int(user_id)

In [320]:
def create_popular_lists_from_train(train_df):
    '''
    Function:
    --------
        Create Popular Lists from Training Data
    
    Parameters:
    ----------
        train_df (pandas.DataFrame): Data Used to Train Model
        
    Returns:
    -------
        None: Prints out results from training 
    '''
    top_3_from_train = train_df.groupby('Advised Charity')['Amount'].median().sort_values(ascending=False).head(3)
    top_3_pop_charities_list = list(top_3_from_train.index)
    
    top_3_train_categories = []
    for char in top_3_pop_charities_list:
        top_3_train_categories.append(train_df[train_df['Advised Charity']==char].NTEE_Major_Category.iloc[0])
    
    return top_3_pop_charities_list, top_3_train_categories

In [373]:
def train_model_and_score_testing_data(test_df, train_df):
    '''
    Function:
    --------
        Train and Score Model
    
    Parameters:
    ----------
        train_df (pandas.DataFrame): Data Used to Train Model
        
        test_df (pandas.DataFrame): Data Used to Test Model
        
    Returns:
    -------
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        data_sparse (csr_matrix): Sparse matrix with user and item interactions
        
        item_lookup (pandas.DataFrame): Used to map charity_id to charity names 
        
        user_lookup (pandas.DataFrame): Used to map user_id to user names
    '''
    # Train the model from training data -> Default 20 iterations ALS
    user_vecs, item_vecs, data_sparse, item_lookup, user_lookup = create_collab_filtering_model(train_df)
    
    # Creating "Popular" Charity List and Categories
    top_3_pop_charities_list, top_3_train_catgories = create_popular_lists_from_train(train_df)
    
    pop_total,pop_score,cat_pop_score = 0,0,0
    rec_total,rec_score,cat_rec_score = 0,0,0

    for i,row in test_df[:1000].iterrows():
        user_id = get_user_id_from_table(user_lookup,int(row['User ID']))

        top_3_recs_list = []

        # Scoring "Popular Recommendations" for Users not in DataBase
        if user_id == -1:
            pop_total += 1
            top_3_recs_list = top_3_pop_charities_list

            if(row['Advised Charity']) in top_3_recs_list:
                pop_score += 1

            if(row['NTEE_Major_Category'] in top_3_train_catgories):
                cat_pop_score += 1


        # Scoring "Actual Recommendations" for Users previously in DataBase
        if user_id != -1:
            rec_total += 1
            # Let's generate and print our recommendations
            recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup,num_items=3)

            #print("\n User-User Recommendations:")
            for char in (recommendations.charity):
                top_3_recs_list.append(char)

            if(row['Advised Charity']) in top_3_recs_list:
                rec_score += 1

            top_3_test_catgories = []
            for char in top_3_recs_list:
                top_3_test_catgories.append(train_df[train_df['Advised Charity']==char].NTEE_Major_Category.iloc[0])

            if(row['NTEE_Major_Category'] in top_3_test_catgories):
                cat_rec_score += 1

    print("\n")
    print_scoring_results(pop_score,pop_total,cat_pop_score,
                             rec_score,rec_total,cat_rec_score)
    
    return user_vecs, item_vecs, data_sparse, item_lookup, user_lookup

## Loading User Donation Data

In [83]:
user_and_charity_df = pd.read_csv('../data/user_and_charity_df.csv',parse_dates=True)
user_and_charity_df['Date'] = user_and_charity_df['Date'].apply(lambda x: pd.to_datetime(x))

In [310]:
user_and_charity_df.head()

Unnamed: 0,Donation ID,User ID,Amount,Date,Advised Charity,Advised Charity EIN,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Message,Givz Everywhere?,Recur_Monthly,Recur_Annually,Recur_O,Date_Created_Year,Date_Created_Month,Account_Age,INCOME_CD,ZIP_FIVE,NTEE_Major_Category,NTEE_Minor_Category,County,rating
0,2753,2837,25.0,2019-09-11 03:17:52.392953-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,11,3,0,0,0,0,1,2019,9,0.16,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
1,2749,2847,1.0,2019-09-10 16:52:30.401878-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,10,16,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
2,2746,2845,1.0,2019-09-09 17:17:28.541816-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,9,17,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
3,2752,1629,20.0,2019-09-10 22:55:52.758423-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,9,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0
4,2703,1629,20.0,2019-08-10 22:55:53.127860-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,8,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0


## Loading Charity Navigator Data

In [331]:
charity_navigator_df = pd.read_csv('../data/CLEAN_charity_data.csv')
charity_navigator_df = charity_navigator_df[['name','ein','category','description','motto','score','state']]
charity_navigator_df['ein'] = charity_navigator_df['ein'].apply(lambda x: int(x.replace("-","")))

In [330]:
charity_navigator_df.head()

Unnamed: 0,name,ein,category,description,motto,score,state
0,1000 Friends of Oregon,930642086,Environment,Working with Oregonians to enhance our quality...,Great communities. Working lands. Iconic Places.,91.94,OR
1,WYPR,311770828,"Arts, Culture, Humanities",Serving the metropolitan Baltimore area and th...,88.1 FM -. Your NPR News Station,85.59,MD
2,VSS Catholic Communications,911857425,Religion,VSS Catholic Communications is dedicated to an...,Spirit Catholic Radio Network,76.8,NE
3,Utah Symphony & Opera,510145980,"Arts, Culture, Humanities",The mission of the Utah Symphony & Opera is to...,"Engaging, educating, and enriching lives",91.95,UT
4,Two Ten Footwear Foundation,222579809,Human Services,"Funded solely by the footwear industry, Two Te...",Shoepeople Helping Shoepeople,90.26,MA


## Joining User Donation Data with Charity Navigator Data

In [346]:
nav_df = user_and_charity_df.merge(charity_navigator_df,how='left',left_on='Advised Charity EIN',right_on='ein')
nav_df.drop(columns=['rating','name','ein','category','state'],inplace=True)

In [350]:
nav_df.head(1000)

Unnamed: 0,Donation ID,User ID,Amount,Date,Advised Charity,Advised Charity EIN,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Message,Givz Everywhere?,Recur_Monthly,Recur_Annually,Recur_O,Date_Created_Year,Date_Created_Month,Account_Age,INCOME_CD,ZIP_FIVE,NTEE_Major_Category,NTEE_Minor_Category,County,description,motto,score
0,2753,2837,25.0,2019-09-11 03:17:52.392953-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,11,3,0,0,0,0,1,2019,9,0.16,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,"Founded in 1998, Room to Grow's innovative thr...",Building a strong foundation for babies in pov...,91.27
1,2749,2847,1.0,2019-09-10 16:52:30.401878-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,10,16,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,"Founded in 1998, Room to Grow's innovative thr...",Building a strong foundation for babies in pov...,91.27
2,2746,2845,1.0,2019-09-09 17:17:28.541816-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,9,17,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,"Founded in 1998, Room to Grow's innovative thr...",Building a strong foundation for babies in pov...,91.27
3,2752,1629,20.0,2019-09-10 22:55:52.758423-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,9,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,"The Wildlife Conservation Society (WCS), found...",Saving wildlife,91.96
4,2703,1629,20.0,2019-08-10 22:55:53.127860-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,8,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,"The Wildlife Conservation Society (WCS), found...",Saving wildlife,91.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1623,1750,250.0,2018-12-14 22:14:16.153438-05:00,Tree of Life Congregation,250979381,0,0,0,0.0,2018,12,14,22,0,1,0,0,1,2018,12,0.89,0,15217,"Religion-Related, Spiritual Development",Jewish X40 Islamic X50 Buddhist X70 Hindu,Allegheny County,,,
996,1622,1747,540.0,2018-12-14 22:13:19.658898-05:00,Tree of Life Congregation,250979381,0,0,0,0.0,2018,12,14,22,0,1,0,0,1,2018,12,0.89,0,15217,"Religion-Related, Spiritual Development",Jewish X40 Islamic X50 Buddhist X70 Hindu,Allegheny County,,,
997,1621,1748,180.0,2018-12-14 22:10:00.035752-05:00,Tree of Life Congregation,250979381,0,0,0,0.0,2018,12,14,22,0,1,0,0,1,2018,12,0.89,0,15217,"Religion-Related, Spiritual Development",Jewish X40 Islamic X50 Buddhist X70 Hindu,Allegheny County,,,
998,1620,1746,180.0,2018-12-14 22:05:20.430139-05:00,Tree of Life Congregation,250979381,0,0,0,0.0,2018,12,14,22,0,1,0,0,1,2018,12,0.89,0,15217,"Religion-Related, Spiritual Development",Jewish X40 Islamic X50 Buddhist X70 Hindu,Allegheny County,,,


In [344]:
len(nav_df['description'].unique())

197

## Implicit Collaborative Filtering

In [352]:
user_and_charity_df.head()

Unnamed: 0,Donation ID,User ID,Amount,Date,Advised Charity,Advised Charity EIN,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Message,Givz Everywhere?,Recur_Monthly,Recur_Annually,Recur_O,Date_Created_Year,Date_Created_Month,Account_Age,INCOME_CD,ZIP_FIVE,NTEE_Major_Category,NTEE_Minor_Category,County,rating
0,2753,2837,25.0,2019-09-11 03:17:52.392953-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,11,3,0,0,0,0,1,2019,9,0.16,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
1,2749,2847,1.0,2019-09-10 16:52:30.401878-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,10,16,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
2,2746,2845,1.0,2019-09-09 17:17:28.541816-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,9,17,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
3,2752,1629,20.0,2019-09-10 22:55:52.758423-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,9,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0
4,2703,1629,20.0,2019-08-10 22:55:53.127860-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,8,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0


In [353]:
user_and_charity_implicit = user_and_charity_df.copy()

In [355]:
user_and_charity_implicit = user_and_charity_df.copy()
user_and_charity_implicit['Amount'] = user_and_charity_implicit['Amount'].apply(lambda x: 1)

In [356]:
user_and_charity_implicit.head()

Unnamed: 0,Donation ID,User ID,Amount,Date,Advised Charity,Advised Charity EIN,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Message,Givz Everywhere?,Recur_Monthly,Recur_Annually,Recur_O,Date_Created_Year,Date_Created_Month,Account_Age,INCOME_CD,ZIP_FIVE,NTEE_Major_Category,NTEE_Minor_Category,County,rating
0,2753,2837,1,2019-09-11 03:17:52.392953-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,11,3,0,0,0,0,1,2019,9,0.16,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
1,2749,2847,1,2019-09-10 16:52:30.401878-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,10,16,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
2,2746,2845,1,2019-09-09 17:17:28.541816-04:00,Room to Grow National Inc,134012096,0,0,0,0.0,2019,9,9,17,0,0,0,0,1,2019,9,0.15,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
3,2752,1629,1,2019-09-10 22:55:52.758423-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,9,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0
4,2703,1629,1,2019-08-10 22:55:53.127860-04:00,Wildlife Conservation Society,131740011,0,0,0,0.0,2019,8,10,22,0,0,1,0,0,2018,12,0.92,9,10460,Animal-Related,D500,Bronx County,4.0


In [386]:
user_and_charity_implicit.groupby('NTEE_Major_Category').size().sort_values(ascending=False)

NTEE_Major_Category
Education                                                365
International, Foreign Affairs and National Security     331
Youth Development                                        285
Human Services - Multipurpose and Other                  234
Recreation, Sports, Leisure, Athletics                   164
Health - General and Rehabilitative                      148
Public, Society Benefit - Multipurpose and Other         115
Animal-Related                                           101
Civil Rights, Social Action, Advocacy                     90
Diseases, Disorders, Medical Disciplines                  85
Philanthropy, Voluntarism and Grantmaking Foundations     69
Arts, Culture and Humanities                              66
Crime, Legal-Related                                      52
Environmental Quality, Protection and Beautification      50
Mental Health, Crisis Intervention                        46
Religion-Related, Spiritual Development                   42
Medi

In [382]:
user_and_charity_implicit.groupby('Advised Charity').size().sort_values(ascending=False)

Advised Charity
Reconnect Brooklyn                                 80
Trustees Of Hamilton College                       79
Memphis Inner City Rugby                           78
Tzahal Shalom Of N Westchester Incorporated        74
Play Soccer to Give Corp                           71
                                                   ..
Heaven Can Wait Rescue Inc                          1
Richmond Symphony                                   1
Hias Inc                                            1
High School Network For Global Philanthropy Inc     1
A New Chance Animal Rescue Inc                      1
Length: 574, dtype: int64

In [357]:
raw_data = user_and_charity_df[['User ID','Advised Charity','Amount']]

In [20]:
# Drop Rows with Missing Values
data = raw_data.dropna()

# Convert charity names into numerical IDs
data['user_id'] = data['User ID'].astype("category").cat.codes
data['charity_id'] = data['Advised Charity'].astype("category").cat.codes

# Create a lookup frame so we can get the charity names back in 
# readable form later.
item_lookup = data[['charity_id', 'Advised Charity']].drop_duplicates()
item_lookup['charity_id'] = item_lookup['charity_id'].astype(str)

In [22]:
data = data.drop(['User ID', 'Advised Charity'], axis=1)

# Drop Rows with $0 Donation
data = data.loc[data.Amount > 0]

In [24]:
# Create lists of all users, charities and amounts
users = list(np.sort(data['user_id'].unique()))
charities = list(np.sort(data['charity_id'].unique()))
amounts = list(data.Amount)

# Get the rows and columns for our new matrix
rows = data['user_id'].astype(int)
cols = data['charity_id'].astype(int)

# Contruct a sparse matrix for our users and items containing amounts
data_sparse = sparse.csr_matrix((amounts, (rows, cols)), shape=(len(users), len(charities)))

In [44]:
# Charity Name --> Charity_id
item_lookup['charity_id'].loc[item_lookup['Advised Charity'] == str('Room to Grow National Inc')].iloc[0]

'439'

In [27]:
# Training The Model 
# (Creating User-Latent Vector and Item-Latent Vector)
user_vecs, item_vecs = implicit_als(data_sparse, iterations=20, features=20, alpha_val=40)

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


In [38]:
# Charity_id --> Charity Name
item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(439)].iloc[0]

'Room to Grow National Inc'

In [39]:
item_lookup.iloc[439]

charity_id                              336
Advised Charity    Navy Seal Foundation Inc
Name: 2125, dtype: object

In [49]:
recommend_similar_charities(item_lookup,item_vec,item_name='Navy Seal Foundation Inc');

Charities Similar to: Navy Seal Foundation Inc 

                                   charities     score
0                               Lionsraw Inc  0.021511
1  Everytown For Gun Safety Support Fund Inc  0.019936
2                   Navy Seal Foundation Inc  0.019833
3                            One Mission Inc  0.019671
4                       Hungry for Music Inc  0.018806
5                     Athlife Foundation Inc  0.015616
6                       Habitat for Humanity  0.015547
7                       Exhale to Inhale Inc  0.014914
8                   Memphis Inner City Rugby  0.014672
9                       Grateful Peoples Inc  0.014529


In [30]:
#------------------------------
# FIND SIMILAR ITEMS
#------------------------------

# Let's find similar charities to __________ (Use item_lookup to locate charity_id). 
item_id = 30

# Get the item row for selected charity.
item_vec = item_vecs[item_id].T

# Calculate the similarity score between selected charity and other charities
# and select the top 10 most similar.
scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:10]

charities = []
charity_scores = []

print("Charities Similar to:", item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(item_id)].iloc[0],"\n")

# Get and print the actual charity names and scores
for idx in top_10:
    charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
    charity_scores.append(scores[idx])

similar = pd.DataFrame({'charities': charities, 'score': charity_scores})

print (similar)

Charities Similar to: American Society For The Prevention Of Cruelty To Animals 

                                           charities     score
0  American Society For The Prevention Of Cruelty...  0.012739
1                       Dana-Farber Cancer Institute  0.009687
2                           Play Soccer to Give Corp  0.008045
3                      New York Shakespeare Festival  0.008029
4  Alzheimers Disease And Related Disorders Assoc...  0.007571
5                          The Animal Medical Center  0.007026
6               Organization For Autism Research Inc  0.006788
7                      Planned Parenthood Global Inc  0.006787
8                       Hearing Charities Of America  0.006707
9                                   Social Good Fund  0.006656


In [58]:
# Let's say we want to recommend charities for user
user_id = 20

#------------------------------
# GET ITEMS Donated to BY USER
#------------------------------

# Let's print out what the user has donated to
consumed_idx = data_sparse[user_id,:].nonzero()[1].astype(str)
consumed_items = item_lookup.loc[item_lookup.charity_id.isin(consumed_idx)]
print (consumed_items)


# Let's generate and print our recommendations
recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup)
print("\n User-User Recommendations:")
print (recommendations)

    charity_id                  Advised Charity
358        525  Trustees For Harvard University
912        311         Memphis Inner City Rugby

 User-User Recommendations:
                              charity     score
0         Wounded Warrior Project Inc  0.941374
1                Hungry for Music Inc  0.796742
2                Bent On Learning Inc  0.686778
3                    Bent On Learning  0.649153
4              Pat Tillman Foundation  0.624113
5  Bob Woodruff Family Foundation Inc  0.577210
6                Kenya Education Fund  0.574245
7   Temple Beth El Of Northern Valley  0.517592
8                          Spirits Up  0.489190
9                                 USO  0.482612


## Testing and Scoring the Model

### Train-Test Attempt #1 - Based on Time Series Split

In [307]:
print("Attempt #1 - Based on Time-Series Split")

print("\n Splitting Data into Train/Test")
# Splitting Data into Train and Split based on Time Series
train_df = (user_and_charity_df[user_and_charity_df['Date'] < pd.to_datetime('2019-04-01 08:24:10.798807-04:00')])
test_df = (user_and_charity_df[user_and_charity_df['Date'] > pd.to_datetime('2019-04-01 08:24:10.798807-04:00')])

print("\n Training and Scoring Model: \n")
user_vecs, item_vecs, data_sparse, item_lookup, user_lookup = train_model_and_score_testing_data(test_df, train_df)

Attempt #1 - Based on Time-Series Split

 Splitting Data into Train/Test

 Training and Scoring Model: 

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


Popular Recommendation Results:
Pop Score: 0
Pop Total: 396
Pop Accuracy %: 0.0

Cat Pop Score: 67
Cat Pop Total: 396
Cat Pop Accuracy %: 16.92


Actual Recommendation Results:
Rec Score: 0
Rec Total: 191
Rec Accuracy %: 0.0

Cat Rec Score: 29
Cat Rec Total: 191
Cat Rec Accuracy %: 15.18


Total Recommendation Results:
Total Rec Score: 0
Total: 587
Total Accuracy %: 0.0

Total Cat Rec Score: 96
Total: 587
Total Cat Rec Accuracy %: 16.35


### Train-Test Attempt #2 - Based on Random Split  -----------     *Current Best Model*

In [379]:
print("Attempt #2 - Based on Random Split")

print("\n Splitting Data into Train/Test")
# Splitting Data into Train and Split randomly
train_df, test_df = train_test_split(user_and_charity_df,test_size = 0.3)

print("\n Training and Scoring Model: \n")
user_vecs, item_vecs, data_sparse, item_lookup, user_lookup = train_model_and_score_testing_data(test_df, train_df)

Attempt #2 - Based on Random Split

 Splitting Data into Train/Test

 Training and Scoring Model: 

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


Popular Recommendation Results:
Pop Score: 0
Pop Total: 211
Pop Accuracy %: 0.0

Cat Pop Score: 66
Cat Pop Total: 211
Cat Pop Accuracy %: 31.28


Actual Recommendation Results:
Rec Score: 214
Rec Total: 498
Rec Accuracy %: 42.97

Cat Rec Score: 325
Cat Rec Total: 498
Cat Rec Accuracy %: 65.26


Total Recommendation Results:
Total Rec Score: 214
Total: 709
Total Accuracy %: 30.18

Total Cat Rec Score: 391
Total: 709
Total Cat Rec Accuracy %: 55.15


### Train-Test Attempt #3 - Based on Implicit Training Model (Time-Series Split)

In [360]:
user_and_charity_implicit = user_and_charity_df.copy()
user_and_charity_implicit['Amount'] = user_and_charity_implicit['Amount'].apply(lambda x: 1)

In [361]:
print("Attempt #3 - Based on Implicit Training Model (Time-Series Split)")

print("\n Splitting Data into Train/Test")
# Splitting Data into Train and Split based on Time Series
train_df = (user_and_charity_implicit[user_and_charity_implicit['Date'] < pd.to_datetime('2019-04-01 08:24:10.798807-04:00')])
test_df = (user_and_charity_implicit[user_and_charity_implicit['Date'] > pd.to_datetime('2019-04-01 08:24:10.798807-04:00')])

print("\n Training and Scoring Model: \n")
user_vecs, item_vecs, data_sparse, item_lookup, user_lookup = train_model_and_score_testing_data(test_df, train_df)

Attempt #3 - Based on Implicit Training Model (Time-Series Split)

 Splitting Data into Train/Test

 Training and Scoring Model: 

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


Popular Recommendation Results:
Pop Score: 0
Pop Total: 396
Pop Accuracy %: 0.0

Cat Pop Score: 16
Cat Pop Total: 396
Cat Pop Accuracy %: 4.04


Actual Recommendation Results:
Rec Score: 0
Rec Total: 191
Rec Accuracy %: 0.0

Cat Rec Score: 53
Cat Rec Total: 191
Cat Rec Accuracy %: 27.75


Total Recommendation Results:
Total Rec Score: 0
Total: 587
Total Accuracy %: 0.0

Total Cat Rec Score: 69
Total: 587
Total Cat Rec Accuracy %: 11.75


### Train-Test Attempt #4 - Based on Implicit Training Model (Random Split)

In [377]:
user_and_charity_implicit = user_and_charity_df.copy()
user_and_charity_implicit['Amount'] = user_and_charity_implicit['Amount'].apply(lambda x: 1)

In [378]:
print("Attempt #4 - Based on Implicit Model (Random Split)")

print("\n Splitting Data into Train/Test")
# Splitting Data into Train and Split randomly
train_df, test_df = train_test_split(user_and_charity_implicit,test_size = 0.3)

print("\n Training and Scoring Model: \n")
user_vecs, item_vecs, data_sparse, item_lookup, user_lookup = train_model_and_score_testing_data(test_df, train_df)

Attempt #4 - Based on Implicit Model (Random Split)

 Splitting Data into Train/Test

 Training and Scoring Model: 

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


Popular Recommendation Results:
Pop Score: 1
Pop Total: 225
Pop Accuracy %: 0.44

Cat Pop Score: 66
Cat Pop Total: 225
Cat Pop Accuracy %: 29.33


Actual Recommendation Results:
Rec Score: 238
Rec Total: 484
Rec Accuracy %: 49.17

Cat Rec Score: 312
Cat Rec Total: 484
Cat Rec Accuracy %: 64.46


Total Recommendation Results:
Total Rec Score: 239
Total: 709
Total Accuracy %: 33.71

Total Cat Rec Score: 378
Total: 709
Total Cat Rec Accuracy %: 53.31
