## Imports

In [2]:
import numpy as np
import pandas as pd
import datetime
import math
import random

pd.options.display.max_columns = 50

In [3]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

## Functions

In [28]:
#------------------#
# BUILD OUR MODEL  #
#------------------#

def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    """ 
    Function
    --------
        Implementation of Alternating Least Squares with implicit data. We iteratively
        compute the user (x_u) and item (y_i) vectors using the following formulas:

        x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (X.T * Cu * p(u))
        y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (Y.T * Ci * p(i))
 
    Parameters
    ----------
        sparse_data (csr_matrix): Our sparse user-by-item matrix
 
        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.
 
        iterations (int): How many times we alternate between fixing and 
        updating our user and item vectors
 
        lambda_val (float): Regularization value
 
        features (int): How many latent features we want to compute.
    
    Returns:
    --------
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    # Calculate the confidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape
    
    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I

    # Start main loop. For each iteration we first compute X and then Y
    for i in range(iterations):
        print ('iteration %d of %d' % (i+1, iterations))
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)

        # Loop through all users
        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T.dot(CuI).dot(Y)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T.dot(CiI).dot(X)
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y
    

In [32]:
#------------------------------#
# CREATE USER RECOMMENDATIONS  #
#------------------------------#

def recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup, num_items=10):
    """
    Function:
    --------
        Recommend items for a given user given a trained model
    
    Parameters:
    ----------
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map charity_id to charity names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
    -------
        recommendations (pandas.DataFrame): DataFrame with num_items charity names and scores
    
    """
  
    # Get all interactions by the user
    user_interactions = data_sparse[user_id,:].toarray()

    # We don't want to recommend items the user has consumed. So let's
    # set them all to 0 and the unknowns to 1.
    user_interactions = user_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    user_interactions[user_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_interactions*rec_vector_scaled
   
    # Get all the charity indices in order of recommendations (descending) and
    # select only the top "num_items" items. 
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    charities = []
    scores = []

    # Loop through our recommended charity indicies and look up the actual charity name
    for idx in item_idx:
        charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended charity names and scores
    recommendations = pd.DataFrame({'charity': charities, 'score': scores})
    
    return recommendations

In [50]:
def recommend_similar_charities(item_lookup,item_vec,item_name="",item_id=-1,num_sim_char=10):
    '''
    '''
    if item_id == -1:
        # Charity Name --> Charity_id
        item_id = int(item_lookup['charity_id'].loc[item_lookup['Advised Charity'] == str(item_name)].iloc[0])

    # Get the item row for selected charity.
    item_vec = item_vecs[item_id].T

    # Calculate the similarity score between selected charity and other charities
    # and select the top 10 most similar.
    scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
    top_10 = np.argsort(scores)[::-1][:10]

    charities = []
    charity_scores = []

    print("Charities Similar to:", item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(item_id)].iloc[0],"\n")

    # Get and print the actual charity names and scores
    for idx in top_10:
        charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
        charity_scores.append(scores[idx])

    similar = pd.DataFrame({'charities': charities, 'score': charity_scores})

    print (similar)
    return similar

## Loading Data

In [6]:
user_and_charity_df = pd.read_csv('../data/user_and_charity_df.csv')
user_and_charity_df.drop(columns=['Unnamed: 0'],inplace=True)

In [7]:
user_and_charity_df.head()

Unnamed: 0,Donation ID,User ID,Amount,Date,To Charity,To Charity EIN,Advised Charity,Advised Charity EIN,Unnamed: 10,In Honor Of?,Event?,Cover fee?,Tip,Year,Month,Day,Hour,Message,Givz Everywhere?,Recur_Monthly,Recur_Annually,Recur_O,Date_Created_Year,Date_Created_Month,Account_Age,Gender_M,Gender_F,INCOME_CD,ZIP_FIVE,NTEE_Major_Category,NTEE_Minor_Category,County,rating
0,2753,2837,25.0,2019-09-11 03:17:52.392953-04:00,Social Good Fund,461323531,Room to Grow National Inc,134012096,0,0,0,0,0.0,2019,9,11,3,0,0,0,0,1,2019,9,0.16,0,0,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
1,2749,2847,1.0,2019-09-10 16:52:30.401878-04:00,Social Good Fund,461323531,Room to Grow National Inc,134012096,0,0,0,0,0.0,2019,9,10,16,0,0,0,0,1,2019,9,0.15,0,0,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
2,2746,2845,1.0,2019-09-09 17:17:28.541816-04:00,Social Good Fund,461323531,Room to Grow National Inc,134012096,0,0,0,0,0.0,2019,9,9,17,0,0,0,0,1,2019,9,0.15,0,0,7,10001,Human Services - Multipurpose and Other,"Children's, Youth Services",New York County,4.0
3,2752,1629,20.0,2019-09-10 22:55:52.758423-04:00,Social Good Fund,461323531,Wildlife Conservation Society,131740011,0,0,0,0,0.0,2019,9,10,22,0,0,1,0,0,2018,12,0.92,0,0,9,10460,Animal-Related,D500,Bronx County,4.0
4,2703,1629,20.0,2019-08-10 22:55:53.127860-04:00,Social Good Fund,461323531,Wildlife Conservation Society,131740011,0,0,0,0,0.0,2019,8,10,22,0,0,1,0,0,2018,12,0.92,0,0,9,10460,Animal-Related,D500,Bronx County,4.0


## Implicit Collaborative Filtering

In [19]:
raw_data = user_and_charity_df[['User ID','Advised Charity','Amount']]

In [20]:
# Drop Rows with Missing Values
data = raw_data.dropna()

# Convert charity names into numerical IDs
data['user_id'] = data['User ID'].astype("category").cat.codes
data['charity_id'] = data['Advised Charity'].astype("category").cat.codes

# Create a lookup frame so we can get the charity names back in 
# readable form later.
item_lookup = data[['charity_id', 'Advised Charity']].drop_duplicates()
item_lookup['charity_id'] = item_lookup['charity_id'].astype(str)

In [21]:
item_lookup.head()

Unnamed: 0,charity_id,Advised Charity
0,439,Room to Grow National Inc
3,557,Wildlife Conservation Society
14,396,Play Soccer to Give Corp
85,280,Lemon Bay Junior Golf Foundation Inc
89,410,Purple Heart Homes Inc


In [22]:
data = data.drop(['User ID', 'Advised Charity'], axis=1)

# Drop Rows with $0 Donation
data = data.loc[data.Amount > 0]

In [23]:
data.head()

Unnamed: 0,Amount,user_id,charity_id
0,25.0,959,439
1,1.0,963,439
2,1.0,961,439
3,20.0,593,557
4,20.0,593,557


In [24]:
# Create lists of all users, charities and amounts
users = list(np.sort(data['user_id'].unique()))
charities = list(np.sort(data['charity_id'].unique()))
amounts = list(data.Amount)

# Get the rows and columns for our new matrix
rows = data['user_id'].astype(int)
cols = data['charity_id'].astype(int)

# Contruct a sparse matrix for our users and items containing amounts
data_sparse = sparse.csr_matrix((amounts, (rows, cols)), shape=(len(users), len(charities)))

In [27]:
# Training The Model 
# (Creating User-Latent Vector and Item-Latent Vector)
user_vecs, item_vecs = implicit_als(data_sparse, iterations=20, features=20, alpha_val=40)

iteration 1 of 20
iteration 2 of 20
iteration 3 of 20
iteration 4 of 20
iteration 5 of 20
iteration 6 of 20
iteration 7 of 20
iteration 8 of 20
iteration 9 of 20
iteration 10 of 20
iteration 11 of 20
iteration 12 of 20
iteration 13 of 20
iteration 14 of 20
iteration 15 of 20
iteration 16 of 20
iteration 17 of 20
iteration 18 of 20
iteration 19 of 20
iteration 20 of 20


In [29]:
item_lookup.head()

Unnamed: 0,charity_id,Advised Charity
0,439,Room to Grow National Inc
3,557,Wildlife Conservation Society
14,396,Play Soccer to Give Corp
85,280,Lemon Bay Junior Golf Foundation Inc
89,410,Purple Heart Homes Inc


In [36]:
item_lookup

Unnamed: 0,charity_id,Advised Charity
0,439,Room to Grow National Inc
3,557,Wildlife Conservation Society
14,396,Play Soccer to Give Corp
85,280,Lemon Bay Junior Golf Foundation Inc
89,410,Purple Heart Homes Inc
...,...,...
2354,571,Zen Hospice Project Inc
2356,322,Muscular Dystrophy Association
2357,520,Treatment Advocacy Center
2358,7,Ajiri Foundation


In [39]:
item_lookup.iloc[439]

charity_id                              336
Advised Charity    Navy Seal Foundation Inc
Name: 2125, dtype: object

In [44]:
# Charity Name --> Charity_id
item_lookup['charity_id'].loc[item_lookup['Advised Charity'] == str('Room to Grow National Inc')].iloc[0]

'439'

In [38]:
# Charity_id --> Charity Name
item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(439)].iloc[0]

'Room to Grow National Inc'

In [49]:
recommend_similar_charities(item_lookup,item_vec,item_name='Navy Seal Foundation Inc');

Charities Similar to: Navy Seal Foundation Inc 

                                   charities     score
0                               Lionsraw Inc  0.021511
1  Everytown For Gun Safety Support Fund Inc  0.019936
2                   Navy Seal Foundation Inc  0.019833
3                            One Mission Inc  0.019671
4                       Hungry for Music Inc  0.018806
5                     Athlife Foundation Inc  0.015616
6                       Habitat for Humanity  0.015547
7                       Exhale to Inhale Inc  0.014914
8                   Memphis Inner City Rugby  0.014672
9                       Grateful Peoples Inc  0.014529


In [30]:
#------------------------------
# FIND SIMILAR ITEMS
#------------------------------

# Let's find similar charities to __________ (Use item_lookup to locate charity_id). 
item_id = 30

# Get the item row for selected charity.
item_vec = item_vecs[item_id].T

# Calculate the similarity score between selected charity and other charities
# and select the top 10 most similar.
scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:10]

charities = []
charity_scores = []

print("Charities Similar to:", item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(item_id)].iloc[0],"\n")

# Get and print the actual charity names and scores
for idx in top_10:
    charities.append(item_lookup['Advised Charity'].loc[item_lookup.charity_id == str(idx)].iloc[0])
    charity_scores.append(scores[idx])

similar = pd.DataFrame({'charities': charities, 'score': charity_scores})

print (similar)

Charities Similar to: American Society For The Prevention Of Cruelty To Animals 

                                           charities     score
0  American Society For The Prevention Of Cruelty...  0.012739
1                       Dana-Farber Cancer Institute  0.009687
2                           Play Soccer to Give Corp  0.008045
3                      New York Shakespeare Festival  0.008029
4  Alzheimers Disease And Related Disorders Assoc...  0.007571
5                          The Animal Medical Center  0.007026
6               Organization For Autism Research Inc  0.006788
7                      Planned Parenthood Global Inc  0.006787
8                       Hearing Charities Of America  0.006707
9                                   Social Good Fund  0.006656


In [33]:
# Let's say we want to recommend charities for user
user_id = 12

#------------------------------
# GET ITEMS Donated to BY USER
#------------------------------

# Let's print out what the user has donated to
consumed_idx = data_sparse[user_id,:].nonzero()[1].astype(str)
consumed_items = item_lookup.loc[item_lookup.charity_id.isin(consumed_idx)]
print (consumed_items)


# Let's generate and print our recommendations
recommendations = recommend(user_id, data_sparse, user_vecs, item_vecs, item_lookup)
print("\n User-User Recommendations:")
print (recommendations)

     charity_id                         Advised Charity
1512        309  Memorial Sloan Kettering Cancer Center

 User-User Recommendations:
                                             charity     score
0  Alzheimers Disease And Related Disorders Assoc...  0.605219
1                              Dirt Road Project Inc  0.497044
2                                   Social Good Fund  0.475375
3                            Puppies Behind Bars Inc  0.460700
4                                  Save the Children  0.452697
5                         Ptsd Foundation of America  0.445819
6                                 Stroke Association  0.441714
7               Alzheimers Foundation Of America Inc  0.434626
8                             Hands of the Carpenter  0.430569
9                                      Philabundance  0.426798
