# Smart Recommender using Turi Create & Content Similarity
https://apple.github.io/turicreate/docs/userguide/
https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.item_similarity_recommender.create.html#turicreate.recommender.item_similarity_recommender.create

Originally written for sproul.club.

Idea: User-based recommendations. Use collaborative filtering to get recommendations based on what similar users favorite. Remove any recommendations that don't fall into the user's interests, and then fill with clubs with similar description to user favorites.

### Imports

In [461]:
# import packages
# Native imports
import os
!pip install dnspython
import json

# MongoDB Python driver import
import pymongo

# 3rd-party data science imports
import pandas as pd
import seaborn as sns

# NLP Pre-processing tools
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# Main NLP tool kit
import gensim           # https://radimrehurek.com/gensim/auto_examples/index.html

# Model training
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.manifold import TSNE

import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import numpy as np



In [462]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12, 12)

### Connect to MongoDB

In [None]:
!pip install pymongo[srv]
conn_string = #REDACTED
client = pymongo.MongoClient(conn_string)

In [464]:
ALL_CLUB_TAGS = []

# Fetch list of tags
for tag in client['production-db']['tag'].find():
    ALL_CLUB_TAGS += [{
        'id': tag['_id'],
        'name': tag['name']
    }]

all_club_tags_df = pd.DataFrame(ALL_CLUB_TAGS)
all_club_tags_df.head()

Unnamed: 0,id,name
0,0,Advocacy
1,1,ASUC
2,2,Business
3,3,CalGreek
4,4,Community Service


In [465]:
club_info_db = []

for user in client['production-db']['new_base_user'].find({'role': 'officer'}):
    club_name = user['club']['name'].strip()
    club_link_name = user['club']['link_name'].strip()
    club_description = user['club']['about_us'].strip()
    club_tags = user['club']['tags']
    
    club_info_db += [{
        'name': club_name,
        'link_name': club_link_name,
        'description': club_description,
        'tags': club_tags,
    }]
    
club_db_df = pd.DataFrame(club_info_db)
club_db_df = club_db_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
club_db_df = club_db_df.reset_index(drop = True)
club_db_df

Unnamed: 0,name,link_name,description,tags
0,The Berkeley Alt. Protein Project,the-berkeley-alt.-protein-project,The Berkeley Alt. Protein Project strives to b...,"[10, 11, 18]"
1,EthiCAL Apparel,ethical-apparel,<p>EthiCAL Apparel is a <strong>student-run so...,"[2, 8, 20]"
2,UC Rally Committee,uc-rally-committee,The UC Rally Committee’s members are the offic...,"[20, 21]"
3,Formula Electric at Berkeley,formula-electric-at-berkeley,Join us in building Cal's first ever Formula S...,"[2, 8, 9]"
4,Codebase,codebase,Codebase is a community of developers that emp...,"[5, 19, 22]"
...,...,...,...,...
281,UNICEF at Berkeley,unicef-at-berkeley,<p>As one of the leading UNICEF Campus Initiat...,"[0, 19, 20]"
282,Volunteers Around the World,volunteers-around-the-world,<p>We are a student organization dedicated to ...,"[0, 4, 11]"
283,Womxn Ignite @ Berkeley,womxn-ignite-berkeley,<p>Hello Bears! We are Womxn Ignite @ Berkeley...,"[0, 14, 20]"
284,Delta Consulting,delta-consulting,<h2>Overview</h2>\n<p>Delta Consulting is UC B...,"[2, 6, 22]"


## Simulate User Favorites and Interest Lists 
Note: we do this since we don't have data yet! Once we have a list of users with their favorites and interests, we can use that instead.

In [466]:
#Find number of clubs in each tag
tag1 = set([1])
istag1 = tag1.issubset
club_db_df[club_db_df.tags.map(istag1)]

for i in all_club_tags_df.index:
    tagnum = set([i])
    istag = tagnum.issubset
    num_clubs_in_tag = len(club_db_df[club_db_df.tags.map(istag)])
    print ('tag', i, 'has', num_clubs_in_tag, 'clubs')

tag 0 has 44 clubs
tag 1 has 10 clubs
tag 2 has 47 clubs
tag 3 has 3 clubs
tag 4 has 65 clubs
tag 5 has 31 clubs
tag 6 has 33 clubs
tag 7 has 42 clubs
tag 8 has 26 clubs
tag 9 has 25 clubs
tag 10 has 13 clubs
tag 11 has 45 clubs
tag 12 has 18 clubs
tag 13 has 21 clubs
tag 14 has 20 clubs
tag 15 has 50 clubs
tag 16 has 15 clubs
tag 17 has 18 clubs
tag 18 has 23 clubs
tag 19 has 81 clubs
tag 20 has 69 clubs
tag 21 has 10 clubs
tag 22 has 55 clubs
tag 23 has 5 clubs


In [467]:
# Simulation Function
import random
'''
    - Takes in # of Students/Users to sample for
    - Assumes that users will only favorite clubs that have tags in interest list 
        (if there are even enough clubs in the tag, else it just randomly picks favorite clubs)
    - Simulates between 1-3 interests and 1-3 favorites from those interests
    
'''
def simulate(num_students):
    users = list(range(num_students))
    sample_users = pd.DataFrame(users).rename(columns={0:'user_id'})
    interests, favorites = [], []
    for i in range(len(sample_users)):
        a = random.sample(set(range(0, 24)), random.randint(1,3))
        interests += [a]
        intersections = [set.intersection(set(a), set(i)) for i in club_db_df['tags']]
        club_db_df['intersections'] = [list(i) for i in intersections]
        club_db_df['len_intersections'] = [len(i) for i in club_db_df['intersections']]
        clubs_within_interests = club_db_df[club_db_df['len_intersections']>=1]
        favorites += [random.sample(set(clubs_within_interests['name']), random.randint(1,3)) if len(clubs_within_interests)>=3 
                      else random.sample(set(club_db_df['name']), random.randint(1,10))]

    sample_users['interests'] = interests
    sample_users['favorites'] = favorites
    return sample_users

In [468]:
sample_users = simulate(100)
sample_users

Unnamed: 0,user_id,interests,favorites
0,0,[15],[Her Campus at Berkeley]
1,1,[22],[Web Development at Berkeley]
2,2,[6],"[DataStory, Berkeley Phi Beta Lambda, The Berk..."
3,3,[22],"[Software and Hardware Recruiting List, Politi..."
4,4,"[13, 22, 7]","[Cal RoboBears, Latinx Student Association at ..."
...,...,...,...
95,95,[7],[Azaad Premiere Dance]
96,96,"[18, 0]",[Perennial: The Undergraduate Environmental Jo...
97,97,"[14, 0]",[Peer Health Exchange]
98,98,"[15, 4]","[ATP at Berkeley, Phoenix Consulting Group, SE..."


In [469]:
#Create Exploded DF to prepare for TuriCreate modeling (one favorite per line)
sample_users_exploded = sample_users.explode('favorites')
sample_users_exploded.head()

Unnamed: 0,user_id,interests,favorites
0,0,[15],Her Campus at Berkeley
1,1,[22],Web Development at Berkeley
2,2,[6],DataStory
2,2,[6],Berkeley Phi Beta Lambda
2,2,[6],The Berkeley Group


### Implement TuriCreate Item Similarity
This will allow us to use collaborative filtering to recommend users clubs that other users favorited if their interests are similar.

In [470]:
!pip install turicreate
import turicreate as tc



In [471]:
#create SFrame for Turi Create
sf_users = tc.SFrame(sample_users_exploded)

In [472]:
sf_users

user_id,interests,favorites
0,[15.0],Her Campus at Berkeley
1,[22.0],Web Development at Berkeley ...
2,[6.0],DataStory
2,[6.0],Berkeley Phi Beta Lambda
2,[6.0],The Berkeley Group
3,[22.0],Software and Hardware Recruiting List ...
3,[22.0],Political Computer Science ...
3,[22.0],Blockchain at Berkeley
4,"[13.0, 22.0, 7.0]",Cal RoboBears
4,"[13.0, 22.0, 7.0]",Latinx Student Association at Berkeley ...


In [473]:
#Create tc model
m = tc.item_similarity_recommender.create(sf_users, user_id = 'user_id', item_id = 'favorites', similarity_type = 'cosine', only_top_k = 5)


In [474]:
#Create Trained Recommendation DataFrame
recs = m.recommend()

### Create and clean joined df (with user favorites and model recommendations)

In [475]:
joined_recs = sf_users.join(recs, on= 'user_id', how='inner')
joined_recs_df = pd.DataFrame(joined_recs).rename(columns = {'favorites.1': 'recommended'})
joined_recs_df

Unnamed: 0,user_id,interests,favorites,recommended,score,rank
0,0,[15.0],Her Campus at Berkeley,Cal Seismic Design Team,0.707107,1
1,0,[15.0],Her Campus at Berkeley,Berkeley Innovation,0.500000,2
2,0,[15.0],Her Campus at Berkeley,UC Rally Committee,0.000000,3
3,0,[15.0],Her Campus at Berkeley,Latinx Student Association at Berkeley,0.000000,4
4,0,[15.0],Her Campus at Berkeley,Cal RoboBears,0.000000,5
...,...,...,...,...,...,...
1945,99,"[0.0, 15.0, 13.0]",United Nations Association of Berkeley,Political Computer Science,0.000000,6
1946,99,"[0.0, 15.0, 13.0]",United Nations Association of Berkeley,Software and Hardware Recruiting List,0.000000,7
1947,99,"[0.0, 15.0, 13.0]",United Nations Association of Berkeley,The Berkeley Group,0.000000,8
1948,99,"[0.0, 15.0, 13.0]",United Nations Association of Berkeley,Berkeley Phi Beta Lambda,0.000000,9


In [476]:
#Taking top 5 ranked recommendations and collapsing dataframe so it is easier to read
old_fav = sample_users[['user_id', 'favorites']]
fixed = joined_recs_df.join(old_fav, on = 'user_id', lsuffix='_left', rsuffix='_right', how='left')
fixed = fixed[['user_id_left','favorites_right','recommended','score','rank']].rename(columns={'user_id_left':'user_id', 'favorites_right': 'favorites'})
fixed['favorites'] = [tuple(i) for i in fixed['favorites']]
fixed = fixed.drop_duplicates()
fixed = fixed[(fixed['rank'] <= 5)]
final = fixed.groupby('user_id')
recs2 = []
for name, group in final:
    recs2 += [group['recommended'].tolist()]
fixed.head()                                     

Unnamed: 0,user_id,favorites,recommended,score,rank
0,0,"(Her Campus at Berkeley,)",Cal Seismic Design Team,0.707107,1
1,0,"(Her Campus at Berkeley,)",Berkeley Innovation,0.5,2
2,0,"(Her Campus at Berkeley,)",UC Rally Committee,0.0,3
3,0,"(Her Campus at Berkeley,)",Latinx Student Association at Berkeley,0.0,4
4,0,"(Her Campus at Berkeley,)",Cal RoboBears,0.0,5


### Collab Filtering Dataset

In [477]:
final_cleaned = fixed[['user_id', 'favorites']].drop_duplicates().reset_index().drop(columns={'index'})
final_cleaned['recs'] = recs2
final_cleaned.head()

Unnamed: 0,user_id,favorites,recs
0,0,"(Her Campus at Berkeley,)","[Cal Seismic Design Team, Berkeley Innovation,..."
1,1,"(Web Development at Berkeley,)","[Berkeley Innovation, Berkeley Legends, SENDfo..."
2,2,"(DataStory, Berkeley Phi Beta Lambda, The Berk...","[Cal Dragon Boat, Cal Triathlon, UC Rally Comm..."
3,3,"(Software and Hardware Recruiting List, Politi...","[Extended Reality at Berkeley, REACH! Asian an..."
4,4,"(Cal RoboBears, Latinx Student Association at ...","[Launchpad, Cal Triathlon, UC Rally Committee,..."


### Random Recommendation Test For Feasibility
Notes: This model looks at what other users' favorites are and creates recommendations based on other pairings. Similarity scores for now don't seem high, and there are other parameters we should look at and tweak in the future.

In [478]:
#Run to compare a random club's favorites with recommendations

club = random.randint(0,len(sample_users))
print('Favorites: ', final_cleaned['favorites'][club],' Recommendations: ', final_cleaned['recs'][club])

Favorites:  ('Partners in Health Engage at Berkeley',)  Recommendations:  ['Pinnacle Consulting', "Active for Alzheimer's", 'Latinx Student Association at Berkeley', 'Cal RoboBears', 'Blockchain at Berkeley']


### Test whether recommendations are in a each user's interests
We only want to keep recommended clubs from item similarity that are in each user's interests! 

In [479]:
relevant_recs = final_cleaned
relevant_recs['interests'] = sample_users['interests']
relevant_recs

Unnamed: 0,user_id,favorites,recs,interests
0,0,"(Her Campus at Berkeley,)","[Cal Seismic Design Team, Berkeley Innovation,...",[15]
1,1,"(Web Development at Berkeley,)","[Berkeley Innovation, Berkeley Legends, SENDfo...",[22]
2,2,"(DataStory, Berkeley Phi Beta Lambda, The Berk...","[Cal Dragon Boat, Cal Triathlon, UC Rally Comm...",[6]
3,3,"(Software and Hardware Recruiting List, Politi...","[Extended Reality at Berkeley, REACH! Asian an...",[22]
4,4,"(Cal RoboBears, Latinx Student Association at ...","[Launchpad, Cal Triathlon, UC Rally Committee,...","[13, 22, 7]"
...,...,...,...,...
95,95,"(Azaad Premiere Dance,)","[Cal UPHC, Latinx Student Association at Berke...",[7]
96,96,(Perennial: The Undergraduate Environmental Jo...,"[Student Premed Advising Network [SPAN], Latin...","[18, 0]"
97,97,"(Peer Health Exchange,)","[United Nations Association of Berkeley, Stude...","[14, 0]"
98,98,"(ATP at Berkeley, Phoenix Consulting Group, SE...","[MCBcDNA, Consult Your Community (CYC), Berkel...","[15, 4]"


In [480]:
def relevance(rec_list, interest_list):
    '''
    This function takes in a list of recommendations and an interest list and returns recommendations that are in interest lists
    If a user has no interests, then will just return all recommendations as relevant.
    '''
    relevant_clubs = []
    if len(interest_list) == 0:
        relevant_clubs = [i for i in rec_list]
        return relevant_clubs 
    else:
        for club in rec_list:
            club_tags = list(club_db_df[club_db_df['name'] == club]['tags'])[0]
            for tag in club_tags:
                if tag in interest_list:
                    relevant_clubs += [club]
                    break
    return relevant_clubs        
        
        
relevance(relevant_recs['recs'][0], relevant_recs['interests'][0])



[]

### Remove clubs that aren't within a user's interests

In [481]:
# Apply to the rest of the rows
relevant_recs['relevant_clubs'] = relevant_recs.apply(lambda row : relevance(row['recs'],
                                  row['interests']), axis = 1)
relevant_recs.head()

Unnamed: 0,user_id,favorites,recs,interests,relevant_clubs
0,0,"(Her Campus at Berkeley,)","[Cal Seismic Design Team, Berkeley Innovation,...",[15],[]
1,1,"(Web Development at Berkeley,)","[Berkeley Innovation, Berkeley Legends, SENDfo...",[22],[Berkeley Legends]
2,2,"(DataStory, Berkeley Phi Beta Lambda, The Berk...","[Cal Dragon Boat, Cal Triathlon, UC Rally Comm...",[6],[]
3,3,"(Software and Hardware Recruiting List, Politi...","[Extended Reality at Berkeley, REACH! Asian an...",[22],"[Extended Reality at Berkeley, FemTech]"
4,4,"(Cal RoboBears, Latinx Student Association at ...","[Launchpad, Cal Triathlon, UC Rally Committee,...","[13, 22, 7]","[Launchpad, Blockchain at Berkeley, Political ..."


## Fill blanks with Generic Club Recommendations
Use content similarity (generic club algorithm) to fill in the blanks from removed irrelevant recommendations from collaboraitve filtering.

### Generic Club Algorithm: Content Similarity

In [482]:
#STEP 1: CLEAN TABLE
def clean_table(table):
    """
    Description:
    Cleans club dataframe descriptions into another column containing a lists of significant words in each description.
    
    Input:
    table - club dataframe
    
    Output:
    cleaned_table - table with a new column: "cleaned descriptions"
    
    """
    
    def clean_description(description):
        """
        Description:
        Clean single description into lists of significant words.

        Input:
        description - string of club description 

        Output:
        new_description - list of significant words in description
        """
        
        try: 
            # Remove punctuation
            new_description = re.sub("[^a-zA-Z]", " ", description)

            # Tokenize into words (all lower case)
            new_description = new_description.lower().split()

            # Remove stopwords
            eng_stopwords = set(stopwords.words("english"))
            new_description = [w for w in new_description if not w in eng_stopwords]

            # Remove "uc" and "berkeley"
            uc_berkeley = ['uc', 'berkeley', 'also', 'providing', 'various', 'well', 'provide', 'one']
            new_description = [w for w in new_description if not w in uc_berkeley]
        except TypeError:
            return [""]

        return new_description
    
    clean_descriptions = []
    
    for i in np.arange(len(table)):
        clean_descriptions += [clean_description(table['description'][i])]
        
    cleaned_table = table.drop(['description'], axis=1)
    cleaned_table['clean_description'] = clean_descriptions
    
    return cleaned_table

cleaned_table = clean_table(club_db_df)
cleaned_table

Unnamed: 0,name,link_name,tags,intersections,len_intersections,clean_description
0,The Berkeley Alt. Protein Project,the-berkeley-alt.-protein-project,"[10, 11, 18]",[],0,"[alt, protein, project, strives, build, health..."
1,EthiCAL Apparel,ethical-apparel,"[2, 8, 20]",[],0,"[p, ethical, apparel, strong, student, run, so..."
2,UC Rally Committee,uc-rally-committee,"[20, 21]",[],0,"[rally, committee, members, official, guardian..."
3,Formula Electric at Berkeley,formula-electric-at-berkeley,"[2, 8, 9]",[],0,"[join, us, building, cal, first, ever, formula..."
4,Codebase,codebase,"[5, 19, 22]",[],0,"[codebase, community, developers, empowers, st..."
...,...,...,...,...,...,...
281,UNICEF at Berkeley,unicef-at-berkeley,"[0, 19, 20]",[0],1,"[p, leading, unicef, campus, initiatives, aspi..."
282,Volunteers Around the World,volunteers-around-the-world,"[0, 4, 11]",[0],1,"[p, student, organization, dedicated, promotin..."
283,Womxn Ignite @ Berkeley,womxn-ignite-berkeley,"[0, 14, 20]",[0],1,"[p, hello, bears, womxn, ignite, collegiate, c..."
284,Delta Consulting,delta-consulting,"[2, 6, 22]",[],0,"[h, overview, h, p, delta, consulting, strong,..."


In [483]:
#STEP 2: VECTORIZE TABLE
def vectorize_table(table, yield_model = False):
    MIN_WORD_COUNT = 20
    VECTOR_SIZE = 100
    CONTEXT_WINDOW_SIZE = 10
    
    """
    Description:
    Uses cleaned table to create another column containing vectors using gensim's word2vec.
    
    Input:
    table - cleaned table
    
    Output:
    vectorized_table - table with a new_column: "vector sum"
    
    Run word2vec model
    """
    
    list_vectors = []
    
    model = gensim.models.Word2Vec(
        table['clean_description'],
        min_count=MIN_WORD_COUNT,
        size=VECTOR_SIZE,
        window=CONTEXT_WINDOW_SIZE,
        compute_loss=True,
        sample=1e-3 / 2,
        seed=42,
        workers=1
    )
    
    for i in range(len(table)):
        ith_description = table['clean_description'][i]    
        
        ith_vector_list = []
        for ith_description_word in ith_description:
            if ith_description_word in model.wv:
                ith_vector_list += [model.wv[ith_description_word]]
        
        if len(ith_vector_list) == 0:
            description_sum_vector = [1e-6] * VECTOR_SIZE
        else:
            description_sum_vector = sum(np.array(ith_vector_list))
            
        list_vectors += [description_sum_vector]
        
    vectorized_table = table.copy()
    vectorized_table['vector_sum'] = list_vectors
    
    if yield_model:
        return vectorized_table, model
    else:
        return vectorized_table
    
vector_table, w2v_model = vectorize_table(cleaned_table, yield_model=True)
vector_table

Unnamed: 0,name,link_name,tags,intersections,len_intersections,clean_description,vector_sum
0,The Berkeley Alt. Protein Project,the-berkeley-alt.-protein-project,"[10, 11, 18]",[],0,"[alt, protein, project, strives, build, health...","[0.0083650965, -0.014156526, -0.0027822545, 0...."
1,EthiCAL Apparel,ethical-apparel,"[2, 8, 20]",[],0,"[p, ethical, apparel, strong, student, run, so...","[0.018845102, -0.14857782, -0.011552009, -0.04..."
2,UC Rally Committee,uc-rally-committee,"[20, 21]",[],0,"[rally, committee, members, official, guardian...","[0.008026369, -0.09144393, 0.026339643, -0.037..."
3,Formula Electric at Berkeley,formula-electric-at-berkeley,"[2, 8, 9]",[],0,"[join, us, building, cal, first, ever, formula...","[0.0011783288, -0.052926827, 0.03044209, -0.03..."
4,Codebase,codebase,"[5, 19, 22]",[],0,"[codebase, community, developers, empowers, st...","[0.009571702, -0.08639321, 0.029330207, -0.025..."
...,...,...,...,...,...,...,...
281,UNICEF at Berkeley,unicef-at-berkeley,"[0, 19, 20]",[0],1,"[p, leading, unicef, campus, initiatives, aspi...","[0.008294601, -0.11319724, -0.0039510364, -0.0..."
282,Volunteers Around the World,volunteers-around-the-world,"[0, 4, 11]",[0],1,"[p, student, organization, dedicated, promotin...","[0.02544077, -0.09716282, 0.037796564, -0.0106..."
283,Womxn Ignite @ Berkeley,womxn-ignite-berkeley,"[0, 14, 20]",[0],1,"[p, hello, bears, womxn, ignite, collegiate, c...","[-0.0014062113, -0.09386333, -0.01878618, -0.0..."
284,Delta Consulting,delta-consulting,"[2, 6, 22]",[],0,"[h, overview, h, p, delta, consulting, strong,...","[0.02652006, -0.18142174, 0.04048846, -0.07147..."


In [484]:
#STEP 3: CREATE DISTANCE TABLE
from scipy import spatial

def create_distance_table(table):
    """
    Description:
    Uses a vectorized table to create a pivot table containing distances between each club.
    
    Input:
    table - table with vectorized descriptions
    
    Output:
    distance_table - table containing all distances between each club
    
    """
    
    dict = table[['link_name', 'vector_sum']].set_index('link_name')['vector_sum'].to_dict()
    distance_list = []
    
    for club_1 in dict:
        vector_1 = dict[club_1]
        distance_dictionary = {}
        
        for club_2 in dict:
            vector_2 = dict[club_2]
            
            cosine_sim = 1 - spatial.distance.cosine(vector_1, vector_2)
            distance_dictionary[club_2] = cosine_sim
            
        distance_list += [distance_dictionary]
        
    distance_table = pd.DataFrame(data=distance_list)
    distance_table.index = table['link_name']
    
    return distance_table

In [485]:
distance_table = create_distance_table(vector_table)
distance_table

Unnamed: 0_level_0,the-berkeley-alt.-protein-project,ethical-apparel,uc-rally-committee,formula-electric-at-berkeley,codebase,berkeley-phi-beta-lambda,cal-seismic-design-team-,sendforc-at-uc-berkeley,beam--berkeley-engineers-and-mentors,berkeley-finance-club,...,nextgen-consulting,bare-magazine,phi-delta-epsilon,underwater-robotics-at-berkeley,reach-asian-and-pacific-islander-recruitment-and-retention-center,unicef-at-berkeley,volunteers-around-the-world,womxn-ignite-berkeley,delta-consulting,biofuels-technology-club
link_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the-berkeley-alt.-protein-project,1.000000,0.936440,0.934839,0.913621,0.946679,0.943201,0.946135,0.952112,0.059047,0.931035,...,0.946223,0.059047,0.936881,0.932828,0.937526,0.941223,0.936032,0.927831,0.937292,0.935098
ethical-apparel,0.936440,1.000000,0.972559,0.933958,0.975164,0.977925,0.972106,0.969986,0.020248,0.962386,...,0.993335,0.020248,0.993228,0.986603,0.992584,0.982321,0.984404,0.992591,0.992389,0.978835
uc-rally-committee,0.934839,0.972559,1.000000,0.947197,0.973878,0.982986,0.971159,0.972477,0.034249,0.967880,...,0.978608,0.034249,0.976043,0.971811,0.968250,0.978412,0.981915,0.968939,0.975256,0.967887
formula-electric-at-berkeley,0.913621,0.933958,0.947197,1.000000,0.940542,0.962001,0.949755,0.941188,0.050537,0.927266,...,0.944117,0.050537,0.937625,0.954711,0.935886,0.939227,0.937696,0.929891,0.953451,0.929127
codebase,0.946679,0.975164,0.973878,0.940542,1.000000,0.983713,0.977173,0.973726,0.059115,0.970006,...,0.981242,0.059115,0.978218,0.972722,0.975969,0.971463,0.979377,0.970345,0.980653,0.959684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
unicef-at-berkeley,0.941223,0.982321,0.978412,0.939227,0.971463,0.975574,0.965858,0.970782,-0.019417,0.962485,...,0.987128,-0.019417,0.981799,0.975986,0.981096,1.000000,0.982877,0.983824,0.979850,0.989599
volunteers-around-the-world,0.936032,0.984404,0.981915,0.937696,0.979377,0.983528,0.971591,0.967102,0.027456,0.967990,...,0.989707,0.027456,0.986948,0.980247,0.982695,0.982877,1.000000,0.982263,0.986885,0.976712
womxn-ignite-berkeley,0.927831,0.992591,0.968939,0.929891,0.970345,0.971397,0.964334,0.963412,0.016586,0.952025,...,0.991496,0.016586,0.994340,0.989475,0.989725,0.983824,0.982263,1.000000,0.986532,0.977335
delta-consulting,0.937292,0.992389,0.975256,0.953451,0.980653,0.983970,0.977770,0.967687,0.032679,0.969046,...,0.992949,0.032679,0.987620,0.989393,0.993019,0.979850,0.986885,0.986532,1.000000,0.973482


In [486]:
#STEP 4: RECOMMEND
def recommend(table, club_db_df, club_name, k):
    """
    Description:
    Recommends club based off of k-nearest neighbors, prioritizing matching tags.
    
    Input:
    table - table containing all distances between each club
    club_db_df - table containing club tags
    club_name - string of club name we want to create recommendations for
    k - represents how many neighbors
    
    Output:
    recommendations - k recommendations based off of closest distances
    
    """
    
    def filter_by_tag(club_link_name, k):
        """
        Description:
        Return boolean list that contains max amount of matching tags that satisfies matching k amount of clubs.

        Output:
        filtered_clubs - list of booleans based off of clubs filtered by tags

        """
        
        def count_tags(a, b, num_tags):
            """
            Description:
            Returns if club A and B contain at least k tags in common.

            Input:
            a - list of club tags for club A
            b - list of club tags for club B
            num_tags - minimum number of tags required to match

            Output:
            matching_tags - # of matching tags

            """
            matching_tags = len(set(a).intersection(set(b))) >= num_tags
            return matching_tags

        club_tags = club_db_df[club_db_df['link_name'] == club_link_name]['tags'].iloc[0]
        filtered_clubs = []

        num_of_tags = len(club_tags)
        while (np.count_nonzero(filtered_clubs) - 1 < k):
            if num_of_tags == 0:
                print("Matched {} tags".format(num_of_tags))
                return [True] * len(club_db_df)

            filtered_clubs = []
            for other_club_tags in club_db_df['tags']:
                if count_tags(club_tags, other_club_tags, num_of_tags):
                    filtered_clubs += [True]
                else:
                    filtered_clubs += [False]
            num_of_tags -= 1

        else:
            # print("Matched {} tags".format(num_of_tags + 1))
            # print("{} matches".format(np.count_nonzero(filtered_clubs)))
            return filtered_clubs
    
    filtered_clubs = filter_by_tag(club_name, k)
        
    filtered_distances = table[filtered_clubs]
    club_distances = filtered_distances[club_name] 
    sorted_club_distances = club_distances.sort_values(ascending = True, na_position = 'last')
    recommendations = sorted_club_distances.keys()[1: k + 1]
    
    return list(recommendations)

In [487]:
recommendations = recommend(distance_table, club_db_df, 'big-data-at-berkeley', 3)
recommendations

['plextech', 'datastory', 'data-science-society']

### Applying content recommendations to collaborative filtering model
Fills in blanks for collaborative filtering recommendations that were not within student's interests.

In [521]:
def fill_with_generic(favorites_list, interests_list, relevant_clubs_list):
    '''
    For each row, we want to fill relevant_clubs with randomly sampled top recs for each club in favorites list
    edge case: if there is no favorites list, fill with clubs within interested tags
    Users with no interests will not need fill because all clubs are relevant according to relevance function
    '''
    new_recs = []
    #set final recs to relevant_clubs link_names
    final_recs = [list(club_db_df[club_db_df['name'] == i]['link_name'])[0] for i in relevant_clubs_list]
    
    #If there are less than 5 relevant clubs, then fill with generic recs
    if len(relevant_clubs_list) < 5:
        
             
        if len(list(favorites_list)) != 0:
            #for each favorited club, recommend 5 clubs (to cover case with 1 favorite)
            for fav_club in favorites_list:
                linkname = list(club_db_df[club_db_df['name'] == fav_club]['link_name'])[0]
                rec = recommend(distance_table, club_db_df, linkname, 5)
                new_recs += rec

            #make sure there are no duplicate clubs
            unique_new = list(set(new_recs))
            new_not_sim = list(np.setdiff1d(unique_new,final_recs))


            #for each empty space in relevant_clubs
            num_sampled = 5-len(relevant_clubs_list)
            final_recs += random.sample(new_not_sim, num_sampled)

        #if there is no favorites list, suggest random clubs within interests
        else:
            for i in interests_list:
                clubs_w_interests = []

                #for each club
                for club in range(len(club_db_df)):

                    #if interest tag is equal to the club tags
                    if i in club_db_df['tags'][club]:
                        clubs_w_interests += [club_db_df['link_name'][club]]
                        
                #make sure no duplicates    
                unique_new = list(set(clubs_w_interests))
                new_not_sim = list(np.setdiff1d(unique_new,final_recs))

            num_sampled = 5-len(relevant_clubs_list)
            final_recs += random.sample(new_not_sim, num_sampled)
            
    return final_recs
    
        
fill_with_generic(relevant_recs['favorites'][41], relevant_recs['interests'][41], relevant_recs['relevant_clubs'][41])

['latinx-student-association-at-berkeley',
 'berkeley-chinese-students-and-scholars-association',
 'ra-on',
 'ballet-folklorico-reflejos-de-mexico',
 'team-hbv-']

In [522]:
relevant_recs['filled_relevant_clubs'] = relevant_recs.apply(lambda row : fill_with_generic(row['favorites'], row['interests'], row['relevant_clubs']), axis = 1)
relevant_recs.head()

Unnamed: 0,user_id,favorites,recs,interests,relevant_clubs,filled_relevant_clubs
0,0,"(Her Campus at Berkeley,)","[Cal Seismic Design Team, Berkeley Innovation,...",[15],[],"[caliber-magazine, caravan-magazine, the-golde..."
1,1,"(Web Development at Berkeley,)","[Berkeley Innovation, Berkeley Legends, SENDfo...",[22],[Berkeley Legends],"[berkeley-legends, upsilon-pi-epsilon-(upe), d..."
2,2,"(DataStory, Berkeley Phi Beta Lambda, The Berk...","[Cal Dragon Boat, Cal Triathlon, UC Rally Comm...",[6],[],"[minecraft-at-berkeley, the-berkeley-group, mi..."
3,3,"(Software and Hardware Recruiting List, Politi...","[Extended Reality at Berkeley, REACH! Asian an...",[22],"[Extended Reality at Berkeley, FemTech]","[extended-reality-at-berkeley, femtech, mdb, c..."
4,4,"(Cal RoboBears, Latinx Student Association at ...","[Launchpad, Cal Triathlon, UC Rally Committee,...","[13, 22, 7]","[Launchpad, Blockchain at Berkeley, Political ...","[launchpad, blockchain-at-berkeley, political-..."


### Feasibility Test: do recommendations look related to simulated favorites?
All recommendations look much better and are within each user's interests!

In [523]:
#club2 = random.randint(0,len(sample_users))
#print (club2)
print('Favorites: ', relevant_recs['favorites'][club2],' Recommendations: ', relevant_recs['filled_relevant_clubs'][club2])

Favorites:  ("American Medical Women's Association (AMWA) at Berkeley", 'Ballet Folklorico Reflejos de Mexico')  Recommendations:  ['latinx-student-association-at-berkeley', 'mcbcdna', 'ra-on', 'peer-health-exchange', 'berkeley-chinese-students-and-scholars-association']


In [525]:
all_club_tags_df

Unnamed: 0,id,name
0,0,Advocacy
1,1,ASUC
2,2,Business
3,3,CalGreek
4,4,Community Service
5,5,Computer Science
6,6,Consulting
7,7,Cultural
8,8,Design
9,9,Engineering


### Final DataFrame

In [526]:
final_personalized_recs = relevant_recs.drop(['recs', 'relevant_clubs'], axis=1).rename({'filled_relevant_clubs':'personalized recs'}, axis=1)
final_personalized_recs

Unnamed: 0,user_id,favorites,interests,personalized recs
0,0,"(Her Campus at Berkeley,)",[15],"[caliber-magazine, caravan-magazine, the-golde..."
1,1,"(Web Development at Berkeley,)",[22],"[berkeley-legends, upsilon-pi-epsilon-(upe), d..."
2,2,"(DataStory, Berkeley Phi Beta Lambda, The Berk...",[6],"[minecraft-at-berkeley, the-berkeley-group, mi..."
3,3,"(Software and Hardware Recruiting List, Politi...",[22],"[extended-reality-at-berkeley, femtech, mdb, c..."
4,4,"(Cal RoboBears, Latinx Student Association at ...","[13, 22, 7]","[launchpad, blockchain-at-berkeley, political-..."
...,...,...,...,...
95,95,"(Azaad Premiere Dance,)",[7],"[latinx-student-association-at-berkeley, ra-on..."
96,96,(Perennial: The Undergraduate Environmental Jo...,"[18, 0]","[student-premed-advising-network-[span], pre-p..."
97,97,"(Peer Health Exchange,)","[14, 0]","[united-nations-association-of-berkeley, latin..."
98,98,"(ATP at Berkeley, Phoenix Consulting Group, SE...","[15, 4]","[mcbcdna, atp-at-berkeley, uc-berkeley-nsslha,..."
