In [1]:
import os
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Using OS library to call CLI commands in Python
#os.system("snscrape --jsonl --max-results 500 --since 2020-10-25 twitter-search '#BlackInPhysicsRollCall until:2020-10-31' > Black_in_Physics.json")
#os.system("snscrape --jsonl --max-results 500 --since 2020-11-16 twitter-search '#BlackInDataRollCall until:2020-11-21' > Black_in_Data.json")
#os.system("snscrape --jsonl --max-results 500 --since 2020-08-09 twitter-search '#BlackInChemRollCall until:2020-08-15' > Black_in_Chem.json")

In [3]:
BiP_df = pd.read_json('Black_in_Physics.json', lines=True)
BiD_df = pd.read_json('Black_in_Data.json', lines=True) 
BiC_df = pd.read_json('Black_in_Chem.json', lines=True)

Add category of research (Physics, Chemistry, or Data) to panda dataframes

In [4]:
BiP_df['research'] = 'Physics'
BiC_df['research'] = 'Chemistry'
BiD_df['research'] = 'Data'

Combine all of the data frames together

In [5]:
Blackin_df = pd.concat([BiP_df, BiC_df, BiD_df], ignore_index = True)
content = Blackin_df['content']

Drop any rows that don't have introduction words

In [6]:
introduction = ["Hey", "Hello", "Hi", "I am", " I\'m ", "name", "check in", "research", "degree", "pursuing"]

In [7]:
i = 0
drop_idx = []
for idx in range(len(content)):
    if any(word in content[idx] for word in introduction):
        i+=1
    else:
        drop_idx.append(idx)

In [8]:
df = Blackin_df.drop(drop_idx, axis=0).reset_index(drop=True)

### Create new colums based on information we want to find

In [9]:
df['Key_words'] = ''
df['Name'] = ''
df['Description'] = ''
df['username'] = ''

In [10]:
r = Rake()
#save keywords from tweets
for index,row in df.iterrows():
    r.extract_keywords_from_text(row['content'])
    key_words_dict_scores = r.get_ranked_phrases()
    row['Key_words'] = list(key_words_dict_scores)
    df['Key_words'][index] = row['Key_words']
    
    r.extract_keywords_from_text(row['user']['displayname'])
    key_words_dict_scores = r.get_ranked_phrases()
    row['Name'] = list(key_words_dict_scores)
    df['Name'][index] = row['Name']
    
    r.extract_keywords_from_text(row['user']['username'])
    key_words_dict_scores = r.get_ranked_phrases()
    row['username'] = list(key_words_dict_scores)
    df['username'][index] = row['username']
    
    r.extract_keywords_from_text(row['user']['description'])
    key_words_dict_scores = r.get_ranked_phrases()
    row['Description'] = list(key_words_dict_scores)
    df['Description'][index] = row['Description']
    
    r.extract_keywords_from_text(row['research'])
    key_words_dict_scores = r.get_ranked_phrases()
    row['research'] = list(key_words_dict_scores)
    df['research'][index] = row['research']

In [11]:
df.head()

Unnamed: 0,_type,url,date,content,renderedContent,id,user,replyCount,retweetCount,likeCount,...,mentionedUsers,coordinates,place,hashtags,cashtags,research,Key_words,Name,Description,username
0,snscrape.modules.twitter.Tweet,https://twitter.com/n3ssa_girl4/status/1322221...,2020-10-30 16:59:13+00:00,My name is Dr. Vanessa A. Sanders and i am #Bl...,My name is Dr. Vanessa A. Sanders and i am #Bl...,1322221533260034048,"{'_type': 'snscrape.modules.twitter.User', 'us...",1,12,46,...,"[{'_type': 'snscrape.modules.twitter.User', 'u...",,,"[BlackInNuclear, BlackInPhysicsWeek, BlackInPh...",,[physics],"[staff scientist, medical applications, brookh...",[vanessa sanders ph],[hbcu grad florida native super nerd aspiring ...,[n3ssa_girl4]
1,snscrape.modules.twitter.Tweet,https://twitter.com/ScienceZemen/status/132221...,2020-10-30 16:30:00+00:00,The moment when you are late to #BlackInPhysic...,The moment when you are late to #BlackInPhysic...,1322214177679855616,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,9,57,...,"[{'_type': 'snscrape.modules.twitter.User', 'u...",,,"[BlackInPhysicsWeek, BlackInPhysicsRollCall, B...",,[physics],"[blackinphysics https ://, standing strong, po...",[zemen sarah berhe],"[tigray ✊🏾, ✡️ 📧: zemenberhe0, swimmer 🏊🏾‍♀️, ...",[sciencezemen]
2,snscrape.modules.twitter.Tweet,https://twitter.com/JoshuaABurrow/status/13221...,2020-10-30 13:38:19+00:00,"I am Joshua Burrow, a Ph.D. cand in Electro-Op...","I am Joshua Burrow, a Ph.D. cand in Electro-Op...",1322170972607229952,"{'_type': 'snscrape.modules.twitter.User', 'us...",3,20,92,...,"[{'_type': 'snscrape.modules.twitter.User', 'u...",,,"[BlackInPhysicsRollCall, BlackinPhysics]",,[physics],"[regular ole fella, blackinphysics https ://, ...",[joshua burrow],"[nasemfordfellow dissertation, morehouse man, ...",[joshuaaburrow]
3,snscrape.modules.twitter.Tweet,https://twitter.com/Reneehortonphd/status/1322...,2020-10-30 10:23:00+00:00,I'm checking in for the #BlackInPhysicsRollCal...,I'm checking in for the #BlackInPhysicsRollCal...,1322121821936824320,"{'_type': 'snscrape.modules.twitter.User', 'us...",1,40,147,...,"[{'_type': 'snscrape.modules.twitter.User', 'u...",{'_type': 'snscrape.modules.twitter.Coordinate...,"{'_type': 'snscrape.modules.twitter.Place', 'f...","[BlackInPhysicsRollCall, BlackInPhysics]",,[physics],"[black physicist looks like, past president, m...","[renee horton, phd]","[nsbp madame president 2016, dr h explores, un...",[reneehortonphd]
4,snscrape.modules.twitter.Tweet,https://twitter.com/BelayViktor/status/1321902...,2020-10-29 19:49:42+00:00,"Hey, I’m Viktor and I’m checking in for #Black...","Hey, I’m Viktor and I’m checking in for #Black...",1321902048091250689,"{'_type': 'snscrape.modules.twitter.User', 'us...",1,16,73,...,"[{'_type': 'snscrape.modules.twitter.User', 'u...",,,"[BlackInPhysicsRollCall, BlackinSquishyPhysics]",,[physics],[sloan_kettering studying membrane protein fun...,[viktor belay],"[biophysics phd student, americanu alum, weill...",[belayviktor]


### Create list of search words from descriptor columns for each person

In [12]:
df['search_words'] = ''

In [13]:
columns = ['research', 'Key_words', 'Description', 'hashtags']
for index, row in df.iterrows():
    words = ''
    for col in columns:
        words += ' '.join(row[col]) + ' '
    row['search_words'] = words
    df['search_words'][index] = row['search_words']

In [14]:
new_df = df[['Name', 'username', 'search_words']]
new_df['name_user'] = new_df['Name'] + new_df['username'] 

In [15]:
new_df.head()

Unnamed: 0,Name,username,search_words,name_user
0,[vanessa sanders ph],[n3ssa_girl4],physics staff scientist medical applications b...,"[vanessa sanders ph, n3ssa_girl4]"
1,[zemen sarah berhe],[sciencezemen],physics blackinphysics https :// standing stro...,"[zemen sarah berhe, sciencezemen]"
2,[joshua burrow],[joshuaaburrow],physics regular ole fella blackinphysics https...,"[joshua burrow, joshuaaburrow]"
3,"[renee horton, phd]",[reneehortonphd],physics black physicist looks like past presid...,"[renee horton, phd, reneehortonphd]"
4,[viktor belay],[belayviktor],physics sloan_kettering studying membrane prot...,"[viktor belay, belayviktor]"


### Given a list of keywords, find the people with at least 2 matching keywords. Recommend based on highest number of matches between people and keywords

In [33]:
def get_jaccard_sim(str1, str2):
    a = set(str1)
    b = set(str2)
    c = b.intersection(a)
    return(float(len(c)) / (len(a) + len(b) - len(c)))

In [34]:
def recommender(keywords, number_of_hits=2, keyword_number=3):
    '''
    keywords: sets the keywords used to search for similar people
    number_of_hits: sets the number of people that the code will return. default = 2
    keyword_number: minimum number of keywords to match for each person. default = 3
    '''

    keywords = keywords.replace(' ','').split(',')
    jaccards = []
    for search in new_df['search_words']:
        matches = {x for x in keywords if x in search}
        if len(matches) >= keyword_number: 
            jaccards.append(get_jaccard_sim(keywords,list(search.split(' '))))
        else:
            jaccards.append(0.0)
    
    jaccards = pd.Series(jaccards)
    j_index = jaccards.nlargest(number_of_hits+1).index
    high_matches = new_df.loc[j_index]
    drop_idx = []
    for i in j_index:
        if jaccards[i] == np.float64(0.00):
            drop_idx.append(i)
    j_index = j_index.drop(drop_idx)
    
    high_matches = new_df.loc[j_index]

    matches = []
    scores = []
    for match,score in zip(zip(high_matches['Name'],high_matches['username']),jaccards[j_index]):
        matches.append(match)
        scores.append(score)
    return matches, scores

In [35]:
recommender('physics, eccentricity, gravitational, waves, soccer, phd, candidate, eccentric', 5,4)

([(['cheeseboro 🇹🇹🖤🤍💜 🏳️\u200d🌈', 'belinda'], ['bdcheeseboro']),
  (['amber lenon', 'dr'], ['amberkiana_'])],
 [0.15789473684210525, 0.15])

415: data, phd, biomedical engineering, rare disease

265: chemistry, nanoparticles, phd candidate, nanotechnology, biological, applications

47: physics, eccentricity, gravitational waves, soccer, candidate, eccentric