## Clone the repo to get the data, then preprocess the data

In [None]:
! git clone https://github.com/jccf12/career_village_challenge.git

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import random
random.seed(30)



In [2]:
# Read every CSV of the challenge dataset from the cloned repository into a
# module-level DataFrame. Files that are shipped in several parts
# (emails, matches) are concatenated into a single frame.

# Answers and their scores
answers = pd.read_csv("career_village_challenge/data/answers.csv")
answers_scores = pd.read_csv("career_village_challenge/data/answer_scores.csv")


# Emails are split over two files; note that pd.concat keeps each part's own
# row labels, so the combined index is not unique.
emails1 = pd.read_csv("career_village_challenge/data/emails1.csv")
emails2 = pd.read_csv("career_village_challenge/data/emails2.csv")
emails = pd.concat([emails1,emails2])

# Groups and group memberships
group_members = pd.read_csv("career_village_challenge/data/group_memberships.csv")
groups = pd.read_csv("career_village_challenge/data/groups.csv")

# Matches are split over three files
matches1 = pd.read_csv("career_village_challenge/data/matches1.csv")
matches2 = pd.read_csv("career_village_challenge/data/matches2.csv")
matches3 = pd.read_csv("career_village_challenge/data/matches3.csv")
matches = pd.concat([matches1,matches2,matches3])

professionals = pd.read_csv("career_village_challenge/data/professionals.csv")

# Questions and their scores
questions = pd.read_csv("career_village_challenge/data/questions.csv")
question_scores = pd.read_csv("career_village_challenge/data/question_scores.csv")

# School memberships and students
schools_members = pd.read_csv("career_village_challenge/data/school_memberships.csv")
students = pd.read_csv("career_village_challenge/data/students.csv")

# Tags, plus the tag <-> user and tag <-> question link tables
tag_users = pd.read_csv("career_village_challenge/data/tag_users.csv")
tag_questions = pd.read_csv("career_village_challenge/data/tag_questions.csv")
tags = pd.read_csv("career_village_challenge/data/tags.csv")

In [3]:
def get_user_tags(user_id):
    """Return the rows of ``tags`` for the tags followed by ``user_id``.

    Reads the module-level ``tag_users`` and ``tags`` frames.

    Args:
        user_id: value to look up in ``tag_users["tag_users_user_id"]``.

    Returns:
        The subset of ``tags`` whose ``tags_tag_id`` is followed by the user
        (empty when the user follows no tag).
    """
    user_tags = tag_users[tag_users["tag_users_user_id"] == user_id]["tag_users_tag_id"]
    tags_names = tags.loc[tags["tags_tag_id"].isin(user_tags)]
    # Return the filtered rows; returning `tags` here would hand back the
    # whole tag table regardless of the user.
    return tags_names

In [4]:
def get_user_answers(user_id):
    """Return every row of the module-level ``answers`` frame written by ``user_id``.

    The author column is named ``answers_author_id`` in the raw CSV, but the
    preprocessing step later in this file rebinds ``answers`` with that column
    renamed to ``professional_id``. Both spellings are accepted so the helper
    works before and after that rename instead of raising ``KeyError``.

    Args:
        user_id: id of the answer author.

    Returns:
        The matching rows of ``answers`` (empty when the user wrote none).
    """
    if "answers_author_id" in answers.columns:
        author_column = "answers_author_id"
    else:
        author_column = "professional_id"
    return answers.loc[answers[author_column] == user_id]

In [5]:
def match_email_question(email_date):
    """Return the question whose creation time is closest to ``email_date``.

    Args:
        email_date: timestamp string formatted like
            ``"2018-06-01 12:00:00 UTC+0000"``.

    Returns:
        A one-row slice of the module-level ``questions`` frame.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S UTC+0000"

    def parse(raw):
        return datetime.datetime.strptime(raw, timestamp_format)

    sent_at = parse(email_date)
    asked_at = questions["questions_date_added"].apply(parse)
    # Absolute time gap between the email and every question, smallest first.
    gap = (sent_at - asked_at).abs()
    closest_position = gap.argsort().values[:1]
    return questions.iloc[closest_position]

In [6]:
# Build `full_data`: one row per (immediate email, matched question), joined
# with the recipient's answer(s) to that question when there are any.

# Keep only emails sent immediately after a question was posted (ignore the weekly newsletter emails)
immediate_emails = emails[emails["emails_frequency_level"]=="email_notification_immediate"]

# Keep only the matches that belong to those immediate emails
immediate_matches = matches[matches["matches_email_id"].isin(immediate_emails["emails_id"])]
# Keep only the emails that have at least one match (the data is missing the matches of some emails, so we ignore those)
immediate_emails = immediate_emails[immediate_emails["emails_id"].isin(matches["matches_email_id"])]
# Give both frames a common key name ("id") for convenience
immediate_emails = immediate_emails.rename(columns={'emails_id':"id"})
immediate_matches = immediate_matches.rename(columns={'matches_email_id':"id"})
# Attach the matched question id(s) to every email
emails_questions = pd.merge(immediate_emails, immediate_matches, how="left", on="id")
# Use shared column names so emails and answers can be joined.
# NOTE: this rebinds the module-level `answers` with renamed columns, so code
# that still reads "answers_author_id"/"answers_question_id" stops working after this point.
emails_questions = emails_questions.rename(columns = {"matches_question_id": "question_id", "emails_recipient_id": "professional_id"})
answers = answers.rename(columns = {"answers_question_id": "question_id", "answers_author_id": "professional_id"})

# Now build one big frame that holds emails, the questions sent, and the answers. Note that some professionals
# gave more than one answer to the same question, in which case the email id is repeated once per answer.
full_data = pd.merge(emails_questions, answers,how='left',  on=["professional_id","question_id" ])
full_data = full_data.rename(columns = {"id": "email_id"})

# Target variable: 1 when the emailed question has an answer row, 0 otherwise.
# Test for presence directly instead of round-tripping through to_numeric:
# an answer id that happens to parse as a number (all digits, or digits with a
# single "e") would otherwise end up as that number rather than 1.
target = full_data['answers_id'].notna().astype(int)
full_data["q_answered?"] = target

# Parser for the timestamp strings used in the CSV files
def date_vectorizer(x):
    return datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S UTC+0000")

# Email sent date: string -> datetime
full_data["emails_date_sent"] = full_data["emails_date_sent"].apply(date_vectorizer)

# Answer added date: string -> datetime, only for rows that have an answer (NAs are left alone)
has_answer_date = full_data["answers_date_added"].notnull()
indices = full_data.index[has_answer_date].values
parsed_answer_dates = full_data.loc[indices, "answers_date_added"].apply(date_vectorizer)
full_data.loc[indices, "answers_date_added"] = parsed_answer_dates

# Time each professional took to answer, measured from the moment the email was sent
answered_at = pd.to_datetime(full_data["answers_date_added"])
emailed_at = pd.to_datetime(full_data["emails_date_sent"])
full_data["time_taken"] = answered_at - emailed_at

# Some professionals answered before the email reached them, which gives a
# negative time_taken. Treat those as answered immediately (zero delay).
zero_delay = datetime.timedelta(days=0, seconds=0, minutes=0, hours=0)
answered_early = full_data["time_taken"] < datetime.timedelta(days=0)
indices2 = full_data.index[answered_early].values
full_data.loc[indices2, "time_taken"] = zero_delay
# Collect the tag names of every question and attach them to full_data as "q_tags"
question_tag_links = tag_questions.rename(columns={"tag_questions_tag_id": "tag_id"})
tag_lookup = tags.rename(columns={"tags_tag_id": "tag_id"})
tag_questions_names = question_tag_links.merge(tag_lookup, how='left', on=["tag_id"])
tag_questions_names = tag_questions_names.rename(
    columns={"tag_questions_question_id": "question_id", "tags_tag_name": "tag_name"}
)
# One list of tag names per question
questions_tags = (
    tag_questions_names.groupby("question_id")["tag_name"]
    .apply(list)
    .reset_index(name="q_tags")
)
full_data = full_data.merge(questions_tags, how='left', on=["question_id"])

In [7]:
# Build one feature row per professional, keyed by "professional_id".
professionals_dataset = pd.DataFrame(columns = ["professional_id"])
professionals_dataset["professional_id"] = professionals["professionals_id"]

# Response rate: the mean of the 0/1 "q_answered?" flag over the emails sent to
# each professional. numeric_only=True is passed explicitly because full_data
# also holds string/list/date columns, which make mean() raise on pandas >= 2.0.
response = (
    full_data.groupby("professional_id")
    .mean(numeric_only=True)
    .drop(columns = ["email_id"])
    .reset_index()
)
professionals_dataset = pd.merge(professionals_dataset, response,how='left',  on=["professional_id" ])
professionals_dataset = professionals_dataset.rename(columns = {"q_answered?": "response_rate"})

# Average time they took to answer the question, when they did answer
grouped = full_data.groupby('professional_id')["time_taken"]
time_mean = grouped.apply(lambda x: np.mean(x))
professionals_dataset = pd.merge(professionals_dataset, time_mean,how='left',  on=["professional_id" ])
professionals_dataset=  professionals_dataset.rename(columns ={"time_taken": "avg_time_taken"})

# Total number of answers written by each professional (not only those that followed an email).
# The id/count columns are named explicitly through rename_axis/reset_index(name=...)
# because the column names produced by value_counts().reset_index() differ
# between pandas 1.x ("index", <series name>) and 2.x (<series name>, "count").
answers_count = (
    answers["professional_id"]
    .value_counts()
    .rename_axis("professional_id")
    .reset_index(name="number_q_answered")
)
professionals_dataset = pd.merge(professionals_dataset, answers_count,how='left',  on=["professional_id" ])
professionals_dataset["number_q_answered"]= professionals_dataset["number_q_answered"].fillna(0)

# All the tags that each professional is following
all_tags = pd.merge(tag_users.rename(columns = {"tag_users_tag_id": "tag_id"}),tags.rename(columns = {"tags_tag_id": "tag_id"}),how='left',  on=["tag_id" ])
foll_tags = all_tags.groupby('tag_users_user_id')['tags_tag_name'].apply(list).reset_index(name='following_tags').rename(columns = {"tag_users_user_id": "professional_id"})
professionals_dataset = pd.merge(professionals_dataset, foll_tags,how='left',  on=["professional_id" ])

# Tags of the questions found in full_data for each professional
# (summing the per-row lists concatenates them)
a = full_data[["professional_id","q_tags"]][full_data["q_tags"].notnull()].groupby("professional_id")["q_tags"].agg(sum)
professionals_dataset = pd.merge(professionals_dataset, a,how='left',  on=["professional_id" ])
professionals_dataset = professionals_dataset.rename(columns = {"q_tags": "prev_q_tags"})

# Average answer score of each professional (0 when they have no scored answer)
prof_score = pd.merge(answers[["professional_id","answers_id" ]], answers_scores.rename(columns = {"id":"answers_id"}),how='left',  on=["answers_id" ]).drop(columns = ["answers_id"])
score_mean = prof_score.groupby("professional_id").mean().reset_index()
professionals_dataset = pd.merge(professionals_dataset,score_mean,how='left',  on=["professional_id" ]).rename(columns = {"score":"avg_ansrs_score"})
professionals_dataset["avg_ansrs_score"] = professionals_dataset["avg_ansrs_score"].fillna(0)

# Number of groups each professional belongs to
prof_grp = (
    group_members["group_memberships_user_id"]
    .value_counts()
    .rename_axis("professional_id")
    .reset_index(name="num_groups")
)
professionals_dataset = pd.merge(professionals_dataset,prof_grp,how='left',  on=["professional_id" ])

# Number of schools each professional belongs to
prof_schl = (
    schools_members["school_memberships_user_id"]
    .value_counts()
    .rename_axis("professional_id")
    .reset_index(name="num_schools")
)
professionals_dataset = pd.merge(professionals_dataset,prof_schl,how='left',  on=["professional_id" ])

# Answers-to-emails ratio (#answers / #immediate emails received) for each professional
prof_emls = (
    immediate_emails["emails_recipient_id"]
    .value_counts()
    .rename_axis("professional_id")
    .reset_index(name="num_emails")
)
professionals_dataset = pd.merge(professionals_dataset,prof_emls,how='left',  on=["professional_id" ])
professionals_dataset["answrs_emails_ratio"] = professionals_dataset["number_q_answered"] / professionals_dataset["num_emails"]


In [8]:
# Drop the leftover "Unnamed: 0" helper columns picked up through the merges.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
professionals_dataset = professionals_dataset.drop(
    columns=['Unnamed: 0_x', 'Unnamed: 0_y'], errors='ignore'
)
professionals_dataset.head(5)

Unnamed: 0,professional_id,response_rate,avg_time_taken,number_q_answered,following_tags,prev_q_tags,avg_ansrs_score,num_groups,num_schools,num_emails,answrs_emails_ratio
0,9ced4ce7519049c0944147afb75a8ce3,,NaT,1.0,,,5.0,,,,
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,NaT,0.0,,,0.0,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,0.0,NaT,39.0,"[consulting, education, consulting, education,...","[guidance-counselor, school-counselor, school,...",1.974359,,,70.0,0.557143
3,977428d851b24183b223be0eb8619a8c,,NaT,23.0,,,1.478261,,,,
4,e2d57e5041a44f489288397c9904c2b2,,NaT,0.0,,,0.0,,,,


## NLP Functions

In [9]:
#Import functions for NLP
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from dateutil import parser
import nltk
# Download the NLTK resources needed by the NLP helpers in this file:
# the stop-word lists, the "punkt" tokenizer data and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
#Functions for NLP


def lowercase(input):
  """
  Convert every character of the given text to lower case and return it
  """
  lowered = input.lower()
  return lowered

def remove_punctuation(input):
  """
  Strip every character listed in string.punctuation from the given text
  """
  # Translation table that deletes each punctuation character
  drop_punctuation = str.maketrans('', '', string.punctuation)
  return input.translate(drop_punctuation)

def remove_whitespaces(input):
  """
  Collapse every run of whitespace into one space and trim both ends
  """
  pieces = input.split()
  return " ".join(pieces)
  
def remove_html_tags(input):
  """
  Parse the text as HTML and return only its text content,
  with the text pieces joined by a single space
  """
  return BeautifulSoup(input, "html.parser").get_text(separator=" ")

def tokenize(input):
  """
  Split the text into tokens with NLTK's word_tokenize and return them
  """
  tokens = word_tokenize(input)
  return tokens

def remove_stop_words(input):
  """
  Tokenizes the text and returns the list of tokens that are not
  English stop words (the result is a list of tokens, not a string)
  """
  # Build the stop-word set once per call: the original rebuilt the whole
  # stop-word list for every token, and list membership is a linear scan.
  stop_words = set(stopwords.words('english'))
  return [word for word in word_tokenize(input) if word not in stop_words]

def lemmatize(input):
  """
  Tokenizes the text, lemmatizes each token with NLTK's WordNetLemmatizer
  and returns the lemmas joined by single spaces
  """
  wordnet = WordNetLemmatizer()
  return ' '.join(wordnet.lemmatize(token) for token in word_tokenize(input))


def nlp_pipeline(input):
  """
  Runs the full text clean-up on a given text, in this order:
  lowercase -> strip HTML -> strip punctuation -> normalise whitespace
  -> drop stop words -> lemmatize. Returns the cleaned text as one string.
  """
  text = lowercase(input)
  text = remove_html_tags(text)
  text = remove_punctuation(text)
  text = remove_whitespaces(text)
  kept_tokens = remove_stop_words(text)
  return lemmatize(' '.join(kept_tokens))


## LDA Function

In [11]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Put every existing tag name into a set so that membership checks and set
# intersections against the known tags are fast
unique_tags = set(tags['tags_tag_name'])

def find_topics(question_body):
  """
  Finds the two most important words of a text with a one-topic LDA model
  and returns those that already exist as tags.

  The text is cleaned with nlp_pipeline, vectorized with CountVectorizer and
  fitted with a single-topic LDA; the two highest-weighted words of that
  topic are intersected with the module-level ``unique_tags`` set.

  Returns a set of tag names. When the text cannot be processed (for example
  it is empty after cleaning, or is not a string) the text is printed and an
  EMPTY SET is returned, so that callers can always union the results.
  """
  try:
    text = nlp_pipeline(question_body)
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform([text])
    # One topic that has an avg of two words because most questions had 1/2 tags
    number_topics = 1
    number_words = 2
    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for old installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
      words = count_vectorizer.get_feature_names_out()
    else:
      words = count_vectorizer.get_feature_names()

    # Highest-weighted words of every topic, e.g. {'military', 'army'}
    topics = {
      str(words[i])
      for topic in lda.components_
      for i in topic.argsort()[:-number_words - 1:-1]
    }
    # Only use topics for which a tag already exists
    existing_topics = topics & unique_tags

  # A few bodies don't work with LDA, so they are printed and ignored.
  # `except Exception` (not a bare except) keeps Ctrl-C working during the
  # long-running apply over all questions/answers.
  except Exception:
    print(question_body)
    return set()

  return existing_topics

### Filter questions data for LDA function

In [35]:
# Start from the answers table without the answer date and body,
# so that what remains identifies each answer's author and question:
unneeded_columns = ["answers_date_added", "answers_body"]
my_answers_authors = answers.drop(unneeded_columns, axis=1)
my_answers_authors.head(1)

Unnamed: 0,answers_id,professional_id,question_id
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54


In [36]:
# Take the question id and body, naming the id column "question_id" so it matches the answers table:
question_columns = questions.loc[:, ["questions_id", "questions_body"]]
my_questions_body = question_columns.rename(columns={"questions_id": "question_id"})
my_questions_body.head(1)

Unnamed: 0,question_id,questions_body
0,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...


In [37]:
# Attach the question body to every answer (left join on the question id):
add_them = my_answers_authors.merge(my_questions_body, how='left', on=["question_id"])
add_them.head(1)

Unnamed: 0,answers_id,professional_id,question_id,questions_body
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,What is a maths teacher? what is a ma...


In [38]:
# Remove the id columns that are no longer needed:
our_final_data = add_them.drop(["answers_id", "question_id"], axis=1)
our_final_data.head(1)

Unnamed: 0,professional_id,questions_body
0,36ff3b3666df400f956f8335cf53e09e,What is a maths teacher? what is a ma...


### Apply LDA Function on questions

In [None]:
# Run find_topics on every question body to get the new tags:
question_topics = our_final_data["questions_body"].apply(find_topics)
new_tags = question_topics.to_numpy()
# Store them as a new column:
our_final_data["new_tags"] = new_tags


In [39]:
# Disabled alternative, kept as a string literal so it does not run: load the
# precomputed tags from 'store.pckl' instead of re-running find_topics.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# enable this for a pickle file you produced yourself.
"""
Alternatively just import the results from a pickle file
import pickle
f = open('store.pckl', 'rb')
new_tags = pickle.load(f)
f.close()
our_final_data["new_tags"] = new_tags
"""

In [40]:
# Gather the tags of each professional into one list. Every answered question
# contributes its own entry, so a professional can end up with many tags.
tags_per_professional = our_final_data.groupby("professional_id")["new_tags"]
our_final_data = tags_per_professional.apply(list).reset_index()


In [18]:
# Flatten each professional's list of tag collections into one list of unique
# tags. Entries that are not tag collections (e.g. a raw text or NaN left by a
# failed topic extraction) are skipped: set.union(*x) would either raise on
# them or silently add the individual characters of a string as "tags".
our_final_data['new_tags'] = our_final_data['new_tags'].apply(
    lambda tag_sets: list(
        set().union(*(t for t in tag_sets if isinstance(t, (set, frozenset, list, tuple))))
    )
)

In [19]:
# Finally, bring the new tags into the professionals dataset. The left merge
# keeps one row per professional in the same order, so `toy` lines up with
# professionals_dataset row by row.
toy = pd.merge(professionals_dataset,our_final_data,how='left',  on=["professional_id" ])

# Where a professional has new tags they REPLACE prev_q_tags; rows without new
# tags keep their previous value (same result as Series.update).
# Assign the column explicitly: `professionals_dataset[col].update(...)` is a
# chained in-place write, which does not reach the DataFrame under pandas
# Copy-on-Write.
# NOTE(review): the lists are overwritten, not unioned — confirm that is intended.
new_tags_aligned = toy.pop('new_tags')
professionals_dataset['prev_q_tags'] = new_tags_aligned.where(
    new_tags_aligned.notna(), professionals_dataset['prev_q_tags']
)


In [20]:
# Preview the first five rows of the professionals dataset after the tag update:
professionals_dataset.head(5)

Unnamed: 0,professional_id,response_rate,avg_time_taken,number_q_answered,following_tags,prev_q_tags,avg_ansrs_score,num_groups,num_schools,num_emails,answrs_emails_ratio
0,9ced4ce7519049c0944147afb75a8ce3,,NaT,1.0,,"[resume, consulting]",5.0,,,,
1,f718dcf6d2ec4cb0a52a9db59d7f9e67,,NaT,0.0,,,0.0,,,,
2,0c673e046d824ec0ad0ebe012a0673e4,0.0,NaT,39.0,"[consulting, education, consulting, education,...","[judge, resume, anesthesiologist, want, year, ...",1.974359,,,70.0,0.557143
3,977428d851b24183b223be0eb8619a8c,,NaT,23.0,,"[investigation, want, make, wondering, major, ...",1.478261,,,,
4,e2d57e5041a44f489288397c9904c2b2,,NaT,0.0,,,0.0,,,,


### Filter answers data for LDA

In [77]:
# Start from the answers table without the answer date and body,
# so that what remains identifies each answer's author and question:
unneeded_columns = ["answers_date_added", "answers_body"]
my_answers_authors = answers.drop(unneeded_columns, axis=1)
my_answers_authors.head(1)

Unnamed: 0,answers_id,professional_id,question_id
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54


In [78]:
# Take the answer id together with the answer body:
my_answers_body = answers.loc[:, ["answers_id", "answers_body"]].copy()
my_answers_body.head(1)

Unnamed: 0,answers_id,answers_body
0,4e5f01128cae4f6d8fd697cec5dca60c,<p>Hi!</p>\n<p>You are asking a very interesti...


In [79]:
# Attach the answer body to every answer row (left join on the answer id):
add_them = my_answers_authors.merge(my_answers_body, how='left', on=["answers_id"])
add_them.head(1)

Unnamed: 0,answers_id,professional_id,question_id,answers_body
0,4e5f01128cae4f6d8fd697cec5dca60c,36ff3b3666df400f956f8335cf53e09e,332a511f1569444485cf7a7a556a5e54,<p>Hi!</p>\n<p>You are asking a very interesti...


In [80]:
# Remove the id columns that are no longer needed:
our_final_data = add_them.drop(["answers_id", "question_id"], axis=1)
our_final_data.head(1)

Unnamed: 0,professional_id,answers_body
0,36ff3b3666df400f956f8335cf53e09e,<p>Hi!</p>\n<p>You are asking a very interesti...


### Apply LDA function on answers data

In [None]:
# Run find_topics on every answer body to get the new tags:
answer_topics = our_final_data["answers_body"].apply(find_topics)
new_tags = answer_topics.to_numpy()

# Store them as a new column:
our_final_data["new_tags"] = new_tags


1
<p><br></p>
<p><br></p>
<p><br></p>
<p><br></p>
<p>Not at all.</p>
<p>No</p>
<p>Whatever you do, do it well!</p>
<p>No.</p>
<p>This just in - no </p>
<p>C and c++. </p>
<p><br></p>
<p>																																								</p>
I do


In [68]:
# Disabled alternative, kept as a string literal so it does not run: load the
# precomputed tags from 'store_answers.pckl' instead of re-running find_topics.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# enable this for a pickle file you produced yourself.
"""
#Alternatively just import the results from a pickle file
import pickle
f = open('store_answers.pckl', 'rb')
new_tags = pickle.load(f)
f.close()
our_final_data["new_tags"] = new_tags
"""

In [55]:
# Gather the tags of each professional into one list. Every answer they wrote
# contributes its own entry, so a professional can end up with many tags.
tags_per_professional = our_final_data.groupby("professional_id")["new_tags"]
our_final_data = tags_per_professional.apply(list).reset_index()

In [None]:
# Flatten each professional's list of tag collections into one list of unique
# tags. Entries that are not tag collections (e.g. a raw text or NaN left by a
# failed topic extraction) are skipped: set.union(*x) would either raise on
# them or silently add the individual characters of a string as "tags".
our_final_data['new_tags'] = our_final_data['new_tags'].apply(
    lambda tag_sets: list(
        set().union(*(t for t in tag_sets if isinstance(t, (set, frozenset, list, tuple))))
    )
)

In [None]:
# Finally, bring the new tags into the professionals dataset. The left merge
# keeps one row per professional in the same order, so `toy` lines up with
# professionals_dataset row by row.
toy = pd.merge(professionals_dataset,our_final_data,how='left',  on=["professional_id" ])

# Where a professional has new tags they REPLACE following_tags; rows without
# new tags keep their previous value (same result as Series.update).
# Assign the column explicitly: `professionals_dataset[col].update(...)` is a
# chained in-place write, which does not reach the DataFrame under pandas
# Copy-on-Write.
# NOTE(review): the lists are overwritten, not unioned — confirm that is intended.
new_tags_aligned = toy.pop('new_tags')
professionals_dataset['following_tags'] = new_tags_aligned.where(
    new_tags_aligned.notna(), professionals_dataset['following_tags']
)

In [None]:
# Preview the first five rows of the professionals dataset after the tag update:
professionals_dataset.head(5)

-------------------

# Clustering model

In [None]:
#Download ~800mb spacy model because it is MUCH more accurate at semantic similarity
!python -m spacy download en_core_web_lg


In [None]:
# Import the en_core_web_lg spaCy model package and load it; `nlp` is the
# loaded pipeline whose token vectors are used for the similarity/clustering
# steps that follow
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
# Turn every tag word into a spaCy vector for the clustering model.

# Drop missing tag names by value: deleting a hard-coded position (3871)
# silently removes the wrong tag as soon as the tags file changes.
tag_list = tags['tags_tag_name'].dropna().tolist()

# Get rid of hyphens so that each part of a hyphenated tag becomes its own word
corpus = ' '.join(tag_list).replace('-',' ')
words = corpus.split()
# De-duplicate while keeping first-seen order. dict.fromkeys gives the same
# result as sorted(set(words), key=words.index) without the quadratic scan.
corpus = " ".join(dict.fromkeys(words))

# Apply the model on our dataset of tags
tokens = nlp(corpus)

# One vector per token, stacked into a 2-D array for the clustering model
word_vectors = np.array([token.vector for token in tokens])

In [None]:
# Cluster the tag vectors with DBSCAN
from sklearn.cluster import DBSCAN

# Cosine metric because spacy uses cosine similarity; min_samples = 2 because a
# cluster should contain at least 2 similar words
dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=2)
dbscan.fit(word_vectors)

In [None]:
#Function for returning label prediction since there is no builtin function
def dbscan_predict(model, X):
    """Assign each row of ``X`` to the cluster of its nearest core sample.

    Distances are measured with the metric the model was fitted with: cosine
    distance when ``model.metric == "cosine"``, Euclidean distance otherwise
    (also when the model has no ``metric`` attribute). Measuring a
    cosine-fitted model with Euclidean distance compares ``eps`` against the
    wrong scale and labels almost everything as noise.

    Args:
        model: fitted DBSCAN-like object exposing ``components_``,
            ``core_sample_indices_``, ``labels_`` and ``eps``.
        X: 2-D array-like, one sample per row.

    Returns:
        Integer array with one label per row of ``X``; ``-1`` when no core
        sample lies strictly closer than ``model.eps``.
    """
    X = np.asarray(X)
    nr_samples = X.shape[0]
    y_new = np.full(nr_samples, -1, dtype=int)

    core_points = np.asarray(model.components_)
    if nr_samples == 0 or core_points.shape[0] == 0:
        return y_new

    use_cosine = getattr(model, "metric", "euclidean") == "cosine"
    if use_cosine:
        # Normalise the core samples once. Zero vectors keep a norm of 1 so
        # they stay all-zero, i.e. similarity 0 / distance 1 to everything.
        core_norms = np.linalg.norm(core_points, axis=1)
        core_norms[core_norms == 0] = 1.0
        unit_cores = core_points / core_norms[:, np.newaxis]

    for i in range(nr_samples):
        if use_cosine:
            sample_norm = np.linalg.norm(X[i])
            unit_sample = X[i] / sample_norm if sample_norm > 0 else X[i]
            dist = 1.0 - unit_cores @ unit_sample  # cosine distance
        else:
            dist = np.linalg.norm(core_points - X[i], axis=1)  # Euclidean distance

        shortest_dist_idx = np.argmin(dist)

        if dist[shortest_dist_idx] < model.eps:
            y_new[i] = model.labels_[model.core_sample_indices_[shortest_dist_idx]]

    return y_new


In [None]:
# Build the spaCy vectors of a few sample words to try the clustering on
test_words = ' '.join(['university', 'colleges', 'education', 'courses']).replace('-', ' ')
test_tokens = nlp(test_words)

# One vector per token, stacked into a 2-D array
test_vectors = np.array([token.vector for token in test_tokens])

In [None]:
# Predict the cluster label of the four sample words and print each one
sample_labels = dbscan_predict(dbscan, test_vectors[:4])
print('Label for university:' + str(sample_labels[0]))
print('Label for colleges:' + str(sample_labels[1]))
print('Label for education:' + str(sample_labels[2]))
print('Label for courses:' + str(sample_labels[3]))