# Import packages

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files
import cv2
from sklearn.utils import shuffle
import itertools
import shutil
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from dateutil import parser

# Seed NumPy's global random generator so NumPy-based randomness is repeatable between runs
np.random.seed(101)

In [None]:
# Fetch the NLTK data packages this notebook relies on. Per the recorded output
# they unpack under /root/nltk_data as corpora/stopwords, tokenizers/punkt and
# corpora/wordnet.
# NOTE(review): presumably these back `stopwords`, `word_tokenize` and
# `WordNetLemmatizer` respectively; newer NLTK releases may need extra
# packages - confirm against the installed version.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Download dataset

In [None]:
# Install the kaggle package, which provides the command used below to download the dataset.
# The first line is a quiet install; the second forces a reinstall/upgrade of
# kaggle itself while leaving its dependencies untouched (--no-deps).
! pip install -q kaggle
! pip install --upgrade --force-reinstall --no-deps kaggle

Processing /root/.cache/pip/wheels/68/6d/9b/7a98271454edcba3b56328cbc78c037286e787d004c8afee71/kaggle-1.5.9-cp36-none-any.whl
Installing collected packages: kaggle
  Found existing installation: kaggle 1.5.9
    Uninstalling kaggle-1.5.9:
      Successfully uninstalled kaggle-1.5.9
Successfully installed kaggle-1.5.9


In [None]:
# Run this cell, then upload your "kaggle.json" file when prompted.
# The file holds the API credentials linked to your Kaggle account, which are
# required to download the dataset. Treat it like a password: never commit or share it.

from google.colab import files
# Opens Colab's upload prompt; per the recorded output the file is saved as
# kaggle.json in the current working directory.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
# Put the uploaded credentials file where the kaggle command expects it
# (~/.kaggle/kaggle.json) so the download below is authorised.

# -p: do not fail if ~/.kaggle already exists, so this cell can be re-run safely
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the desired dataset (in the default zip format).
# Per the recorded output the archive for the "data-science-for-good-careervillage"
# competition is saved as data-science-for-good-careervillage.zip in /content.

! kaggle competitions download -c data-science-for-good-careervillage

Downloading data-science-for-good-careervillage.zip to /content
 85% 57.0M/67.3M [00:01<00:00, 47.7MB/s]
100% 67.3M/67.3M [00:01<00:00, 65.7MB/s]


In [None]:
# Unzip the downloaded archive into the current working directory
import zipfile

# The context manager closes the archive once extraction finishes. The handle is
# deliberately not named `zip`, which would shadow the builtin zip() for every
# later cell of the notebook.
with zipfile.ZipFile('data-science-for-good-careervillage.zip') as archive:
  archive.extractall()

# Create dataset 1 (professional topics + response rate + average speed)

In [None]:
#Functions for NLP

def lowercase(input):
  """Return `input` converted to lower case."""
  lowered = input.lower()
  return lowered

def remove_punctuation(input):
  """Return `input` with every character listed in `string.punctuation` deleted."""
  # Translation table with no replacements, only a set of characters to drop
  strip_table = str.maketrans('', '', string.punctuation)
  return input.translate(strip_table)

def remove_whitespaces(input):
  """Collapse every run of whitespace in `input` to one space and trim both ends."""
  # split() with no argument splits on any whitespace and discards empty pieces
  pieces = input.split()
  return " ".join(pieces)
  
def remove_html_tags(input):
  """Return the text content of `input` with its HTML markup removed.

  The string is parsed by BeautifulSoup with the "html.parser" backend and the
  extracted text pieces are joined with a single space as separator.
  """
  return BeautifulSoup(input, "html.parser").get_text(separator=" ")

def tokenize(input):
  """Split `input` into tokens with NLTK's `word_tokenize` and return them."""
  tokens = word_tokenize(input)
  return tokens

def remove_stop_words(input):
  """Return the items of `input` that are not English stop words.

  `input` is iterated as a sequence of tokens; the order and any duplicates of
  the surviving tokens are preserved. Membership is an exact comparison, so a
  token is only dropped when it matches a stop-word entry character for
  character.
  """
  # Fetch the stop-word list once per call rather than once per token, and keep
  # it in a set so each membership check is constant-time.
  english_stop_words = set(stopwords.words("english"))
  return [word for word in input if word not in english_stop_words]

def lemmatize(input):
  """Lemmatize each token of `input` and return the results joined by single spaces.

  The text is tokenized with `word_tokenize`; every token is passed through
  `WordNetLemmatizer.lemmatize` with its default arguments.
  """
  wordnet = WordNetLemmatizer()
  return ' '.join(wordnet.lemmatize(token) for token in word_tokenize(input))

# Try the whole cleaning chain on a single word; as the last expression of the
# cell, its result is displayed as the cell output.
lemmatize(remove_whitespaces(remove_punctuation(remove_html_tags(lowercase('engineering')))))


'engineer'

In [None]:
#Remove potentially duplicate tags from tag list

tags = pd.read_csv('tags.csv')
#Drop tags without a name; .copy() gives an independent frame so the cleaned
#names below are not written into a slice of the raw table
tags = tags[tags['tags_tag_name'].notna()].copy()
old_tags = tags['tags_tag_name'].unique()

print('No. of unique tags before: ',len(old_tags))

#Clean every tag name in one pass, addressing the column by name instead of by
#position: strip HTML, drop punctuation, collapse whitespace, then lemmatize.
#NOTE(review): `lowercase` is not part of this chain (unlike the demo call in
#the previous cell) - presumably the raw tag names are already lower case; confirm.
tags['tags_tag_name'] = tags['tags_tag_name'].map(
  lambda raw_tag: lemmatize(remove_whitespaces(remove_punctuation(remove_html_tags(raw_tag))))
)

unique_tags = tags['tags_tag_name'].unique()
print('No. of unique tags after: ', len(unique_tags))

No. of unique tags before:  16268


  ' Beautiful Soup.' % markup)


No. of unique tags after:  13464


In [None]:
#Create dataframe for dataset using tags
#Empty frame: three fixed summary columns first, then one column per cleaned tag
dataset1 = pd.DataFrame(columns = ['prof_id', 'response_rate', 'speed', *unique_tags])
dataset1

Unnamed: 0,prof_id,response_rate,speed,college,computerscience,computersoftware,business,doctor,engineering,career,medicine,science,engineer,teaching,nursing,psychology,teacher,medical,finance,healthcare,collegemajor,professor,computer,law,nurse,biology,technology,job,education,management,any,professional,sport,accounting,university,lawyer,marketing,art,careercounseling,internship,...,xp,xrayionization,xraytech,xslt,xubuntu,y,yachtchef,yale,yardi,yayabrother,yayy,ymca,yogainstructor,yogatherapy,yolo,yonseiuniversity,youarethefuture,young,youngmom,youngprofesional,youngprofessional,youthadvisor,youthdevelopment,youthemployment,youthengagement,youthpastor,youthprograms,youthservices,yui,zambia,zdijhvgiuasbvmnv,zealand,zeitgeist,zillow,zjz,zombiekilling,zsh,zumba,zynga,零售


In [None]:
#Extract data for each professional
#
#For every professional who wrote at least one answer this cell fills one row of
#`dataset1` with:
#  prof_id        - the professional's id
#  response_rate  - answers written / emails received (0 if no email was sent)
#  speed          - mean whole days between a question being posted and this
#                   professional's own answer to it
#  <tag columns>  - 1 if the professional answered a question carrying that tag, else 0
#
#Requires `tags`, `unique_tags` and the empty `dataset1` from the cells above.

#Read files
professionals = pd.read_csv('professionals.csv')
answers = pd.read_csv('answers.csv')
tag_questions = pd.read_csv('tag_questions.csv')
questions = pd.read_csv('questions.csv')
emails = pd.read_csv('emails.csv')

professionals_with_answers = answers['answers_author_id'].unique()


#Build every lookup once, up front, instead of re-scanning whole tables for
#each professional and each question inside the loop

#question id -> id of the first tag listed for that question
#NOTE(review): only the first tag of each question is used, exactly as before;
#confirm whether all tags of a question should count towards the topics
first_tag_of_question = (
  tag_questions
  .drop_duplicates(subset='tag_questions_question_id', keep='first')
  .set_index('tag_questions_question_id')['tag_questions_tag_id']
  .to_dict()
)

#tag id -> cleaned tag name
tag_name_of_id = (
  tags
  .drop_duplicates(subset='tags_tag_id', keep='first')
  .set_index('tags_tag_id')['tags_tag_name']
  .to_dict()
)

#cleaned tag name -> position of its column among the tag columns
tag_column = {tag: position for position, tag in enumerate(unique_tags)}

#professional id -> number of emails sent to them
emails_received = emails['emails_recipient_id'].value_counts()

#question id -> raw posting timestamp; parsed on first use and cached in
#`question_posted` so each question date is parsed only once
question_posted_raw = (
  questions
  .drop_duplicates(subset='questions_id', keep='first')
  .set_index('questions_id')['questions_date_added']
  .to_dict()
)
question_posted = {}

#sort=False keeps professionals in order of first appearance in answers.csv
answers_by_author = answers.groupby('answers_author_id', sort=False)

#Rows are collected here and turned into a DataFrame once at the end; adding
#rows one at a time to a frame with thousands of columns is extremely slow
topic_flags = np.zeros((answers_by_author.ngroups, len(unique_tags)), dtype=np.uint8)
summary_rows = []


#Loop through each professional's answers to get their topics
count = 0
for prof_id, prof_answers in answers_by_author:

  #Progress indicator (every 1000 professionals, to avoid flooding the output)
  if count % 1000 == 0:
    print(count)

  total_time = 0

  #Note: the builtin zip() is deliberately not used here, because the unzip
  #cell earlier in the notebook may have rebound the name `zip`
  answer_rows = prof_answers[['answers_question_id', 'answers_date_added']]
  for ques, answered_raw in answer_rows.itertuples(index=False, name=None):

    #Flag the topic of this question; untagged questions and tag ids that are
    #absent from `tags` (e.g. dropped for having no name) are skipped
    column = tag_column.get(tag_name_of_id.get(first_tag_of_question.get(ques)))
    if column is not None:
      topic_flags[count, column] = 1

    #Time taken by THIS professional to answer: use the date of their own
    #answer row, not the first answer anyone gave to the question.
    #A question id missing from questions.csv raises KeyError.
    if ques not in question_posted:
      question_posted[ques] = parser.parse(question_posted_raw[ques])
    day_answered = parser.parse(answered_raw)
    total_time += (day_answered - question_posted[ques]).days

  #Find avg time taken for professional to answer questions
  speed = total_time/len(prof_answers)

  #Find response rate of professional
  #In case an email was not sent to a professional, but they still answered a
  #question, the rate is 0
  emails_sent = emails_received.get(prof_id, 0)
  rate = len(prof_answers)/emails_sent if emails_sent else 0

  summary_rows.append((prof_id, rate, speed))
  count += 1


#Assemble the dataset in one go, reusing the column layout of the empty frame
#created above. Tag columns are stored as uint8 0/1 flags.
dataset1 = pd.concat(
  [
    pd.DataFrame(summary_rows, columns=dataset1.columns[:3]),
    pd.DataFrame(topic_flags, columns=dataset1.columns[3:]),
  ],
  axis=1,
)


  



# Alternatively, download the prebuilt data file directly by running the cell below

In [2]:
# Download a file from Google Drive by its file id; per the recorded output it is
# saved as /content/data_part1.csv.
# NOTE(review): presumably this is the dataset produced by the cell above - confirm.
!gdown --id 1yP5ck8mS5klbZ0tvz9dDqi88MWGJbg27

Downloading...
From: https://drive.google.com/uc?id=1yP5ck8mS5klbZ0tvz9dDqi88MWGJbg27
To: /content/data_part1.csv
275MB [00:01, 151MB/s]
