In [903]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords

import string
from collections import defaultdict

df = pd.read_csv("Superpowers.csv")
# df.head()

In [904]:
# remove unwanted fields
df = df.drop(['Preferred First Name (Required)',
'Last Name / Family Name (Required)',
'Cornell Tech Email (Required)',
'Slack (Full Name) (Required)',
'Where are you in the Spring Studio teaming process?',
'What have you teamed around? '], axis=1)

In [905]:
df = df.rename(columns={
						"Cornell Tech Degree Program (Required)": "degree",
						"Professional Bio": "bio",
						"Core Teaming Objectives": "core_obj",
						"Skillsets": "skill",
						"Seeking Skillsets": "seeking_skill",
						"Most compelling problems, industries, etc": "industry",
						"Hobbies, Interests, etc": "hobby",
						"Technical Talent (Yes/No)": "tech_talent",
						"Hardware Solution": "hardware",
						"Spinning Out": "spin_out",
						"Spring Studio Preference": "spring_studio",
						})

In [906]:
# df.head()

In [907]:
# process Degree
# for i in df.degree.unique():
# 	print(i)

df["degree"] = df["degree"].apply(lambda x: x.split()[0][:-1])

degree_dict = {str(deg):i for i, deg in enumerate(df.degree.unique())}

# 'CM': 0
# 'ORIE': 1
# 'DESIGN': 2
# 'MBA': 3
# 'AAP': 4
# 'CS': 5
# 'LLM': 6
# 'HT': 7
# 'UT': 8
# 'ECE': 9

# convert to onehot
df["degree_0"] = df["degree"].apply(lambda x: int(degree_dict[x]==0))
df["degree_1"] = df["degree"].apply(lambda x: int(degree_dict[x]==1))
df["degree_2"] = df["degree"].apply(lambda x: int(degree_dict[x]==2))
df["degree_3"] = df["degree"].apply(lambda x: int(degree_dict[x]==3))
df["degree_4"] = df["degree"].apply(lambda x: int(degree_dict[x]==4))
df["degree_5"] = df["degree"].apply(lambda x: int(degree_dict[x]==5))
df["degree_6"] = df["degree"].apply(lambda x: int(degree_dict[x]==6))
df["degree_7"] = df["degree"].apply(lambda x: int(degree_dict[x]==7))
df["degree_8"] = df["degree"].apply(lambda x: int(degree_dict[x]==8))
df["degree_9"] = df["degree"].apply(lambda x: int(degree_dict[x]==9))

df = df.drop(["degree"], axis=1)

In [908]:
# process bio
def sentence_cleaner(sentence):
	ps = PorterStemmer()

	# to lowercase, remove punctuation and number, only keep alphabet
	sentence = sentence.lower()
	sentence = sentence.translate(str.maketrans('', '', string.punctuation))
	sentence = [c for c in sentence if c in string.ascii_lowercase+" "]
	sentence = [c for c in sentence if c not in "1234567890"]
	sentence = "".join(sentence)

	# stem
	words = [ps.stem(w) for w in sentence.lower().split()]

	# remove stopwords
	words = [w for w in words if w not in stopwords.words('english')]

	return words

# apply cleaning 
df["bio_clean"] = df["bio"].apply(sentence_cleaner)
df = df.drop(["bio"], axis=1)

def get_top_words(n, col):
	# n: number of most frequent words
	# col: the target column 
	word_count = defaultdict(int)
	for bio in col:
		for word in bio:
			word_count[word] += 1

	# only take top n common words
	word_count = sorted(word_count.items(), key=lambda item: item[1], reverse=True)[:n]
	top_words  = [i for (i, _) in word_count]
	return top_words

get_top_words(100, df["bio_clean"])

['work',
 'cornel',
 'tech',
 'ha',
 'engin',
 'data',
 'scienc',
 'experi',
 'develop',
 'year',
 'univers',
 'hi',
 'graduat',
 'student',
 'design',
 'comput',
 'prior',
 'softwar',
 'product',
 'also',
 'project',
 'wa',
 'manag',
 'research',
 'learn',
 'startup',
 'bachelor',
 'intern',
 'degre',
 'technolog',
 'join',
 'busi',
 'current',
 'team',
 'compani',
 'dure',
 'industri',
 'studi',
 'program',
 'machin',
 'build',
 'use',
 'cs',
 'system',
 'focus',
 'includ',
 'help',
 'passion',
 'interest',
 'undergradu',
 'master',
 'creat',
 'undergrad',
 'health',
 'major',
 'time',
 'thi',
 'befor',
 'commun',
 'market',
 'school',
 'mba',
 'econom',
 'web',
 'consult',
 'internship',
 'oper',
 'colleg',
 'pursu',
 'analyst',
 'model',
 'new',
 'digit',
 'applic',
 'mathemat',
 'minor',
 'well',
 'law',
 'skill',
 'variou',
 'two',
 'analysi',
 'bs',
 'strategi',
 'background',
 'appli',
 'summer',
 'inform',
 'process',
 'healthcar',
 'financi',
 'analyt',
 'technic',
 'ori',
 '

In [909]:
# process core_obj

In [910]:
# process skill

In [911]:
# process seeking_skill

In [912]:
# process seeking_skill

In [913]:
# process seeking_skill

In [914]:
# process industry

In [915]:
# process hobby

In [916]:
# process tech_talent

df["tech_talent"] = df["tech_talent"].apply(lambda x: 1 if x=="Yes" else 0)

In [917]:
# process hardware

# 0: not interested
# 1: i'd seriously consider it
# 2: very interested

df["hardware"] = df["hardware"].apply(lambda x: 0 if "not" in x.lower() else 1 if "consider" in x.lower() else 2)

# convert to onehot
df["hardware_0"] = df["hardware"].apply(lambda x: int(x==0))
df["hardware_1"] = df["hardware"].apply(lambda x: int(x==1))
df["hardware_2"] = df["hardware"].apply(lambda x: int(x==2))
df = df.drop(["hardware"], axis=1)