In [104]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords

import string
from collections import defaultdict
import os

# Load the raw table; the relative path is resolved against the current working directory.
df = pd.read_csv("Superpowers.csv")
# df.head()

In [105]:
# remove unwanted fields (they are not turned into features anywhere below)
unwanted_columns = [
	'Preferred First Name (Required)',
	'Last Name / Family Name (Required)',
	'Cornell Tech Email (Required)',
	'Slack (Full Name) (Required)',
	'Where are you in the Spring Studio teaming process?',
	'What have you teamed around? ',
]
df = df.drop(columns=unwanted_columns)

In [106]:
# map the long form-question headers to short snake_case column names
short_names = {
	"Cornell Tech Degree Program (Required)": "degree",
	"Professional Bio": "bio",
	"Core Teaming Objectives": "core_obj",
	"Skillsets": "skill",
	"Seeking Skillsets": "seeking_skill",
	"Most compelling problems, industries, etc": "industry",
	"Hobbies, Interests, etc": "hobby",
	"Technical Talent (Yes/No)": "tech_talent",
	"Hardware Solution": "hardware",
	"Spinning Out": "spin_out",
	"Spring Studio Preference": "spring_studio",
}
df = df.rename(columns=short_names)

In [107]:
# preview the first rows after the drop / rename steps above
df.head()

Unnamed: 0,degree,bio,core_obj,skill,seeking_skill,industry,hobby,tech_talent,hardware,spin_out,spring_studio
0,CM- Connective Media,"👋Prior to attending Cornell Tech, Anton worked...",My number one teaming objective is to find a t...,Hard skills:\n- prototyping & building web app...,1. passion for doing social good \n2. design ...,- Applying breakthroughs in tech (machine lear...,When I'm not coding I enjoy travelling and goi...,Yes,I’d seriously consider it,I’d seriously consider it,PiTech Studio
1,CM- Connective Media,Previously worked in early stage venture then ...,My number one team objective to launch somethi...,UX & coding,Graphic Design,N/a,I write a lot about how companies got their fi...,Yes,Not interested,Very Interested!,Startup Studio
2,ORIE- Operations Research and Information Engi...,Khaled Abughoush has passion to merge his back...,My number one teaming objective is to find a t...,My hard skills include coding in R & Python. I...,I’m looking for teammates who have strong codi...,I’m very interested in working on supply chain...,Some of my hobbies include watching and playin...,Yes,I’d seriously consider it,I’d seriously consider it,PiTech Studio
3,DESIGN- Parsons,Krishangi Agarwal currently is studying MPS Co...,My number one teaming is to spin out after gra...,"Hard skills: Graphic Designing, UI/UX Design \...","Hard skills: Business and data analysis, Compu...","Industries: Education, Sustainability, healthc...","dancing, painting, boardgames, exploring resta...",No,I’d seriously consider it,Very Interested!,Startup Studio
4,MBA- Master of Business Administration,I have ~7 years of management consulting exper...,"My number 1 team objective, is to join a start...","Business skills, - operating models, program m...","People who are trying to work, but will not bo...","Healthcare, Fintech, blockchain",Before i decided to go to MBA where all my tim...,No,I’d seriously consider it,I’d seriously consider it,BigCo Studio


In [108]:
# process Degree
# for i in df.degree.unique():
# 	print(i)

# keep only the short program code: first whitespace-separated token minus its
# trailing character, e.g. "CM- Connective Media" -> "CM"
df["degree"] = df["degree"].apply(lambda x: x.split()[0][:-1])

# integer id per program code, numbered in order of first appearance in the column
degree_dict = {str(deg):i for i, deg in enumerate(df.degree.unique())}

# ids recorded by the original author for the dataset this notebook was run on:
# 'CM': 0
# 'ORIE': 1
# 'DESIGN': 2
# 'MBA': 3
# 'AAP': 4
# 'CS': 5
# 'LLM': 6
# 'HT': 7
# 'UT': 8
# 'ECE': 9

# convert to onehot: one degree_<id> column per program actually present in the data.
# Looping over degree_dict (instead of hard-coding ids 0..9) keeps every row covered
# when the number of distinct programs is not exactly ten.
for idx in degree_dict.values():
	df["degree_{}".format(idx)] = df["degree"].apply(lambda x: int(degree_dict[x] == idx))

df = df.drop(["degree"], axis=1)

In [109]:
# process text features
_STOPWORD_CACHE = {}


def _english_stopwords():
	"""Return NLTK's English stopwords with punctuation stripped, loaded once and cached."""
	if "english" not in _STOPWORD_CACHE:
		strip_punct = str.maketrans('', '', string.punctuation)
		# the cleaned text never contains punctuation, so compare against "dont", not "don't"
		_STOPWORD_CACHE["english"] = frozenset(
			w.translate(strip_punct) for w in stopwords.words('english')
		)
	return _STOPWORD_CACHE["english"]


def sentence_cleaner(sentence):
	"""Turn one free-text answer into a list of stemmed, stopword-free tokens.

	Steps:
	  1. lowercase the text;
	  2. keep only the letters a-z, turn every whitespace character (including
	     newlines and tabs) into a single-space separator, and delete everything
	     else (punctuation, digits, emoji, ...);
	  3. drop English stopwords, stem the remaining words with PorterStemmer, and
	     drop any stem that is itself a stopword.

	Parameters
	----------
	sentence : str or any
		The raw text. Any non-string value (e.g. the float NaN pandas uses for a
		missing cell) is treated as empty text.

	Returns
	-------
	list of str
		The cleaned tokens in their original order; duplicates are kept.
	"""
	# explicit type check instead of a bare `except:` around .lower()
	if not isinstance(sentence, str):
		return []

	letters = set(string.ascii_lowercase)
	kept = []
	for c in sentence.lower():
		if c in letters:
			kept.append(c)
		elif c.isspace():
			# newlines/tabs must separate words; deleting them would glue
			# the last word of one line onto the first word of the next
			kept.append(" ")
	tokens = "".join(kept).split()

	stop = _english_stopwords()
	ps = PorterStemmer()

	# filter BEFORE stemming: the stemmer rewrites stopwords such as "this"/"was"
	# into "thi"/"wa", which would no longer match the stopword list
	stems = [ps.stem(w) for w in tokens if w not in stop]

	# filter again AFTER stemming so words whose stem is a stopword
	# (e.g. "others" -> "other") are still removed
	return [w for w in stems if w not in stop]


def get_top_words(n, col):
	"""Return the n most frequent words found across every token list in col.

	n:   number of most frequent words to keep
	col: iterable of token iterables (one per row); each occurrence of a word
	     counts, including repeats inside the same row

	Words are ordered by descending count; words with equal counts stay in the
	order in which they were first seen.
	"""
	frequencies = {}
	for tokens in col:
		for token in tokens:
			frequencies[token] = frequencies.get(token, 0) + 1

	# stable sort on the negated count: highest count first, ties in first-seen order
	ranked = sorted(frequencies, key=lambda token: -frequencies[token])
	return ranked[:n]

def convert_text_to_nparray(n, col):
	"""Encode each row of col as a 0/1 vector over the column's n most frequent words.

	n:   number of most frequent words (vector length, at most the vocabulary size)
	col: the cleaned target column, one token list per row

	Returns a numpy array with one row per entry of col; position j is 1 when the
	j-th word returned by get_top_words(n, col) occurs in that row, otherwise 0.
	"""
	vocabulary = get_top_words(n, col)
	rows = [
		[int(word in tokens) for word in vocabulary]
		for tokens in col
	]
	return np.array(rows)



In [110]:
# process bio: tokenize each answer, then encode against the 100 most frequent words
bio_tokens = df["bio"].apply(sentence_cleaner)
df["bio_clean"] = bio_tokens
bio = convert_text_to_nparray(100, bio_tokens)

In [111]:
# process core_obj: tokenize each answer, then encode against the 100 most frequent words
core_obj_tokens = df["core_obj"].apply(sentence_cleaner)
df["core_obj_clean"] = core_obj_tokens
core_obj = convert_text_to_nparray(100, core_obj_tokens)

In [112]:
# process skill: tokenize each answer, then encode against the 100 most frequent words
skill_tokens = df["skill"].apply(sentence_cleaner)
df["skill_clean"] = skill_tokens
skill = convert_text_to_nparray(100, skill_tokens)

In [113]:
# process seeking_skill: tokenize each answer, then encode against the 100 most frequent words
seeking_skill_tokens = df["seeking_skill"].apply(sentence_cleaner)
df["seeking_skill_clean"] = seeking_skill_tokens
seeking_skill = convert_text_to_nparray(100, seeking_skill_tokens)

In [114]:
# process industry: tokenize each answer, then encode against the 100 most frequent words
industry_tokens = df["industry"].apply(sentence_cleaner)
df["industry_clean"] = industry_tokens
industry = convert_text_to_nparray(100, industry_tokens)

In [115]:
# process hobby: tokenize each answer, then encode against the 100 most frequent words
hobby_tokens = df["hobby"].apply(sentence_cleaner)
df["hobby_clean"] = hobby_tokens
hobby = convert_text_to_nparray(100, hobby_tokens)

In [116]:
# process tech_talent: 1 for an exact "Yes" answer, 0 for anything else

df["tech_talent"] = df["tech_talent"].apply(lambda answer: int(answer == "Yes"))

In [117]:
# process hardware

# interest level derived from keywords in the (lowercased) answer:
# 0: contains "not"       (not interested)
# 1: contains "consider"  (i'd seriously consider it)
# 2: anything else        (very interested)

hardware_levels = df["hardware"].apply(
	lambda answer: 0 if "not" in answer.lower() else (1 if "consider" in answer.lower() else 2)
)

# convert to onehot: hardware_0, hardware_1, hardware_2
for level in range(3):
	df["hardware_{}".format(level)] = hardware_levels.apply(lambda value: int(value == level))
df = df.drop(columns=["hardware"])

In [118]:
# process spin_out

# interest level derived from keywords in the (lowercased) answer:
# 0: contains "not"       (not interested)
# 1: contains "consider"  (i'd seriously consider it)
# 2: anything else        (very interested)

spin_out_levels = df["spin_out"].apply(
	lambda answer: 0 if "not" in answer.lower() else (1 if "consider" in answer.lower() else 2)
)

# convert to onehot: spin_out_0, spin_out_1, spin_out_2
for level in range(3):
	df["spin_out_{}".format(level)] = spin_out_levels.apply(lambda value: int(value == level))
df = df.drop(columns=["spin_out"])

In [119]:
# process spring_studio

# studio id = position of the first keyword found in the (lowercased) answer:
# 0: startup
# 1: bigco
# 2: pitech
# 3: none of the keywords (no preference)

spring_studio_ids = df["spring_studio"].apply(
	lambda answer: next(
		(
			studio_id
			for studio_id, keyword in enumerate(("startup", "bigco", "pitech"))
			if keyword in answer.lower()
		),
		3,
	)
)

# convert to onehot: spring_studio_0 .. spring_studio_3
for studio in range(4):
	df["spring_studio_{}".format(studio)] = spring_studio_ids.apply(lambda value: int(value == studio))
df = df.drop(columns=["spring_studio"])

In [120]:
# clean unused columns: the raw text and its token lists now live in the bag-of-words arrays
text_columns = [
	"bio", "bio_clean",
	"core_obj", "core_obj_clean",
	"skill", "skill_clean",
	"seeking_skill", "seeking_skill_clean",
	"industry", "industry_clean",
	"hobby", "hobby_clean",
]
df = df.drop(columns=text_columns)

In [123]:
# organize two types of data

# categorical data -> one hot (every column left in df, cast to int)
categorical_data = df.to_numpy().astype(int)

# text data -> bag of words, n=100 per text column, blocks placed side by side
text_blocks = (bio, core_obj, skill, seeking_skill, industry, hobby)
text_bow_data = np.concatenate(text_blocks, axis=1).astype(int)

(categorical_data.shape, text_bow_data.shape)

((350, 21), (350, 600))

In [124]:
# save files

CLEANED_DATA_DIR = "./cleaned_data/"

# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also creates missing
# parent directories and does not fail if the directory already exists
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

# write each feature matrix as a comma-separated file inside CLEANED_DATA_DIR
for file_name, matrix in (
	("categorical_data.csv", categorical_data),
	("bow_data.csv", text_bow_data),
):
	with open(os.path.join(CLEANED_DATA_DIR, file_name), "wb") as f:
		np.savetxt(f, matrix, fmt='%s', delimiter=",")