In [1]:
import sys
import nltk
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import brown, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
sys.path.append("../../oats")
from oats.utils.utils import flatten

In [2]:
# Load the survey export and collect every non-null, lower-cased name suggestion.
# skiprows=[1,2,3] drops file lines 1-3 (0-indexed), i.e. the three lines right after the header row.
name_columns = ["Q20_5_TEXT", "Q20_6_TEXT"]
df = pd.read_csv("~/Downloads/SurveyResults - CombResults.csv", skiprows=[1,2,3])[name_columns]
# One array of answers per column, with missing responses removed, flattened into a single list.
name_ideas = flatten([df[column].dropna().values for column in name_columns])
name_ideas = [idea.lower() for idea in name_ideas]

In [3]:
# A preview of the name ideas at list positions 1 through 9 (at most nine entries).
# NOTE(review): the slice starts at 1, so the entry at index 0 is not shown — confirm that is intended.
name_ideas[1:10]

['datanect',
 'data on demand',
 'data connection',
 'serviceprovider',
 'data crunchers',
 'science link',
 'do it',
 'datapro',
 'data wrangle connector']

In [4]:
# The total number of name ideas given, counted across both free-text survey columns
# (missing responses were already excluded when `name_ideas` was built).
len(name_ideas)

257

In [5]:
# Tokenize every name idea, then keep each word that showed up at least 2 times together with its count.
tokens_per_name = [word_tokenize(name) for name in name_ideas]
words = flatten(tokens_per_name)
word_frequencies = Counter(words)
# most_common() yields (word, count) pairs ordered from most to least frequent.
words_and_counts = [entry for entry in word_frequencies.most_common() if entry[1] >= 2]
word_table = pd.DataFrame(words_and_counts, columns=["word","count"])
word_table.to_csv("~/Desktop/words.csv")
words_and_counts

[('data', 64),
 ('science', 15),
 ('scientist', 14),
 ('connect', 12),
 ('datapro', 11),
 ('expert', 11),
 ('pro', 9),
 ('link', 8),
 ('experts', 8),
 ('citizen', 7),
 ('research', 6),
 ('analyst', 5),
 ('finder', 5),
 ('connection', 4),
 ('connector', 4),
 ('connections', 4),
 ('match', 4),
 ('work', 4),
 ('na', 4),
 ('a', 4),
 ('on', 3),
 ('it', 3),
 ('for', 3),
 ('hire', 3),
 ('solutions', 3),
 ('gig', 3),
 ('.', 3),
 ('helper', 3),
 ('analytics', 3),
 ('analysis', 3),
 ('r', 2),
 ('demand', 2),
 ('do', 2),
 ('help', 2),
 ('to', 2),
 ('citizens', 2),
 ('connectors', 2),
 ('dataconnect', 2),
 ('us', 2),
 ('dataguru', 2),
 ('contractor', 2),
 ('school', 2),
 ('the', 2),
 (',', 2),
 ('cit', 2),
 ('sci', 2),
 ('datalink', 2),
 ('gigs', 2),
 ('helping', 2),
 ('by', 2),
 ('none', 2),
 ('easy', 2),
 ('scientists', 2),
 ('prodata', 2),
 ('datagig', 2),
 ('get', 2),
 ('contract', 2),
 ('guru', 2)]

## Looking at substrings between 3 and 10 characters long

In [6]:
# Count every character n-gram of length 3 to 10 that occurs in at least 2 tokens.
# `min_df=2` is a document-frequency cutoff, and each element of `words` is treated as one document.
# `stop_words` is not passed: scikit-learn only applies it when analyzer="word", so it had no effect here.
# `ngram_range` is given as a tuple, which is the type the scikit-learn documentation specifies.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(3, 10), min_df=2)
substring_count_matrix = vectorizer.fit_transform(words)

# `get_feature_names` was removed in scikit-learn 1.2; only fall back to it on releases
# that do not provide `get_feature_names_out` yet.
if hasattr(vectorizer, "get_feature_names_out"):
    substrings = vectorizer.get_feature_names_out().tolist()
else:
    substrings = vectorizer.get_feature_names()

# Total occurrences of each substring, summed over all tokens without densifying the sparse matrix.
substring_counts = np.asarray(substring_count_matrix.sum(axis=0)).ravel()

# List of (substring, count) tuples, most frequent first; the sort is stable, so ties keep
# the vectorizer's feature order.
substrings_and_counts = sorted(zip(substrings, substring_counts), key=lambda pair: pair[1], reverse=True)

# Mapping between a substring and its count.
substring_to_count = dict(substrings_and_counts)


# Decides whether a substring should be kept, i.e. it is not part of another longer substring
# that showed up just as many (or more) times: "ata" is dropped if "data" is equally frequent.
def check_validity_of_substring(substring, big_list, to_count):
    """Return True when exactly one entry of big_list contains substring at an equal or higher count.

    Args:
        substring: The candidate substring being checked.
        big_list: All candidate substrings; when it includes `substring`, that entry
            accounts for the single expected match.
        to_count: Mapping from each candidate substring to its occurrence count.

    Returns:
        True if the number of entries in `big_list` that contain `substring` and have a
        count greater than or equal to its own is exactly 1, otherwise False.
    """
    containing_and_as_frequent = 0
    for candidate in big_list:
        if substring not in candidate:
            continue
        if to_count[candidate] >= to_count[substring]:
            containing_and_as_frequent += 1
    return containing_and_as_frequent == 1


# Apply the validity check to every candidate substring (already ordered by descending count).
substrings = [substring for substring, _count in substrings_and_counts]
valid_substrings = list(filter(
    lambda candidate: check_validity_of_substring(candidate, substrings, substring_to_count),
    substrings,
))
reduction_message = "reduced list of substrings from {} down to {}".format(len(substrings),len(valid_substrings))
print(reduction_message)


# Keep only the (substring, count) pairs whose substring passed the check, then export them.
kept_substrings = set(valid_substrings)
substrings_and_counts = [pair for pair in substrings_and_counts if pair[0] in kept_substrings]
substring_table = pd.DataFrame(substrings_and_counts, columns=["substrings","count"])
substring_table.to_csv("~/Desktop/substrings_3_to_10.csv")
substrings_and_counts

reduced list of substrings from 656 down to 173


[('data', 112),
 ('sci', 47),
 ('con', 44),
 ('ect', 41),
 ('pro', 40),
 ('scien', 38),
 ('conne', 37),
 ('nect', 37),
 ('connect', 36),
 ('per', 32),
 ('ert', 24),
 ('ent', 23),
 ('expert', 23),
 ('ist', 23),
 ('enti', 21),
 ('scient', 21),
 ('scienti', 20),
 ('nce', 19),
 ('scientist', 19),
 ('science', 17),
 ('ink', 16),
 ('cit', 15),
 ('gig', 15),
 ('ion', 15),
 ('link', 15),
 ('nal', 15),
 ('analy', 14),
 ('tion', 14),
 ('ear', 12),
 ('hel', 12),
 ('citizen', 11),
 ('datapro', 11),
 ('esearch', 11),
 ('help', 11),
 ('tor', 11),
 ('analys', 10),
 ('ctor', 10),
 ('ecti', 10),
 ('research', 10),
 ('connection', 9),
 ('ons', 9),
 ('wor', 9),
 ('atch', 8),
 ('experts', 8),
 ('ind', 8),
 ('work', 8),
 ('analyst', 7),
 ('and', 7),
 ('der', 7),
 ('ector', 7),
 ('find', 7),
 ('ing', 7),
 ('match', 7),
 ('tions', 7),
 ('ali', 6),
 ('connector', 6),
 ('ers', 6),
 ('her', 6),
 ('ntr', 6),
 ('tas', 6),
 ('contract', 5),
 ('crowd', 5),
 ('datag', 5),
 ('finder', 5),
 ('gigs', 5),
 ('helper', 5)

## Repeating for substrings between 5 and 10 characters (this drops short ones such as 'gig' and 'pro', but makes the longer substrings easier to see)

In [7]:
# Count every character n-gram of length 5 to 10 that occurs in at least 2 tokens.
# `min_df=2` is a document-frequency cutoff, and each element of `words` is treated as one document.
# `stop_words` is not passed: scikit-learn only applies it when analyzer="word", so it had no effect here.
# `ngram_range` is given as a tuple, which is the type the scikit-learn documentation specifies.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(5, 10), min_df=2)
substring_count_matrix = vectorizer.fit_transform(words)

# `get_feature_names` was removed in scikit-learn 1.2; only fall back to it on releases
# that do not provide `get_feature_names_out` yet.
if hasattr(vectorizer, "get_feature_names_out"):
    substrings = vectorizer.get_feature_names_out().tolist()
else:
    substrings = vectorizer.get_feature_names()

# Total occurrences of each substring, summed over all tokens without densifying the sparse matrix.
substring_counts = np.asarray(substring_count_matrix.sum(axis=0)).ravel()

# List of (substring, count) tuples, most frequent first; the sort is stable, so ties keep
# the vectorizer's feature order.
substrings_and_counts = sorted(zip(substrings, substring_counts), key=lambda pair: pair[1], reverse=True)

# Mapping between a substring and its count.
substring_to_count = dict(substrings_and_counts)


# Decides whether a substring should be kept, i.e. it is not part of another longer substring
# that showed up just as many (or more) times: "ata" is dropped if "data" is equally frequent.
# NOTE(review): this repeats the definition from the 3-to-10 cell above; consider defining it once.
def check_validity_of_substring(substring, big_list, to_count):
    """Return True when exactly one entry of big_list contains substring at an equal or higher count.

    Args:
        substring: The candidate substring being checked.
        big_list: All candidate substrings; when it includes `substring`, that entry
            accounts for the single expected match.
        to_count: Mapping from each candidate substring to its occurrence count.

    Returns:
        True if the number of entries in `big_list` that contain `substring` and have a
        count greater than or equal to its own is exactly 1, otherwise False.
    """
    containing_and_as_frequent = 0
    for candidate in big_list:
        if substring not in candidate:
            continue
        if to_count[candidate] >= to_count[substring]:
            containing_and_as_frequent += 1
    return containing_and_as_frequent == 1


# Apply the validity check to every candidate substring (already ordered by descending count).
substrings = [substring for substring, _count in substrings_and_counts]
valid_substrings = list(filter(
    lambda candidate: check_validity_of_substring(candidate, substrings, substring_to_count),
    substrings,
))
reduction_message = "reduced list of substrings from {} down to {}".format(len(substrings),len(valid_substrings))
print(reduction_message)


# Keep only the (substring, count) pairs whose substring passed the check, then export them.
kept_substrings = set(valid_substrings)
substrings_and_counts = [pair for pair in substrings_and_counts if pair[0] in kept_substrings]
substring_table = pd.DataFrame(substrings_and_counts, columns=["substrings","count"])
substring_table.to_csv("~/Desktop/substrings_5_to_10.csv")
substrings_and_counts

reduced list of substrings from 300 down to 67


[('scien', 38),
 ('conne', 37),
 ('connect', 36),
 ('expert', 23),
 ('scient', 21),
 ('scienti', 20),
 ('scientist', 19),
 ('science', 17),
 ('analy', 14),
 ('citizen', 11),
 ('datapro', 11),
 ('esearch', 11),
 ('analys', 10),
 ('research', 10),
 ('connection', 9),
 ('experts', 8),
 ('analyst', 7),
 ('ector', 7),
 ('match', 7),
 ('tions', 7),
 ('connector', 6),
 ('contract', 5),
 ('crowd', 5),
 ('datag', 5),
 ('finder', 5),
 ('helper', 5),
 ('ctors', 4),
 ('datalink', 4),
 ('onnections', 4),
 ('solution', 4),
 ('analysis', 3),
 ('analytics', 3),
 ('contractor', 3),
 ('datac', 3),
 ('datagig', 3),
 ('datama', 3),
 ('datas', 3),
 ('demand', 3),
 ('ectors', 3),
 ('every', 3),
 ('solutions', 3),
 ('assist', 2),
 ('ataconnect', 2),
 ('awork', 2),
 ('citizens', 2),
 ('connectors', 2),
 ('consu', 2),
 ('crunch', 2),
 ('data4', 2),
 ('dataconnec', 2),
 ('dataf', 2),
 ('dataguru', 2),
 ('datamatch', 2),
 ('datan', 2),
 ('ectpro', 2),
 ('helping', 2),
 ('market', 2),
 ('master', 2),
 ('matcher',