In [1]:
import re
import csv
import time
import math
import json
import string
import pickle
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import readability
import statistics
from sklearn import metrics
from itertools import combinations
from collections import defaultdict
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Raise pandas' display limits so that long / wide frames and long text cells
# are rendered without truncation in notebook output.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 500)

### Count hype words

In [6]:
# Hype vocabulary: category name -> list of lower-case adjectives.
# Words are listed alphabetically within each category, and no word appears
# in more than one category.
hype_dict = {
    'Importance': [
        'compelling', 'critical', 'crucial', 'essential', 'foundational',
        'fundamental', 'imperative', 'important', 'indispensable', 'invaluable',
        'key', 'major', 'paramount', 'pivotal', 'significant',
        'strategic', 'timely', 'ultimate', 'urgent', 'vital',
    ],
    'Novelty': [
        'creative', 'emerging', 'first', 'groundbreaking', 'innovative',
        'latest', 'novel', 'revolutionary', 'unique', 'unparalleled',
        'unprecedented',
    ],
    'Rigor': [
        'accurate', 'advanced', 'careful', 'cohesive', 'detailed',
        'nuanced', 'powerful', 'quality', 'reproducible', 'rigorous',
        'robust', 'scientific', 'sophisticated', 'strong', 'systematic',
    ],
    'Scale': [
        'ample', 'biggest', 'broad', 'comprehensive', 'considerable',
        'deeper', 'diverse', 'enormous', 'expansive', 'extensive',
        'fastest', 'greatest', 'huge', 'immediate', 'immense',
        'interdisciplinary', 'international', 'interprofessional', 'largest', 'massive',
        'multidisciplinary', 'myriad', 'overwhelming', 'substantial', 'top',
        'transdisciplinary', 'tremendous', 'vast',
    ],
    'Utility': [
        'accessible', 'actionable', 'deployable', 'durable', 'easy',
        'effective', 'efficacious', 'efficient', 'generalizable', 'ideal',
        'impactful', 'intuitive', 'meaningful', 'productive', 'ready',
        'relevant', 'rich', 'safer', 'scalable', 'seamless',
        'sustainable', 'synergistic', 'tailored', 'tangible', 'transformative',
        'user-friendly',
    ],
    'Quality': [
        'ambitious', 'collegial', 'dedicated', 'exceptional', 'experienced',
        'intellectual', 'longstanding', 'motivated', 'premier', 'prestigious',
        'promising', 'qualified', 'renowned', 'senior', 'skilled',
        'stellar', 'successful', 'talented', 'vibrant',
    ],
    'Attitude': [
        'attractive', 'confident', 'exciting', 'incredible', 'interesting',
        'intriguing', 'notable', 'outstanding', 'remarkable', 'surprising',
    ],
    'Problem': [
        'alarming', 'daunting', 'desperate', 'devastating', 'dire',
        'dismal', 'elusive', 'stark', 'unanswered', 'unmet',
    ],
}

In [7]:
# Flatten every category list of hype_dict into one set of distinct words.
# A set has no guaranteed iteration order across interpreter sessions, so
# anything derived from iterating it (e.g. a column order) can differ
# between runs.
hype_words = {word for category_words in hype_dict.values() for word in category_words}

In [8]:
# Display the number of distinct hype words collected from all categories.
len(hype_words)

139

In [29]:
# Display all hype words joined by ' + ' in a single string (order follows the
# set's iteration order).
# NOTE(review): this looks like the right-hand side of a model formula — confirm.
# If so, 'user-friendly' contains a '-' and is presumably not usable as a bare
# term; verify how it is handled where the string is consumed.
' + '.join(hype_words)

'nuanced + interdisciplinary + latest + motivated + durable + groundbreaking + premier + fastest + senior + efficacious + crucial + key + intuitive + substantial + strong + dire + emerging + tangible + first + top + exceptional + tremendous + notable + stark + comprehensive + expansive + largest + user-friendly + unprecedented + systematic + innovative + experienced + synergistic + talented + confident + pivotal + revolutionary + remarkable + fundamental + actionable + dedicated + enormous + skilled + vast + relevant + devastating + vibrant + seamless + biggest + significant + unanswered + effective + dismal + qualified + powerful + scalable + robust + greatest + ready + impactful + generalizable + huge + promising + critical + safer + daunting + accurate + meaningful + diverse + invaluable + advanced + productive + alarming + international + multidisciplinary + imperative + interesting + desperate + interprofessional + intriguing + ample + quality + successful + attractive + exciting 

In [16]:
# Translation table that deletes every ASCII punctuation character
# (string.punctuation includes '-', so hyphens are removed as well).
table = str.maketrans('', '', string.punctuation)


def parse_text(text):
    """Tokenise ``text`` into lower-case words with punctuation removed.

    The text is lower-cased, split on whitespace, and every ASCII punctuation
    character is deleted from each token. Tokens that consisted only of
    punctuation (e.g. a free-standing ``-`` or ``&``) become empty strings and
    are dropped, so they are not counted as words later on.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        Non-empty, lower-case, punctuation-free tokens in original order.
    """
    stripped = (token.translate(table) for token in text.lower().split())
    # Skip tokens that were pure punctuation: they would otherwise inflate
    # the word count computed from this list.
    return [token for token in stripped if token]


def count_words(words):
    """Return the number of tokens in ``words``."""
    return len(words)


def count_hype_words(words, vocab=None):
    """Count how often each vocabulary word occurs in ``words``.

    ``parse_text`` deletes punctuation from every token, so a hyphenated
    vocabulary entry such as ``'user-friendly'`` shows up in ``words`` as
    ``'userfriendly'``. Each vocabulary entry is therefore normalised the same
    way (lower-cased, punctuation deleted) before matching; comparing against
    the raw entry would never match.

    Parameters
    ----------
    words : iterable of str
        Tokens as produced by ``parse_text``.
    vocab : iterable of str, optional
        Words to count. Defaults to the module-level ``hype_words``.

    Returns
    -------
    list of int
        One count per vocabulary entry, in the iteration order of ``vocab``.
    """
    if vocab is None:
        vocab = hype_words
    # Iterate the vocabulary exactly once so the result order is well defined
    # even when ``vocab`` is a set or a one-shot iterator.
    keys = [entry.lower().translate(table) for entry in vocab]
    wanted = set(keys)
    tally = defaultdict(int)
    for token in words:
        if token in wanted:
            tally[token] += 1
    return [tally[key] for key in keys]

In [34]:
# Tokenise each cleaned project description into a list of words.
data_comb['words_parse'] = data_comb['Project_description_clean'].map(parse_text)

In [35]:
# Number of tokens in each parsed description.
data_comb['num_words'] = data_comb['words_parse'].map(count_words)

In [36]:
# Per-row list of hype-word counts (one entry per word in hype_words).
data_comb['words_cn_li'] = data_comb['words_parse'].map(count_hype_words)

In [37]:
# Spread the per-row count lists into one column per hype word. Columns are
# paired positionally with the iteration order of hype_words, which is the
# same order count_hype_words used to build each list.
data_comb[list(hype_words)] = pd.DataFrame(
    data_comb['words_cn_li'].tolist(),
    index=data_comb.index,
)
# The helper column is no longer needed once it has been expanded.
data_comb = data_comb.drop(columns='words_cn_li')

In [39]:
# Row-wise total over all individual hype-word count columns.
data_comb['total_num_hype_words'] = data_comb.loc[:, list(hype_words)].sum(axis='columns')

In [40]:
# Share of hype words among all words of the description.
data_comb['frac_hype_words'] = data_comb['total_num_hype_words'].div(data_comb['num_words'])

In [41]:
# count category
# Per-category totals: for each category, sum the count columns of its words
# row by row and store the result in a column named after the category.
for cate, words in hype_dict.items():
    data_comb[cate] = data_comb[words].sum(axis=1)

#### Save reg data

In [63]:
# Write the full table to <data_root>reg_data.csv with a header row, without
# the index, and with every field quoted.
data_comb.to_csv(
    data_root + 'reg_data.csv',
    index=False,
    header=True,
    quoting=csv.QUOTE_ALL,
)

In [28]:
# Optional: uncomment to reload the table previously written to reg_data.csv.
# data_comb = pd.read_csv(data_root+'reg_data.csv', header=0)