# Using phrasal segmentation results to analyze phrases over time

In [9]:
import pandas as pd
import re
from glob import glob
import numpy as np
import json
import altair as alt
import csv

## Initial loading + processing of dataframes and dictionaries

In [10]:
# AutoPhrase results dataframe
# Columns (see the preview below): Phrase Quality, Phrase, Year, Num Words.
fp_phrases = '../results/dblp-v10-grouped/phrases.csv'
# index_col=0: the first CSV column is used as the row index.
phrases = pd.read_csv(fp_phrases, index_col=0)
phrases.head()

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
0,0.9915,operations research,1950-1959,2
1,0.6505,operations research society of america,1950-1959,5
2,0.5735,high speed,1950-1959,2
3,0.5255,operations research society,1950-1959,3
4,0.981,tunnel diode,1960-1964,2


In [11]:
phrases['Year'].unique()

array(['1950-1959', '1960-1964', '1965-1969', '1970-1974', '1975-1979',
       '1980-1984', '1985-1989', '1990-1994', '1995-1999', '2000-2004',
       '2005-2009', '2010-2014', '2015-2017'], dtype=object)

In [8]:
# Processed phrasal segmentation results dataframe
infolder = '../results/dblp-v10-grouped'
# Only the CSVs whose path contains 'segmented' are segmentation results.
subfolders = [fp for fp in glob(infolder + '/*.csv') if 'segmented' in fp]

# Collect the per-file frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
segmented_frames = []
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0).dropna()
    # Number of comma-separated phrases found in each paper (row)
    df['Num Phrases'] = df['Phrases'].map(lambda text: len(text.split(',')))
    segmented_frames.append(df)

if segmented_frames:
    seg = pd.concat(segmented_frames, ignore_index=True)
else:
    # No matching files: keep the expected (empty) schema.
    seg = pd.DataFrame(columns=['Phrases', 'Year', 'Num Phrases'])
seg = seg.dropna()
seg.head()

Unnamed: 0,Phrases,Year,Num Phrases
0,"paper,wheatstone,bridge,tangent,triangle,trian...",1950-1959,14
1,"numerical integration,differential equations,o...",1950-1959,6
2,fur,1950-1959,1
3,"computing,computing,amplifier,high,amplifiers,...",1950-1959,8
4,"operations research,journal,operations researc...",1950-1959,5


In [12]:
# One (initially empty) phrase -> count dictionary per year range in seg
counts = {yr: {} for yr in seg['Year'].unique()}
    
def add_counts(x):
    """Tally the phrases of one segmentation row into the global ``counts``.

    ``x['Phrases']`` is a comma-separated string and ``x['Year']`` selects the
    inner dictionary ``counts[x['Year']]``, which must already exist. Every
    occurrence is counted, so a phrase repeated within the row counts each
    time. Returns None.
    """
    tokens = x['Phrases'].split(',')
    year_counts = counts[x['Year']]
    for token in tokens:
        year_counts[token] = year_counts.get(token, 0) + 1

# add_counts is called only for its side effect on `counts`, so use a plain
# loop instead of DataFrame.apply: no throwaway result Series is built, and
# older pandas releases may evaluate the first row more than once in apply.
for _, row in seg.iterrows():
    add_counts(row)

# Sorts the counts in descending order (ties keep their first-seen order)
for key, val in counts.items():
    counts[key] = dict(sorted(val.items(), key=lambda item: item[1], reverse=True))

In [13]:
# Same structure as counts, restricted to phrases made of two or more
# whitespace-separated words
multi_counts = {
    year_range: {
        phrase: n for phrase, n in phrase_counts.items() if len(phrase.split()) >= 2
    }
    for year_range, phrase_counts in counts.items()
}

In [14]:
# Same structure as counts, but each value is the phrase's share of all phrase
# occurrences in its year range (a fraction between 0 and 1) rather than the
# raw frequency
counts_per = {}
for year_range, phrase_counts in counts.items():
    total_count_yr = sum(phrase_counts.values())
    prop_counts = {phrase: n / total_count_yr for phrase, n in phrase_counts.items()}
    counts_per[year_range] = prop_counts

# Testing gephi yearly data processing

In [15]:
# Multi-word phrases that get_edges skips when building the edge data.
BAD_PHRASES_MULTI = {
    'an adaptive',
    'based approach',
    'de los',
    'en la',
    'de la',
    'en el',
    'de las',
    '2005 copyright spie',
    'outcomes of dagstuhl seminar',
    'periodicals inc comput appl eng educ',
    'de donnees',
    'de ces',
    '2008 copyright spie',
    'que la',
    'sur des',
    'et les',
    'sur la',
    'sur les',
    'dans les',
    'proposed algorithm',
    'proposed approach',
    'proposed method',
    'a case study',
    'recent years',
    'an overview',
    'proposed scheme',
    'case study',
    'obtainable from cpc program library queens',
    'university belfast n irelandrnrnlicensing provisions',
    'format targzrnrnprogramming language',
    'summaryrunprogram title',
    'distributed program including test data etc',
}

In [16]:
# Minimum number of overlaps (co-occurrences) an edge must have to be kept in
# EdgeData; 0 keeps every edge.
edge_thresh = 0

In [19]:
infolder = '../results/gephi'

# Gephi segmentation csvs only contain high-quality, multi-word phrases (no duplicates per paper)
subfolders = [fp for fp in glob(infolder + '/*.csv') if 'segmented' in fp]

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on each iteration.
gephi_frames = []
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0).dropna()
    gephi_frames.append(df)

if gephi_frames:
    seg = pd.concat(gephi_frames, ignore_index=True)
else:
    # No matching files: keep the expected (empty) schema.
    seg = pd.DataFrame(columns=['Phrases', 'Year Range'])
seg = seg.dropna()
# Turn each comma-separated string into a list of phrases
seg['Phrases'] = seg['Phrases'].map(lambda x: x.split(','))

# Removes any papers (rows) with only a single phrase - no edges are possible
seg = seg[seg['Phrases'].map(len) > 1]

# Creates and outputs EdgeData.csv
edge_counts = {}
def get_edges(line):
    """
    Helper function to process one segmentation results row into edge data.

    Modifies the module-level ``edge_counts`` dictionary in place and returns
    None.

    ``line['Phrases']`` is the list of phrases of one paper and
    ``line['Year Range']`` is a string; every phrase is labelled as
    ``'<phrase> (<year range>)'``. For each pair of distinct labels,
    ``edge_counts[a][b]`` is incremented once. An edge is stored in one
    direction only: if ``edge_counts[b][a]`` already exists, that entry is the
    one that gets incremented. Phrases in ``BAD_PHRASES_MULTI`` never take
    part in an edge.
    """
    year_range = line['Year Range']
    suffix = ' (' + year_range + ')'

    # Dropping bad phrases and building the labels once up front is equivalent
    # to skipping every pair that contains a bad phrase inside the double loop.
    labelled = [phrase + suffix for phrase in line['Phrases']
                if phrase not in BAD_PHRASES_MULTI]

    for source in labelled:
        for target in labelled:
            # Prevents any self-comparisons
            if source == target:
                continue

            # The pair is already tracked as target -> source; it is counted
            # when the loops reach that orientation.
            if target in edge_counts and source in edge_counts[target]:
                continue

            # Creates inner dictionary on first use and adds to count
            targets = edge_counts.setdefault(source, {})
            targets[target] = targets.get(target, 0) + 1
# Applies helper function to every row of the seg dataframe. The function just
# modifies the edge_counts dictionary, so a plain loop is used instead of
# DataFrame.apply (no throwaway result, and older pandas releases may evaluate
# the first row more than once in apply).
for _, line in seg.iterrows():
    get_edges(line)

# Filters out edges that have fewer than edge_thresh overlaps
edge_filtered = {}
edge_phrases = set()  # Keeps track of phrases included in EdgeData

# For each year range, provide a certain threshold
# or maybe a certain percentage of the top edges can be included?
edge_thresh_dict = {}  # placeholder for the per-year-range thresholds above
for phrase, phrase_counts in edge_counts.items():
    for inner_phrase, count in phrase_counts.items():
        # With edge_thresh == 0 nothing is dropped.
        if count < edge_thresh:
            continue

        edge_filtered.setdefault(phrase, {})[inner_phrase] = count
        edge_phrases.update((phrase, inner_phrase))

In [20]:
edge_filtered

{'positive definite (1950-1959)': {'numerical integration (1950-1959)': 1,
  'differential equations (1950-1959)': 1},
 'numerical integration (1950-1959)': {'differential equations (1950-1959)': 2,
  'numerical method (1950-1959)': 1,
  'power series (1950-1959)': 1,
  'ordinary differential equations (1950-1959)': 1,
  'quadratic equation (1950-1959)': 1,
  'differential equation (1950-1959)': 1},
 'machine language (1950-1959)': {'future systems (1950-1959)': 1,
  'error detection (1950-1959)': 1,
  'los angeles (1950-1959)': 1},
 'operations research (1950-1959)': {'linear programming (1950-1959)': 4,
  'statistical methods (1950-1959)': 1,
  'numerical methods (1950-1959)': 1,
  'dynamic programming (1950-1959)': 1,
  'los angeles (1950-1959)': 1},
 'power series (1950-1959)': {'round off error (1950-1959)': 1,
  'ordinary differential equations (1950-1959)': 1,
  'differential equations (1950-1959)': 1,
  'quadratic equation (1950-1959)': 1,
  'differential equation (1950-1959)':

In [21]:
# Number of distinct edges stored per year range
yearly_edge_counts = {}
for source_label, targets in edge_filtered.items():
    # Labels end in '(<year range>)': take the last token and strip the parentheses
    year_range = source_label.rsplit(maxsplit=1)[-1][1:-1]
    yearly_edge_counts[year_range] = yearly_edge_counts.get(year_range, 0) + len(targets)

In [22]:
yearly_edge_counts

{'1950-1959': 115,
 '1960-1964': 220,
 '1965-1969': 784,
 '1970-1974': 1807,
 '1975-1979': 4459,
 '1980-1984': 13397,
 '1985-1989': 68049,
 '1990-1994': 322991,
 '1995-1999': 1064763,
 '2000-2004': 3130660,
 '2005-2009': 7515650,
 '2010-2014': 12606010,
 '2015-2017': 7076865}

## Start of phrases over time EDA + figures

In [8]:
# TODO:
# Ridgeline plot where y-axis is a Phrase, x-axis is the Year Range, height showing the frequency or percentage of each phrase

In [9]:
# Add more phrases to list by looking at the dictionaries
interesting_phrases = [
    'cloud computing',
    'machine learning',
    'neural network',
    'data mining',
    'social networks',
    'image processing',
    'computer vision',
    'user experience',
    'genetic algorithms',
    'computer science',
    'virtual reality',
]

In [10]:
# Looking at count (raw frequency) of each phrase across year ranges
phrases_to_graph = ['machine learning', 'computer vision', 'neural network']

# One row per (phrase, year range), built in a single DataFrame construction
# rather than by growing the frame row by row. A phrase that never occurs in a
# year range gets a count of 0 (as in the percent table) instead of None, so
# the Count column stays numeric.
data = pd.DataFrame(
    [(year_range, year_counts.get(phrase, 0), phrase)
     for phrase in phrases_to_graph
     for year_range, year_counts in counts.items()],
    columns=['Year Range', 'Count', 'Phrase'],
)

In [11]:
# Issue with looking at raw frequency is that it will depend on how many papers are in each year range
# There are more papers over time, so the frequency will naturally increase for almost all phrases
# Line chart of Count against Year Range, one colored line per Phrase
alt.Chart(data).mark_line().encode(x='Year Range', y='Count', color='Phrase')

In [12]:
# Looking at percentage of each phrase across year ranges
phrases_to_graph = ['machine learning', 'computer vision', 'neural network', 'virtual reality']

# One row per (phrase, year range), built in a single DataFrame construction
# rather than by growing the frame row by row. Values come from counts_per
# (fractions of the year range's phrase occurrences); a phrase missing from a
# year range is 0, so no fillna pass is needed and the column stays numeric.
data = pd.DataFrame(
    [(year_range, year_shares.get(phrase, 0), phrase)
     for phrase in phrases_to_graph
     for year_range, year_shares in counts_per.items()],
    columns=['Year Range', 'Percent', 'Phrase'],
)

In [13]:
# Line chart of Percent against Year Range, one colored line per Phrase.
# The y axis is formatted with '%' (the Percent values are fractions).
alt.Chart(data).mark_line().encode(x='Year Range',
                                   y=alt.Y('Percent', axis=alt.Axis(format='%')),
                                   color='Phrase'
                                   ).properties(title='Phrase Percent Prominence over Year Ranges')

In [14]:
# Interactive version of the percent line chart: hovering highlights the
# values at the nearest Year Range.
# NOTE(review): alt.selection(type='single') / add_selection appear to be
# deprecated in newer Altair releases - confirm against the installed version.

# Selection that follows the mouse and picks the nearest 'Year Range' value;
# empty='none' means nothing is selected until the mouse is over the chart.
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['Year Range'], empty='none')

# Base layer: one smoothed (basis-interpolated) line per phrase
line = alt.Chart(data).mark_line(interpolate='basis').encode(
    x='Year Range',
    y=alt.Y('Percent', axis=alt.Axis(format='%')),
    color='Phrase')

# Fully transparent points (opacity 0) that carry the selection
selectors = alt.Chart(data).mark_point().encode(
    x='Year Range',
    opacity=alt.value(0)).add_selection(nearest)

# Points on the lines, opaque only where the selection matches
points = line.mark_point().encode(opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
    
# Text labels showing Percent where the selection matches, a blank otherwise
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Percent', alt.value(' ')))
    
# Gray rule drawn at the selected Year Range
rules = alt.Chart(data).mark_rule(color='gray').encode(
    x='Year Range').transform_filter(nearest)

# Stack all layers into a single 600x300 chart
alt.layer(line, selectors, points, rules, text).properties(width=600,
                                                           height=300,
                                                           title='Phrase Percent Prominence over Year Ranges (interactive)')

In [15]:
# Semi-transparent area chart of Percent per Phrase over Year Range.
# stack=None: the areas are not stacked on top of each other.
# The tooltip lists Phrase, Year Range and Percent.
alt.Chart(data).mark_area(opacity=0.5).encode(x='Year Range', 
                                              y=alt.Y('Percent', stack=None), 
                                              color='Phrase',
                                              tooltip=['Phrase', 'Year Range', 'Percent']
                                              ).properties(title='Phrase Percent Prominence over Year Ranges').interactive()

# Gephi data pre-processing

In [16]:
# Gephi segmentation csvs only contain high-quality, multi-word phrases (no duplicates per paper)
infolder = '../results/gephi'
# Every csv in the folder is read (no 'segmented' filename filter here).
subfolders = glob(infolder + '/*.csv')

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on each iteration.
paper_frames = []
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0).dropna()
    paper_frames.append(df)

if paper_frames:
    data = pd.concat(paper_frames, ignore_index=True)
else:
    # No files found: keep the expected (empty) schema.
    data = pd.DataFrame(columns=['Phrases', 'Year Range'])
data = data.dropna()
# Turn each comma-separated string into a list of phrases
data['Phrases'] = data['Phrases'].map(lambda x: x.split(','))

In [17]:
# Keeps only the papers (rows) whose phrase list has more than one entry
has_multiple_phrases = data['Phrases'].map(len) > 1
data = data[has_multiple_phrases]

In [18]:
# Use to see the distribution of phrase counts per paper
# data['Phrases'].map(len).value_counts()

In [19]:
# Empty table with the ID / Label / Count columns; shown to display the schema.
labels = pd.DataFrame(columns=['ID', 'Label', 'Count'])
labels

Unnamed: 0,ID,Label,Count


In [20]:
# Short tokens treated as bad phrases
bad_phrases = {
    'fur', 'e', 'ma', 'fr', 'as', 'first', 'most',
    'so', 'if', 'ii', 'i', 'k', 'm', 'd',
    'far', 'b', 'co', 't', 's', 'h', 'et',
}
# Multi-word candidates to add to the bad phrases
to_add = [
    'proposed method',
    'proposed approach',
    'recent years',
]

In [21]:
label_counts = {}
def get_label_counts(x):
    """Add one to the global ``label_counts`` entry of every phrase in ``x``.

    ``x`` is an iterable of phrases; unseen phrases start at zero. The
    ``bad_phrases`` filter is currently disabled, so every phrase is counted.
    Returns None.
    """
    for label in x:
        label_counts[label] = label_counts.get(label, 0) + 1

# get_label_counts only mutates label_counts, so iterate over the phrase lists
# directly instead of going through DataFrame.apply (no throwaway result, and
# older pandas releases may evaluate the first row more than once in apply).
for phrase_lst in data['Phrases']:
    get_label_counts(phrase_lst)

# Most frequent phrases first (ties keep their first-seen order)
label_counts = dict(sorted(label_counts.items(), key=lambda item: item[1], reverse=True))

In [22]:
# Node table: one row per phrase, with the phrase as both ID and Label
labels = (
    pd.DataFrame.from_dict(label_counts, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'ID'})
    .assign(Label=lambda frame: frame['ID'])
    [['ID', 'Label', 'Count']]
)
labels

Unnamed: 0,ID,Label,Count
0,proposed method,proposed method,90379
1,proposed approach,proposed approach,44371
2,proposed algorithm,proposed algorithm,40898
3,a case study,a case study,32366
4,recent years,recent years,28936
...,...,...,...
84292,catastrophic events,catastrophic events,1
84293,minority over sampling,minority over sampling,1
84294,semi dense,semi dense,1
84295,k user mimo,k user mimo,1


In [23]:
# Writes only the phrases counted more than 300 times to NodeData.csv
frequent_labels = labels[labels['Count'] > 300]
frequent_labels.to_csv('../results/NodeData.csv')

In [24]:
#labels.to_csv('../results/NodeData.csv')

In [25]:
data

Unnamed: 0,Phrases,Year Range
0,"[positive definite, numerical integration, dif...",1950-1959
2,"[machine language, future systems]",1950-1959
3,"[operations research, linear programming]",1950-1959
4,"[power series, round off error]",1950-1959
7,"[operations research, linear programming]",1950-1959
...,...,...
2471993,"[level set, current state, remains unclear, vo...",2015-2017
2471994,"[extended kalman filter ekf, received signal s...",2015-2017
2471996,"[infinite horizon, dynamic pricing, discount r...",2015-2017
2471997,"[per minute, operating conditions, m 1, power ...",2015-2017


In [26]:
edge_dict = {}

def get_edges(phrase_lst):
    """Count phrase co-occurrences of one paper in the global ``edge_dict``.

    For each pair of different phrases in ``phrase_lst``,
    ``edge_dict[a][b]`` is incremented. An edge is kept in one direction
    only: when ``edge_dict[b][a]`` already exists, the ``a -> b`` visit is
    skipped and the existing ``b -> a`` entry is the one that grows.
    Returns None.
    """
    for source in phrase_lst:
        for target in phrase_lst:
            # No self-edges
            if source == target:
                continue

            # The pair is already tracked as target -> source
            reverse_known = target in edge_dict and source in edge_dict[target]
            if reverse_known:
                continue

            neighbours = edge_dict.setdefault(source, {})
            neighbours[target] = neighbours.get(target, 0) + 1

In [27]:
# test = pd.DataFrame(['a,b,c', 'b,c,a', 'c,b', 'a,b,c'], columns=['Phrases'])
# test['Phrases'] = test.apply(lambda x: x['Phrases'].split(','), axis=1)
# test

In [28]:
# _ = test.apply(lambda x: get_edges(x['Phrases']), axis=1)
# edge_dict

In [29]:
# get_edges only mutates edge_dict, so iterate over the phrase lists directly
# instead of going through DataFrame.apply (no throwaway result, and older
# pandas releases may evaluate the first row more than once in apply).
for phrase_lst in data['Phrases']:
    get_edges(phrase_lst)

In [30]:
#edge_dict

In [31]:
# Keeps only the edges seen at least `threshold` times. Every source phrase
# keeps its (possibly empty) inner dictionary.
threshold = 300
edge_filtered = {
    phrase: {
        inner_phrase: count
        for inner_phrase, count in phrase_counts.items()
        if not count < threshold
    }
    for phrase, phrase_counts in edge_dict.items()
}

In [32]:
edge_filtered

{'positive definite': {},
 'numerical integration': {},
 'machine language': {},
 'operations research': {},
 'power series': {},
 'numerical method': {'numerical results': 324},
 'linear function': {},
 'network theory': {},
 'inverse problem': {},
 'steady state': {},
 'negative feedback': {},
 'angular frequency': {},
 'inventory control': {},
 'chemical process': {},
 'north american': {},
 'standard language': {},
 'san diego': {},
 'error detection': {},
 'world war': {},
 'standard model': {},
 'pulse repetition frequency': {},
 'ordinary differential equations': {},
 'differential equations': {'differential equation': 326},
 'quadratic equation': {},
 'electric field': {},
 'space charge': {},
 'reflection coefficient': {},
 'statistical model': {},
 'jet propulsion laboratory': {},
 'conditional probability': {},
 'false alarm': {},
 'likelihood ratio': {},
 'impulse response': {},
 'closed form': {},
 'level crossings': {},
 'statistical theory': {},
 'dependent variable': {}

In [33]:
import csv

In [34]:
# Test: restrict NodeData to the nodes that appear in EdgeData

In [35]:
edge_phrases = set()

In [36]:
# Collects both endpoints of every edge that survived the filtering
for phrase, phrase_counts in edge_filtered.items():
    if phrase_counts:
        edge_phrases.add(phrase)
        edge_phrases.update(phrase_counts)

In [37]:
edge_phrases

{'a case study',
 'access control',
 'access network',
 'access networks',
 'activation function',
 'activation functions',
 'ad hoc',
 'adaptive control',
 'admission control',
 'amino acid',
 'amino acids',
 'an ad hoc network',
 'an adaptive',
 'an improved',
 'an open source',
 'an overview',
 'analytical model',
 'analytical results',
 'anomaly detection',
 'approximation algorithm',
 'approximation algorithms',
 'approximation ratio',
 'artificial intelligence',
 'artificial neural network',
 'artificial neural network ann',
 'artificial neural networks',
 'association rule mining',
 'association rules',
 'asymptotic stability',
 'augmented reality',
 'augmented reality ar',
 'automatic speech recognition',
 'automatic speech recognition asr',
 'back propagation',
 'base station',
 'base stations',
 'based access control',
 'based algorithm',
 'based approach',
 'based method',
 'based methods',
 'benchmark datasets',
 'benchmark functions',
 'benchmark problems',
 'ber performan

In [38]:
label_counts = {}
def get_label_counts(x):
    """Count the phrases of ``x`` that are present in ``edge_phrases``.

    ``x`` is an iterable of phrases. Phrases that are not in the global
    ``edge_phrases`` collection (i.e. not in the EdgeData) are ignored; for
    the others the global ``label_counts`` entry is incremented, starting
    from zero. Returns None.
    """
    for label in x:
        if label in edge_phrases:
            label_counts[label] = label_counts.get(label, 0) + 1

# get_label_counts only mutates label_counts, so iterate over the phrase lists
# directly instead of going through DataFrame.apply (no throwaway result, and
# older pandas releases may evaluate the first row more than once in apply).
for phrase_lst in data['Phrases']:
    get_label_counts(phrase_lst)

# Most frequent phrases first (ties keep their first-seen order)
label_counts = dict(sorted(label_counts.items(), key=lambda item: item[1], reverse=True))

In [39]:
# Node table limited to the phrases counted above: the phrase is both ID and Label
labels = (
    pd.DataFrame.from_dict(label_counts, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'ID'})
    .assign(Label=lambda frame: frame['ID'])
    [['ID', 'Label', 'Count']]
)
labels

Unnamed: 0,ID,Label,Count
0,proposed method,proposed method,90379
1,proposed approach,proposed approach,44371
2,proposed algorithm,proposed algorithm,40898
3,a case study,a case study,32366
4,recent years,recent years,28936
...,...,...,...
713,an ad hoc network,an ad hoc network,487
714,interference alignment ia,interference alignment ia,462
715,obtainable from cpc program library queens,obtainable from cpc program library queens,441
716,based access control,based access control,433
