# Using phrasal segmentation results to analyze phrases over time

In [12]:
import pandas as pd
import re
from glob import glob
import numpy as np
import json
import altair as alt

## Initial loading + processing of dataframes and dictionaries

In [3]:
# Load the AutoPhrase phrase table; the first CSV column becomes the row index.
# The preview below shows the columns: Phrase Quality, Phrase, Year, Num Words.
fp_phrases = '../results/dblp-v10-grouped/phrases.csv'
phrases = pd.read_csv(filepath_or_buffer=fp_phrases, index_col=0)
phrases.head(n=5)

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
0,0.9915,operations research,1950-1959,2
1,0.6505,operations research society of america,1950-1959,5
2,0.5735,high speed,1950-1959,2
3,0.5255,operations research society,1950-1959,3
4,0.981,tunnel diode,1960-1964,2


In [4]:
# Distinct year-range labels present in the phrase table
year_labels = phrases['Year'].unique()
year_labels

array(['1950-1959', '1960-1964', '1965-1969', '1970-1974', '1975-1979',
       '1980-1984', '1985-1989', '1990-1994', '1995-1999', '2000-2004',
       '2005-2009', '2010-2014', '2015-2017'], dtype=object)

In [5]:
# Processed phrasal segmentation results dataframe.
# Every CSV in the results folder whose path contains 'segmented' is loaded,
# rows with missing values are dropped, and the number of comma-separated
# phrases in each row is stored in 'Num Phrases'.
infolder = '../results/dblp-v10-grouped'
# Sorted so that row order (and anything derived from it, such as the order of
# year ranges) does not depend on the arbitrary order glob returns files in.
subfolders = sorted(fp for fp in glob(infolder + '/*.csv') if 'segmented' in fp)

seg_columns = ['Phrases', 'Year', 'Num Phrases']
frames = []
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0)
    df = df.dropna()
    # Element-wise on the single column instead of a row-wise DataFrame.apply
    df['Num Phrases'] = df['Phrases'].map(lambda s: len(s.split(',')))
    frames.append(df)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call; concatenate all pieces once instead.
if frames:
    seg = pd.concat(frames, ignore_index=True)
    # Keep the expected columns first, followed by any extra columns the files
    # carry, matching the layout produced by appending to an empty frame.
    extra_columns = [col for col in seg.columns if col not in seg_columns]
    seg = seg.reindex(columns=seg_columns + extra_columns)
else:
    # No segmented files found: fall back to an empty frame with the expected
    # columns so downstream cells still see the same schema.
    seg = pd.DataFrame(columns=seg_columns)
seg = seg.dropna()
seg.head()

Unnamed: 0,Phrases,Year,Num Phrases
0,"paper,wheatstone,bridge,tangent,triangle,trian...",1950-1959,14
1,"numerical integration,differential equations,o...",1950-1959,6
2,fur,1950-1959,1
3,"computing,computing,amplifier,high,amplifiers,...",1950-1959,8
4,"operations research,journal,operations researc...",1950-1959,5


In [6]:
# Per-year-range phrase tallies: {year range: {phrase: number of occurrences}}
counts = {yr: {} for yr in seg['Year'].unique()}


def add_counts(x):
    """Add every comma-separated phrase in row ``x`` to its year range's tally.

    Reads ``x['Phrases']`` and ``x['Year']`` and updates the module-level
    ``counts`` dictionary in place; returns nothing.
    """
    tally = counts[x['Year']]
    for term in x['Phrases'].split(','):
        tally[term] = tally.get(term, 0) + 1


# Called only for its side effect on ``counts``; the return value is discarded
_ = seg.apply(add_counts, axis=1)

# Reorder each year range's tally from most to least frequent phrase
for yr in counts:
    ranked = sorted(counts[yr].items(), key=lambda pair: pair[1], reverse=True)
    counts[yr] = dict(ranked)

In [7]:
# Same layout as ``counts`` but keeping only phrases made of two or more
# whitespace-separated words
multi_counts = {}
for year_range in counts:
    kept = {}
    for phrase, n in counts[year_range].items():
        if len(phrase.split()) >= 2:
            kept[phrase] = n
    multi_counts[year_range] = kept

In [49]:
# Same layout as ``counts`` but each value is the phrase's share of all phrase
# occurrences in its year range (count divided by the year range's total),
# rather than the raw frequency
counts_per = {}
for year_range, phrase_counts in counts.items():
    total_count_yr = sum(phrase_counts.values())
    counts_per[year_range] = {
        phrase: n / total_count_yr for phrase, n in phrase_counts.items()
    }

## Start of phrases over time EDA + figures

In [9]:
# TODO:
# Ridgeline plot where y-axis is a Phrase, x-axis is the Year Range, height showing the frequency or percentage of each phrase

In [61]:
# Candidate phrases to track over time; extend this list after browsing the
# count dictionaries
interesting_phrases = [
    'cloud computing',
    'machine learning',
    'neural network',
    'data mining',
    'social networks',
    'image processing',
    'computer vision',
    'user experience',
    'genetic algorithms',
    'computer science',
    'virtual reality',
]

In [62]:
# Looking at count (raw frequency) of each phrase across year ranges.
# Builds a long-format frame: one row per (phrase, year range) pair.
phrases_to_graph = ['machine learning', 'computer vision', 'neural network']

# A phrase that never occurs in a year range has a count of 0 rather than a
# missing value. Collecting the rows first and constructing the frame once
# avoids growing it row by row and lets pandas infer a numeric dtype for Count.
rows = [
    (year_range, year_counts.get(phrase, 0), phrase)
    for phrase in phrases_to_graph
    for year_range, year_counts in counts.items()
]
data = pd.DataFrame(rows, columns=['Year Range', 'Count', 'Phrase'])

In [63]:
# Caveat: raw counts depend on how many papers fall in each year range.
# The number of papers grows over time, so the frequency rises for almost
# every phrase regardless of its relative popularity.
alt.Chart(data).mark_line().encode(
    x='Year Range',
    y='Count',
    color='Phrase',
)

In [86]:
# Looking at percentage of each phrase across year ranges.
# Builds a long-format frame: one row per (phrase, year range) pair. The
# 'Percent' column holds the values taken from ``counts_per``.
phrases_to_graph = ['machine learning', 'computer vision', 'neural network']

# A phrase absent from a year range contributes 0.0 directly, so no fillna pass
# over object-dtype columns is needed afterwards, and the frame is constructed
# once instead of being grown one row at a time.
rows = [
    (year_range, year_shares.get(phrase, 0.0), phrase)
    for phrase in phrases_to_graph
    for year_range, year_shares in counts_per.items()
]
data = pd.DataFrame(rows, columns=['Year Range', 'Percent', 'Phrase'])

In [87]:
# One line per phrase across year ranges, with the y-axis formatted as '%'
percent_axis = alt.Axis(format='%')
alt.Chart(data).mark_line().encode(
    x='Year Range',
    y=alt.Y('Percent', axis=percent_axis),
    color='Phrase',
)

In [101]:
# Interactive version of the percentage line chart: hovering highlights the
# values at the nearest 'Year Range'.

# Single-value selection driven by mouseover, keyed on 'Year Range' and snapping
# to the nearest point; empty='none' means nothing is selected until the first hover.
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['Year Range'], empty='none')

# Base layer: one smoothed ('basis' interpolation) line per phrase.
line = alt.Chart(data).mark_line(interpolate='basis').encode(
    x='Year Range',
    y=alt.Y('Percent', axis=alt.Axis(format='%')),
    color='Phrase')

# Fully transparent points (opacity 0) whose only role is to carry the selection.
selectors = alt.Chart(data).mark_point().encode(
    x='Year Range',
    opacity=alt.value(0)).add_selection(nearest)

# Points derived from the line chart, visible only where the selection matches.
points = line.mark_point().encode(opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
    
# Text labels showing 'Percent' next to the selected points, a blank string elsewhere.
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Percent', alt.value(' ')))
    
# Gray rule drawn only at the selected 'Year Range'.
rules = alt.Chart(data).mark_rule(color='gray').encode(
    x='Year Range').transform_filter(nearest)
    
# Stack all layers into one 600x300 chart.
alt.layer(line, selectors, points, rules, text).properties(width=600, height=300)

In [90]:
# Semi-transparent, unstacked (stack=None) area per phrase, with tooltips and
# the chart's interactive mode enabled
area = alt.Chart(data).mark_area(opacity=0.5)
area.encode(
    x='Year Range',
    y=alt.Y('Percent', stack=None),
    color='Phrase',
    tooltip=['Phrase', 'Year Range', 'Percent'],
).interactive()