# Sociology concept popularity
- This notebook queries Web of Science data to measure the popularity of sociology concepts over time.
- Unfortunately this code is built for proprietary data and systems. :( 
- Concepts chosen from: https://journals.sagepub.com/doi/full/10.1177/0003122419846628

## Packages

In [1]:
from pyspark.sql import SQLContext
import pandas as pd
from collections import Counter
from pyspark.sql.functions import *
import pyspark.sql

In [2]:
# Entry point for reading data with Spark SQL.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the PySpark kernel/shell — confirm before running
# in another environment.
sqlC = SQLContext(sc)

## Data

In [4]:
# Load the cleaned Web of Science records from Parquet as a Spark DataFrame.
cites = sqlC.read.parquet("wos_core_clean.parquet")
# Bare last expression: display the available column names.
cites.columns

['UID',
 'keywords',
 'references',
 'full_abstract',
 'all_lang',
 'pubyear',
 'has_abstract',
 'pubtype',
 'subjects',
 'subheadings',
 'headings',
 'item_title',
 'journal',
 'issn',
 'isbn',
 'eissn',
 'eisbn',
 'doi',
 'bare_text']

In [5]:
# Keep a single row per record id (UID) so no paper is counted more than once.
cites = cites.dropDuplicates(subset=['UID'])
# Optional row-count check, left disabled (count() is a Spark action, so it
# forces the read and de-duplication to execute):
#cites.count()

## Terms of interest

In [6]:
# Lower-case search phrases; they are matched as substrings of the lower-cased
# title / abstract text further down.
# NOTE(review): the saved outputs in this notebook show 'overworked americans',
# so they were produced with an earlier version of this list — re-run to refresh.
terms = [
    'intersectional',
    'bell curve',
    'second shift',
    'bowling alone',
    'clash of civilizations',
    'creative class',
    'overworked american',
    'culture of fear',
]
len(terms)

8

## Count papers using the concepts

In [9]:
# Result holders keyed by search term; each value becomes a pandas DataFrame of
# per-year counts (one dict for title matches, one for abstract matches).
titles = {}
abstracts = {}

In [12]:
# For every term, count per publication year the records whose lower-cased
# title (then abstract) contains the term, and pull the small result to pandas.
for t in terms:
    print('working on:', t)
    for text_column, per_term in (('item_title', titles),
                                  ('full_abstract', abstracts)):
        matching = cites.filter(lower(col(text_column)).contains(t))
        per_term[t] = matching.groupby('pubyear').count().toPandas()

('working on:', 'intersectional')
('working on:', 'bell curve')
('working on:', 'second shift')
('working on:', 'bowling alone')
('working on:', 'clash of civilizations')
('working on:', 'creative class')
('working on:', 'overworked americans')
('working on:', 'culture of fear')


In [19]:
# Build one wide table of yearly paper counts: one row per publication year and
# one "<term> in title" / "<term> in abstract" column per term.
# Seed with an explicitly empty 'pubyear' key column; every per-term frame is
# then outer-merged onto it so that no year is lost.
together = pd.DataFrame({'pubyear': pd.Series(dtype='int64')})
for t in terms:
    for counts_by_term, field in ((titles, 'title'), (abstracts, 'abstract')):
        per_year = counts_by_term[t].copy()
        # Positional rename: each frame is the two-column result of
        # groupby('pubyear').count(), i.e. [pubyear, count].
        per_year.columns = ['pubyear', t + ' in ' + field]
        together = together.merge(per_year, on='pubyear', how='outer')

# A year absent from a term's frame means zero matching papers, hence fill with 0.
# NOTE(review): fillna(0) would also turn a null pubyear into year 0.
# Cast to int before sorting so rows are ordered numerically by year.
together = together.fillna(0).astype(int).sort_values(by='pubyear')
together.head()

Unnamed: 0,pubyear,intersectional in title,intersectional in abstract,bell curve in title,bell curve in abstract,second shift in title,second shift in abstract,bowling alone in title,bowling alone in abstract,clash of civilizations in title,clash of civilizations in abstract,creative class in title,creative class in abstract,overworked americans in title,overworked americans in abstract,culture of fear in title,culture of fear in abstract
0,1951,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1956,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31,1966,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23,1968,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20,1970,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# The table is sorted by pubyear ascending, so the tail shows the latest years.
together.tail()

Unnamed: 0,pubyear,intersectional in title,intersectional in abstract,bell curve in title,bell curve in abstract,second shift in title,second shift in abstract,bowling alone in title,bowling alone in abstract,clash of civilizations in title,clash of civilizations in abstract,creative class in title,creative class in abstract,overworked americans in title,overworked americans in abstract,culture of fear in title,culture of fear in abstract
4,2014,105,191,2,6,1,4,2,2,6,10,22,22,3,0,0,0
45,2015,170,330,2,8,2,13,5,3,8,17,19,44,5,0,1,4
10,2016,198,385,0,7,1,10,0,0,3,12,11,28,3,0,0,7
19,2017,252,546,1,11,2,12,3,2,9,6,7,22,0,0,2,4
13,2018,311,708,0,11,2,6,0,2,3,10,7,39,0,0,3,10


In [21]:
# Save as tab-separated text without the (unsorted, meaningless) row index.
# The data/ directory must already exist; to_csv does not create it.
together.to_csv('data/soc_concept_counts_wos.tsv', sep='\t', index=False)

## Count papers citing the concepts

In [42]:
# One row per (citing record, cited reference): `pubyear` is the year of the
# citing record, `UID` is the id of the reference it cites.
exploded = cites.select('pubyear',
                        explode('references.reference').alias('r'))
# Cached because the table is joined against repeatedly in the loop that follows.
refs = exploded.select('pubyear', 'r.UID').cache()
refs.show()

+-------+--------------------+
|pubyear|                 UID|
+-------+--------------------+
|   1997|WOS:0000583917000...|
|   1997|WOS:0000583917001...|
|   1997|WOS:0001746017000...|
|   1997|WOS:0000586620000...|
|   1997|WOS:0000586620000...|
|   1997|WOS:0000586620000...|
|   1997|WOS:0000603220000...|
|   1997|WOS:0000603220000...|
|   1997|WOS:0000603220000...|
|   1997|WOS:0000603220000...|
|   1997|WOS:0000603220000...|
|   1997|   000365728600009.5|
|   1997| WOS:A1996VJ15900002|
|   1997|WOS:0000604356000...|
|   1997| 000330694200008.180|
|   1997|WOS:0001765478000...|
|   1997|WOS:0000604356000...|
|   1997|WOS:0000604356000...|
|   1997|WOS:0000604356000...|
|   1997|WOS:0000604356000...|
+-------+--------------------+
only showing top 20 rows



In [43]:
# For every term, find the records whose title (then abstract) contains it,
# match their UIDs against the cited-reference table, and count the matches per
# citing year.
# NOTE: this overwrites the `titles` / `abstracts` dicts filled by the
# usage-count loop, so cells must be run top to bottom.
# NOTE(review): `refs` carries no citing-record id, so this counts citation
# links; a paper citing several matching records is counted once per link.
for t in terms:
    print('working on:', t)
    for text_column, per_term in (('item_title', titles),
                                  ('full_abstract', abstracts)):
        concept_ids = cites.filter(lower(col(text_column)).contains(t)
                                  ).select('UID')
        citing_rows = concept_ids.join(refs, on='UID', how='inner')
        per_term[t] = citing_rows.groupby('pubyear').count().toPandas()

('working on:', 'intersectional')
('working on:', 'bell curve')
('working on:', 'second shift')
('working on:', 'bowling alone')
('working on:', 'clash of civilizations')
('working on:', 'creative class')
('working on:', 'overworked americans')
('working on:', 'culture of fear')


In [44]:
# Build one wide table of yearly citation counts: one row per citing year and
# one "<term> in title" / "<term> in abstract" column per term.
# Seed with an explicitly empty 'pubyear' key column; every per-term frame is
# then outer-merged onto it so that no year is lost.
together2 = pd.DataFrame({'pubyear': pd.Series(dtype='int64')})
for t in terms:
    for counts_by_term, field in ((titles, 'title'), (abstracts, 'abstract')):
        per_year = counts_by_term[t].copy()
        # Positional rename: each frame is the two-column result of
        # groupby('pubyear').count(), i.e. [pubyear, count].
        per_year.columns = ['pubyear', t + ' in ' + field]
        together2 = together2.merge(per_year, on='pubyear', how='outer')

# A year absent from a term's frame means zero citations, hence fill with 0.
# NOTE(review): fillna(0) would also turn a null pubyear into year 0.
# Cast to int before sorting so rows are ordered numerically by year.
together2 = together2.fillna(0).astype(int).sort_values(by='pubyear')
together2.head()

Unnamed: 0,pubyear,intersectional in title,intersectional in abstract,bell curve in title,bell curve in abstract,second shift in title,second shift in abstract,bowling alone in title,bowling alone in abstract,clash of civilizations in title,clash of civilizations in abstract,creative class in title,creative class in abstract,overworked americans in title,overworked americans in abstract,culture of fear in title,culture of fear in abstract
0,1951,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33,1957,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41,1965,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1967,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23,1968,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
# The table is sorted by pubyear ascending, so the tail shows the latest years.
together2.tail()

Unnamed: 0,pubyear,intersectional in title,intersectional in abstract,bell curve in title,bell curve in abstract,second shift in title,second shift in abstract,bowling alone in title,bowling alone in abstract,clash of civilizations in title,clash of civilizations in abstract,creative class in title,creative class in abstract,overworked americans in title,overworked americans in abstract,culture of fear in title,culture of fear in abstract
4,2014,1013,1825,25,221,12,192,36,30,121,101,409,651,6,0,2,34
48,2015,2084,3611,26,231,25,253,37,24,184,157,404,767,8,0,9,64
10,2016,2566,4561,28,246,27,273,41,36,209,173,297,676,13,0,5,64
19,2017,2966,5444,29,242,31,266,46,34,187,188,417,827,2,0,3,71
13,2018,3969,7764,25,294,35,278,59,51,179,146,389,946,3,0,6,62


In [46]:
# Save as tab-separated text without the (unsorted, meaningless) row index.
# The data/ directory must already exist; to_csv does not create it.
together2.to_csv('data/soc_concept_cited_counts_wos.tsv', sep='\t', index=False)

## Fin