# Clean SJR data and convert to parquet
- Data downloaded from: https://www.scimagojr.com/journalrank.php

## setup

In [1]:
from pyspark.sql import SQLContext
#import pandas as pd
from pyspark.sql.functions import *
import pyspark.sql

sqlC = SQLContext(sc)
print('done')

done


In [2]:
import pandas as pd
import string
from collections import Counter
import re
import os

## load and clean journal info

In [4]:
cols = ['Rank', 'Sourceid', 'Title', 'Type', 'SJR', 'SJR Best Quartile',
       'H index', 'Total Cites (3years)', 'Citable Docs. (3years)',
       'Cites / Doc. (2years)', 'Country', 'Categories']
files = os.listdir('./')
for f in files:
    if f.endswith('csv'):
        print(f)
        year = int(f.split(' ')[1].split('.')[0])
        df = pd.read_csv('./'+f, sep=';', decimal=',')
        df['year'] = year
        df['quartile'] = pd.to_numeric(df['SJR Best Quartile'].str[1:], errors='coerse')
        df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_'
                                                      ).str.replace('(', ''
                                                      ).str.replace(')', ''
                                                      ).str.replace('/', ''
                                                      ).str.replace('.', ''
                                                      )

        df.to_csv('./sjr_'+str(year)+'.tsv', sep='\t', index=False)
    
df.head()

scimagojr 2007.csv
scimagojr 2014.csv
scimagojr 2013.csv
scimagojr 1999.csv
scimagojr 2004.csv
scimagojr 2008.csv
scimagojr 2006.csv
scimagojr 2017.csv
scimagojr 2000.csv
scimagojr 2002.csv
scimagojr 2005.csv
scimagojr 2011.csv
scimagojr 2015.csv
scimagojr 2010.csv
scimagojr 2016.csv
scimagojr 2012.csv
scimagojr 2001.csv
scimagojr 2009.csv
scimagojr 2003.csv


Unnamed: 0,rank,sourceid,title,type,issn,sjr,sjr_best_quartile,h_index,total_docs_2003,total_docs_3years,total_refs,total_cites_3years,citable_docs_3years,cites__doc_2years,ref__doc,country,publisher,categories,year,quartile
0,1,20651,Annual Review of Immunology,journal,"07320582, 15453278",37.995,Q1,274,25,82,5272,4682,81,55.16,210.88,United States,Annual Reviews Inc.,Immunology (Q1); Immunology and Allergy (Q1),2003,1.0
1,2,16801,Annual Review of Biochemistry,journal,"15454509, 00664154",35.186,Q1,268,25,81,4453,3234,81,39.1,178.12,United States,Annual Reviews Inc.,Biochemistry (Q1),2003,1.0
2,3,18434,Cell,journal,"00928674, 10974172",28.284,Q1,682,368,1084,14525,29831,1055,26.88,39.47,United States,Cell Press,"Biochemistry, Genetics and Molecular Biology (...",2003,1.0
3,4,29719,Reviews of Modern Physics,journal,"00346861, 15390756",26.67,Q1,284,39,95,8816,2945,94,33.25,226.05,United States,American Physical Society,Physics and Astronomy (miscellaneous) (Q1),2003,1.0
4,5,18395,Annual Review of Cell and Developmental Biology,book series,"15308995, 10810706",24.175,Q1,199,28,73,3891,1866,71,22.63,138.96,United States,Annual Reviews Inc.,Cell Biology (Q1); Developmental Biology (Q1),2003,1.0


In [5]:
sjr = sqlC.read.csv('sjr/*.tsv', sep='\t', header=True, inferSchema=True)
sjr.printSchema()

root
 |-- rank: string (nullable = true)
 |-- sourceid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- issn: string (nullable = true)
 |-- sjr: string (nullable = true)
 |-- sjr_best_quartile: string (nullable = true)
 |-- h_index: string (nullable = true)
 |-- total_docs_2003: string (nullable = true)
 |-- total_docs_3years: string (nullable = true)
 |-- total_refs: string (nullable = true)
 |-- total_cites_3years: string (nullable = true)
 |-- citable_docs_3years: string (nullable = true)
 |-- cites__doc_2years: string (nullable = true)
 |-- ref__doc: string (nullable = true)
 |-- country: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- year: string (nullable = true)
 |-- quartile: string (nullable = true)



In [6]:
sjr.count()

509906

## save 

In [7]:
sjr.write.parquet('sjr_ranks.parquet', mode='overwrite')