## Loading Data to prepare for Topic Modeling

This is the first notebook in the Topic Modeling series of notebooks.

In [1]:
## General Dependencies
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys, os
import glob
from tika import parser # pip install tika
import inspect
import datetime
import pickle5 as pickle

## Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim import models
#from gensim.models.coherencemodel import CoherenceModel
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.models import ldaseqmodel


## Preprocessing
import spacy
import nltk as nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
## Start from NLTK's English stop word list, then append extra filler terms in place
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

## Plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import ast

## Other Libraries
from operator import itemgetter

## ScikitLearn
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

  from PIL import PILLOW_VERSION
  from PIL import PILLOW_VERSION


### Opening data: if the data are a folder of full-text PDFs

In [18]:
## Use the glob method to retrieve files/pathnames in the directory
## https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/

directory = "data/10_News_All_Final"

# ## Other file directories
# ## "data/10_News_All_Final"
# ## "data/1_News_Industry"
# ## "data/2_Government_Documents"
# ## "data/3_Altmetric_Policy"

## glob returns paths in arbitrary, OS-dependent order. Sort them so that the row order of
## the dataframes built from this list (and therefore any keep='last' de-duplication and
## index-based identifiers) is the same on every run and every machine.
files = sorted(glob.glob(os.path.join(directory, '*.*')))

## An empty list would otherwise only surface later as a confusing error on the dataframe
if not files:
    print("WARNING: no files found in", repr(directory))

## Inspect output as needed
print(files)

## Other Resources
## https://stackoverflow.com/questions/34000914/how-to-create-a-list-from-filenames-in-a-user-specified-directory-in-python
## https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
## https://stackoverflow.com/questions/33912773/python-read-txt-files-into-a-dataframe



In [19]:
## Run every file through Tika Python and collect the parse results in a list
## Each parse result is a dictionary with: metadata, content, status

document_list = [parser.from_file(path) for path in files]

## Resources
## https://www.geeksforgeeks.org/parsing-pdfs-in-python-with-tika/
## https://stackoverflow.com/questions/34837707/how-to-extract-text-from-a-pdf-file

In [20]:
## Build a pandas dataframe from the list of Tika result dictionaries
## (one row per entry of document_list, one column per dictionary key)

text_df = pd.DataFrame(document_list)

## Inspect the output as needed
# text_df.head()
# print(text_df["metadata"][1])
# print(text_df["content"][1])
# text_df.to_csv('gensim_test.csv')

In [21]:
## Report how many rows the dataframe holds

text_df_count_row = len(text_df.index)  # number of rows in text_df

## Inspect output as needed
print(text_df_count_row)

838


In [22]:
## Create a new column in the dataframe called "title" and populate it with the title from the metadata key called dc:title
## Rows whose metadata is not a dictionary (e.g. NaN because no metadata came back for that
## file) get a title of None instead of raising AttributeError on .get().

text_df['title'] = [
    meta.get('dc:title') if isinstance(meta, dict) else None
    for meta in text_df["metadata"]
]

## Inspect output as needed
text_df.head()

## Resources
## https://stackoverflow.com/questions/44218812/pandas-add-columns-to-a-dataframe-based-in-dict-from-one-of-the-columns

## If needed, you can review the contents of the metadata column using this code:
## print(text_df['metadata'])
## my_dict.keys()[0]     -> key of "first" element
## my_dict.values()[0]   -> value of "first" element
## my_dict.items()[0]    -> (key, value) tuple of "first" element
##list(contacts.items())[0]

# for v in text_df['metadata']:
#     new = list(v.items())[1]
#     print(new)
   

Unnamed: 0,metadata,content,status,title
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,New Study Shows Limited Screening for Hypertro...
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Young Athletes To Be Screened For Heart Condit...
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,From the Legal Bar to the Health Bar
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,American Heart Association: Higher Coronary Ar...
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,HIGHER CORONARY ARTERY CALCIUM LEVELS IN MIDDL...


In [9]:
## Create a new column in the dataframe called "file_name" and populate it with the value from the metadata key called "resourceName"
## Rows whose metadata is not a dictionary (e.g. NaN because no metadata came back for that
## file) get a file_name of None instead of raising AttributeError on .get().

text_df['file_name'] = [
    meta.get('resourceName') if isinstance(meta, dict) else None
    for meta in text_df["metadata"]
]

## Inspect output as needed

text_df.head()

## Resources
## https://stackoverflow.com/questions/44218812/pandas-add-columns-to-a-dataframe-based-in-dict-from-one-of-the-columns

## If needed, you can review the contents of the metadata column using this code:
## print(text_df['metadata'])
## my_dict.keys()[0]     -> key of "first" element
## my_dict.values()[0]   -> value of "first" element
## my_dict.items()[0]    -> (key, value) tuple of "first" element
##list(contacts.items())[0]

# for v in text_df['metadata']:
#     new = list(v.items())[1]
#     print(new)

Unnamed: 0,metadata,content,status,title,file_name
0,"{'Application-Name': 'Microsoft Office Word', ...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,,b'R - Allen Pratt CDH Lurie peds.doc'
1,"{'Application-Name': 'Microsoft Office Word', ...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,,b'R - Arie Habis - CDH Lurie Peds.doc'
2,"{'Application-Name': 'Microsoft Office Word', ...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,,b'R - Bauer and Karen Lake Forest Grays Lake.doc'
3,"{'Application-Name': 'Microsoft Office Word', ...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,,b'R - Bonomo - Residency programs west.doc'
4,"{'Application-Name': 'Microsoft Office Word', ...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,,b'R - Chawla - CDH - Lurie ca center.doc'


In [23]:
## Change the data type of the values in the file_name column to string (if it isn't already).
## This cell needs the "file_name" column to exist already; indexing text_df['file_name']
## raises a KeyError otherwise.

text_df['file_name'] = text_df['file_name'].astype(str)

## Inspect output as needed
text_df.head()  # not the last expression of the cell, so nothing is displayed for it
print(type(text_df['file_name'][1]))  # Python type of one value from the column
print(text_df.dtypes)  # dtype of every column

KeyError: 'file_name'

In [None]:
## Remove the b'...' (or b"...") wrapper around the file name.
## str.strip("b\'\"") treats its argument as a SET of characters, so it would also eat a
## genuine leading/trailing "b" of the name itself (e.g. "b'bmj.pdf'" -> "mj.pdf").
## The anchored regex below removes only the wrapper: an optional leading b, the opening
## quote, and the matching closing quote. Values without such a wrapper are left unchanged.
text_df['file_name'] = text_df['file_name'].str.replace(
    r"""^b?(['"])(.*)\1$""", r"\2", regex=True
)

## Inspect output as needed
text_df.head()

## Other method
## text_df['file_name'] = text_df['file_name'].str.decode('utf-8')
## text_df.head()

## Resources
## https://stackoverflow.com/questions/46696679/removing-b-from-string-column-in-a-pandas-dataframe
## https://stackoverflow.com/questions/61970212/pandas-decoding-a-string-returns-nan

In [None]:
## Report how many rows the dataframe holds

text_df_count_row = len(text_df.index)  # number of rows in text_df
print(text_df_count_row)

In [None]:
## This code was written to inspect the "file_name" column, but is no longer needed for this workflow. 
## Sort the dataframe by values in the "file_name" column
## Notice that some file_names have .pdf or other file type included...this will cause problems
## text_df = text_df.sort_values(by=['file_name'], ascending=False)
## text_df.head()
## Resources
## https://stackoverflow.com/questions/37787698/how-to-sort-pandas-dataframe-from-one-column

## Inspect the output as needed
# print(text_df['file_name'][0])
# print(text_df['file_name'][1])
# print(text_df['file_name'][2])
# print(text_df['file_name'][3])

### Opening data: if data is a pickle file

In [13]:
## Check working directory
# cwd = os.getcwd()
# print(cwd)

In [14]:
## path_to_protocol5 = "C:\\Users\\keg827\\AppData\\Local\\Continuum\\anaconda3\\Lib\\site-packages\\pickle5"
## https://www.geeksforgeeks.org/dataframe-read_pickle-method-in-pandas/
# with open(path_to_protocol5, "rb") as fh:
# altmetric_policy_df = pd.read_pickle(r'C:\Users\keg827\Documents\10_Github_Repos\CARDIAimpactanalysis\data\policy_fulltext (1).pkl', compression=None)
   

In [15]:
## Check recursion depth
## https://riptutorial.com/python/example/17855/increasing-the-maximum-recursion-depth
# def cursing(depth):
#   try:
#     cursing(depth + 1) # actually, re-cursing
#   except RuntimeError as RE:
#     print('I recursed {} times!'.format(depth))
# cursing(0)

In [16]:
# import sys
# print(sys.getrecursionlimit())

In [17]:
# sys.setrecursionlimit(4000)

In [18]:
# filename = "C:\\Users\\keg827\\Documents\\10_Github_Repos\\CARDIAimpactanalysis\\data\\policy_fulltext (1).pkl"
# infile = open(filename,'rb')
# new_dict = pickle.load(infile)
# infile.close()

In [19]:
# print(type(new_dict))
# new_dict.head()

In [20]:
## Upload Altmetric data
## pmid_list = pd.read_csv(r"C:\Users\keg827\Documents\10_Github_Repos\pubmedbiopython\cardiaids.csv", encoding= 'unicode_escape')
# altmetric_policy_df = pd.read_csv(r"C:\Users\keg827\Documents\10_Github_Repos\CARDIAimpactanalysis\data\policy_fulltext.csv")
# altmetric_policy_df.head()

## print(altmetric_policy_df['metadata'][1])

### Get Creation Date from Metadata column (if it exists)

In [15]:
## Create a new column called metadata2 which is a reformatted copy of the metadata column from string to object

# text_df['metadata2']=text_df['metadata'].apply(ast.literal_eval)

## Resource(s)
## https://stackoverflow.com/questions/56102724/how-to-convert-string-representation-of-dictionary-in-pandas-dataframe-to-a-new


In [22]:
## Extract the creation-date data from the metadata2 column and create a new column called date

# text_df['date'] = [value.get('Creation-Date') for value in text_df["metadata2"]]
# text_df.head()

In [23]:
## Create a new column called year and extract the year from the date column

## efficient way to extract year from string format date
# text_df['year'] = pd.DatetimeIndex(text_df['date']).year
# text_df.head(50)

### Add Creation Date from outside CSV (if not in PDF metadata)

In [24]:
## Import dataframe from non-core spreadsheet with year column
## The file is read with an explicit ISO-8859-1 encoding.
## NOTE: the column headers are used exactly as they appear in the CSV; later cells address
## 'Source_title ' with a trailing space, so do not strip the header names here.
date_df = pd.read_csv("data/10_News_All_Final_2.csv", encoding='ISO-8859-1') 
date_df.head()
## print(date_df['File_Name'][4])
## date_df.keys()

## print(type(date_df['File_Name'][1]))
## print(date_df.dtypes)

Unnamed: 0,Database,Document_Type,Reference_Detail,CORE,Authors,Title,Year,Publication_Date,Source_title,Page_start,File_Name,Metadata_Title
0,Nexis Uni,News Magazine,CO,NONCORE,H. T. George Wehrfritz; R. W. B. J. Lee; M. R....,Periscope,2003,10-Nov-03,Newsweek,2209 words,CO.100Periscope,Periscope
1,Nexis Uni,News Magazine,CO,NONCORE,,Poor Fitness in Young Adults Associated With L...,2004,5-Jan-04,FDA Consumer,1060 words,CO.99Poor Fitness in Young Adults Associated W...,Poor Fitness in Young Adults Associated With L...
2,Academic Search Complete,News Magazine,CO,NONCORE,"Curry, Andrew",Mother Wit,2009,2009,Diabetes Forecast,66-67,Mother Wit_ Forcast Diabetes Magazine,Mother_Wit_Forecast_Diabetes_Magazine
3,CINAHL,News Magazine,CO,NONCORE,"Stoler, Felicia D.",MYTHS BUSTED!,2016,2016,Natural Solutions,28-31,,
4,Nexis Uni,News Magazine,CO,NONCORE,DayAfter,Drops in income can harm your brain: Study,2019,4-Oct-19,The DayAfter,371 words,CO.11Drops in income can harm your brain_ Stud...,Drops in income can harm your brain: Study


In [25]:
## Report how many rows the dataframe holds

date_df_count_row = len(date_df.index)  # number of rows in date_df
print(date_df_count_row)

770


In [26]:
## Match text_df and date_df based on title and Metadata_title to add year to the text_df dataframe

## pandas matches null join keys against each other (unlike SQL). Without the dropna calls,
## every document with no title would be paired with every spreadsheet row that has no
## Metadata_Title, producing spurious merged rows. Rows with a null key can never be a
## meaningful match in an inner join, so they are removed from both sides first.
merged_df = text_df.dropna(subset=['title']).merge(
    date_df.dropna(subset=['Metadata_Title']),
    left_on='title',
    right_on='Metadata_Title',
    how='inner',
)
merged_df.keys()
# merged_df.head()

## Left Dataframe is text_df
## Right Dataframe is date_df

## Note: An inner join only merges together what the two spreadsheets have in common. Anything not in common will be dropped. 
## Note: A title that occurs more than once on either side yields one merged row per pairing,
## so merged_df can hold more rows than either input.

## Resources
## https://stackoverflow.com/questions/49890305/match-two-columns-from-two-dataframes-and-add-items-from-a-third-column-if-cells
# miscset = miscset.merge(oset, left_on='subset', right_on='some_items', 
#     how='inner').drop(columns='some_items')

Index(['metadata', 'content', 'status', 'title', 'Database', 'Document_Type',
       'Reference_Detail', 'CORE', 'Authors', 'Title ', 'Year',
       'Publication_Date ', 'Source_title ', 'Page_start', 'File_Name',
       'Metadata_Title'],
      dtype='object')

In [27]:
## CANNOT USE THIS because file_name and File_Name are not matchable in this dataset
## Match text_df and date_df based on file_name and File_Name to add year to the text_df dataframe

# merged_df= text_df.merge(date_df, left_on = 'file_name', right_on = 'File_Name', how = 'inner')
# merged_df.keys()
# merged_df.head()

## Left Dataframe is text_df
## Right Dataframe is date_df

## Note: An inner join only merges together what the two spreadsheets have in common. Anything not in common will be dropped. 

## Resources
## https://stackoverflow.com/questions/49890305/match-two-columns-from-two-dataframes-and-add-items-from-a-third-column-if-cells
# miscset = miscset.merge(oset, left_on='subset', right_on='some_items', 
#     how='inner').drop(columns='some_items')

In [27]:
## Report how many rows the merged dataframe holds

merged_df_count_row = len(merged_df.index)  # number of rows in merged_df
print(merged_df_count_row)

972


In [29]:
## Select every row of merged_df that shares its Metadata_Title, Source_title, and Year
## with at least one other row (keep=False flags all members of a duplicate group)

duplicate_df = merged_df.loc[
    merged_df.duplicated(subset=['Metadata_Title', 'Source_title ', 'Year'], keep=False)
]
# duplicate_df.head()

## Report how many rows belong to a duplicate group

duplicate_df_count_row = len(duplicate_df.index)  # number of rows in duplicate_df
print(duplicate_df_count_row)

## Note that some files may have multiple duplicates

363


In [30]:
## De-duplicate merged_df on the combination of "Metadata_Title", "Source_title" and "Year",
## keeping only the last record of each duplicate group

deduplicate_df = merged_df.drop_duplicates(
    subset=['Metadata_Title', 'Source_title ', 'Year'],
    keep='last',
)
print(deduplicate_df)

## Report how many rows remain after de-duplication

deduplicate_df_count_row = len(deduplicate_df.index)  # number of rows in deduplicate_df
print(deduplicate_df_count_row)

## Resources
## https://www.geeksforgeeks.org/python-pandas-dataframe-drop_duplicates/
## https://www.codegrepper.com/code-examples/python/find+duplicated+rows+with+respect+to+multiple+columns+pandas
## https://stackoverflow.com/questions/32093829/remove-duplicates-from-dataframe-based-on-two-columns-a-b-keeping-row-with-max

                                              metadata  \
0    {'Content-Type': 'application/pdf', 'Creation-...   
1    {'Content-Type': 'application/pdf', 'Creation-...   
2    {'Content-Type': 'application/pdf', 'Creation-...   
3    {'Content-Type': 'application/pdf', 'Creation-...   
4    {'Content-Type': 'application/pdf', 'Creation-...   
..                                                 ...   
967  {'Content-Type': 'application/pdf', 'Creation-...   
968  {'Content-Type': 'application/pdf', 'Creation-...   
969  {'Content-Type': 'application/pdf', 'Creation-...   
970  {'Content-Type': 'application/pdf', 'Creation-...   
971  {'Content-Type': 'application/pdf', 'Creation-...   

                                               content  status  \
0    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...     200   
1    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...     200   
2    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...     200   
3    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

In [31]:
## Count the number of duplicates removed from the dataframe:
## row count of merged_df minus the row count left after drop_duplicates.
## Both operands are plain counters set by the earlier counting cells, so those cells
## must have been run (and be up to date) before this one.

duplicates_removed = merged_df_count_row - deduplicate_df_count_row
print(duplicates_removed)

238


In [32]:
## Find any missing data in the dataframe
## The resulting series lists, for every column, how many rows are missing a value in it

deduplicate_df.isna().sum()

## Resources: 
## https://stackoverflow.com/questions/15943769/how-do-i-get-the-row-count-of-a-pandas-dataframe
## https://stackoverflow.com/questions/46864740/selecting-a-subset-using-dropna-to-select-multiple-columns

metadata               0
content                0
status                 0
title                  0
Database               0
Document_Type          0
Reference_Detail       0
CORE                   0
Authors              333
Title                  0
Year                   0
Publication_Date       0
Source_title           0
Page_start             1
File_Name              0
Metadata_Title         0
dtype: int64

In [33]:
## Keep only the rows that have a value in "content", then only those that have a value in "Year"

first_drop_df = deduplicate_df[deduplicate_df['content'].notna()]
text_content_df = first_drop_df[first_drop_df['Year'].notna()]
text_content_df.head()

Unnamed: 0,metadata,content,status,title,Database,Document_Type,Reference_Detail,CORE,Authors,Title,Year,Publication_Date,Source_title,Page_start,File_Name,Metadata_Title
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,New Study Shows Limited Screening for Hypertro...,Nexis Uni,News Newswire,CO,NONCORE,,New Study Shows Limited Screening for Hypertro...,2001,"March 19, 2001, Monday",Business Wire,859 words,Bibliography.102New Study Shows Limited Screen...,New Study Shows Limited Screening for Hypertro...
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Young Athletes To Be Screened For Heart Condit...,Nexis Uni,News Newswire,CO,NONCORE,,Young Athletes To Be Screened For Heart Condit...,2000,"September 15, 2000, Friday",PR Newswire,1352 words,Bibliography.102Young Athletes To Be Screened ...,Young Athletes To Be Screened For Heart Condit...
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,From the Legal Bar to the Health Bar,Nexis Uni,News Magazine,CO,NONCORE,S. Joy,From the Legal Bar to the Health Bar,2019,17-Jun-19,Wealth Management,3262 words,Bibliography.13From the Legal Bar to the Healt...,From the Legal Bar to the Health Bar
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,American Heart Association: Higher Coronary Ar...,Nexis Uni,News Newswire,CO,NONCORE,S. Targeted News,American Heart Association: Higher Coronary Ar...,2019,"June 14, 2019 Friday",Targeted News Service,1070 words,Bibliography.14American Heart Association_ Hig...,American Heart Association: Higher Coronary Ar...
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,HIGHER CORONARY ARTERY CALCIUM LEVELS IN MIDDL...,Nexis Uni,News Newswire,CO,NONCORE,S. States News,HIGHER CORONARY ARTERY CALCIUM LEVELS IN MIDDL...,2019,"June 14, 2019 Friday",States News Service,1268 words,Bibliography.14HIGHER CORONARY ARTERY CALCIUM ...,HIGHER CORONARY ARTERY CALCIUM LEVELS IN MIDDL...


In [34]:
## Report how many rows the dataframe holds

text_content_df_count_row = len(text_content_df.index)  # number of rows in text_content_df
print(text_content_df_count_row)

## Find any missing data in the dataframe
## The resulting series lists, for every column, how many rows are missing a value in it

text_content_df.isna().sum()

## Resources: 
## https://stackoverflow.com/questions/15943769/how-do-i-get-the-row-count-of-a-pandas-dataframe
## https://stackoverflow.com/questions/46864740/selecting-a-subset-using-dropna-to-select-multiple-columns

734


metadata               0
content                0
status                 0
title                  0
Database               0
Document_Type          0
Reference_Detail       0
CORE                   0
Authors              333
Title                  0
Year                   0
Publication_Date       0
Source_title           0
Page_start             1
File_Name              0
Metadata_Title         0
dtype: int64

In [36]:
## Order the documents by YEAR (oldest first) and renumber the index from 0.
## This is VERY important for using the Dynamic Topic Modeling later on. 

final_df = (
    text_content_df
    .sort_values(by='Year')
    .reset_index(drop=True)
)
final_df.head(15)

## Resources
## https://stackoverflow.com/questions/53332116/reset-index-after-sorting-data-frame

Unnamed: 0,metadata,content,status,title,Database,Document_Type,Reference_Detail,CORE,Authors,Title,Year,Publication_Date,Source_title,Page_start,File_Name,Metadata_Title
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...,Nexis Uni,News Newswire,CO,NONCORE,,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...,1990,"December 4, 1990, Tuesday",PR Newswire,536 words,CO.105UCI MEDICAL CENTER AWARDED $1.3 MILLION ...,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST...",Nexis Uni,News Newswire,CO,NONCORE,,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST...",1994,"March 18, 1994, Friday",PR Newswire,380 words,CO.105KAISER PERMANENTE STUDY_ IF YOU'RE OBESE...,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST..."
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT...",Nexis Uni,News Newswire,CO,NONCORE,,"YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT...",1995,"March 10, 1995, Friday",PR Newswire,749 words,"CO.105YOUNG AMERICAN ADULTS BECOMING FATTER, N...","YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT..."
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,New Estimates On The Prevalence Of Hypertrophi...,Nexis Uni,News Newswire,CO,NONCORE,,New Estimates On The Prevalence Of Hypertrophi...,1995,"August 30, 1995 Wednesday",Reuters Health Medical News,385 words,CO.105New Estimates On The Prevalence Of Hyper...,New Estimates On The Prevalence Of Hypertrophi...
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...,Nexis Uni,News Newswire,CO,NONCORE,,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...,1995,"November 7, 1995, Tuesday",PR Newswire,1338 words,CO.105KAISER PERMANENTE CORRECTS MISLEADING S....,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...
5,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Symptoms of societal ills?,Nexis Uni,News Newspaper,CO,NONCORE,R. William,Symptoms of societal ills?,1996,"October 25, 1996, Friday",The Tampa Tribune (Florida),746 words,Commentary.104Symptoms of societal ills_,Symptoms of societal ills?
6,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Hypertension and Race,Nexis Uni,News Newspaper,CO,NONCORE,R. William,Hypertension and Race,1996,"October 25, 1996, Friday, Final Edition",The Washington Post,732 words,Commentary.104Hypertension and Race,Hypertension and Race
7,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Study: Discrimination May Cause Hypertension i...,Nexis Uni,News Newspaper,CO,NONCORE,W. P. S. W. David Brown,Study: Discrimination May Cause Hypertension i...,1996,"October 24, 1996, Thursday, Final Edition",The Washington Post,1160 words,CO.105Study_ Discrimination May Cause Hyperten...,Study: Discrimination May Cause Hypertension i...
8,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"STUDY LINKS RACIAL BIAS, BLOOD PRESSURE",Nexis Uni,News Newspaper,CO,NONCORE,T. W. P. David Brown,"STUDY LINKS RACIAL BIAS, BLOOD PRESSURE",1996,"October 24, 1996, Thursday",Pittsburgh Post-Gazette (Pennsylvania),872 words,"CO.105STUDY LINKS RACIAL BIAS, BLOOD PRESSURE","STUDY LINKS RACIAL BIAS, BLOOD PRESSURE"
9,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Positive Correlation Between Body Weight And F...,Nexis Uni,News Newswire,CO,NONCORE,,Positive Correlation Between Body Weight And F...,1996,"August 1, 1996 Thursday",Reuters Health Medical News,247 words,CO.105Positive Correlation Between Body Weight...,Positive Correlation Between Body Weight And F...


In [37]:
## Add a unique identifier to each document in the dataset.
## The identifier is a copy of the dataframe's index values at the time this cell runs.

final_df['unique_id']=final_df.index
final_df.head(15)

## Resources
## https://stackoverflow.com/questions/44878740/how-do-i-create-a-unique-record-id-in-a-python-dataframe
## df['unique_id'] = df.longstrings.map(hash)

Unnamed: 0,metadata,content,status,title,Database,Document_Type,Reference_Detail,CORE,Authors,Title,Year,Publication_Date,Source_title,Page_start,File_Name,Metadata_Title,unique_id
0,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...,Nexis Uni,News Newswire,CO,NONCORE,,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...,1990,"December 4, 1990, Tuesday",PR Newswire,536 words,CO.105UCI MEDICAL CENTER AWARDED $1.3 MILLION ...,UCI MEDICAL CENTER AWARDED $1.3 MILLION FOR NA...,0
1,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST...",Nexis Uni,News Newswire,CO,NONCORE,,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST...",1994,"March 18, 1994, Friday",PR Newswire,380 words,CO.105KAISER PERMANENTE STUDY_ IF YOU'RE OBESE...,"KAISER PERMANENTE STUDY: IF YOU'RE OBESE, HOST...",1
2,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT...",Nexis Uni,News Newswire,CO,NONCORE,,"YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT...",1995,"March 10, 1995, Friday",PR Newswire,749 words,"CO.105YOUNG AMERICAN ADULTS BECOMING FATTER, N...","YOUNG AMERICAN ADULTS BECOMING FATTER, NOT FIT...",2
3,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,New Estimates On The Prevalence Of Hypertrophi...,Nexis Uni,News Newswire,CO,NONCORE,,New Estimates On The Prevalence Of Hypertrophi...,1995,"August 30, 1995 Wednesday",Reuters Health Medical News,385 words,CO.105New Estimates On The Prevalence Of Hyper...,New Estimates On The Prevalence Of Hypertrophi...,3
4,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...,Nexis Uni,News Newswire,CO,NONCORE,,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...,1995,"November 7, 1995, Tuesday",PR Newswire,1338 words,CO.105KAISER PERMANENTE CORRECTS MISLEADING S....,KAISER PERMANENTE CORRECTS MISLEADING S.F. CHR...,4
5,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Symptoms of societal ills?,Nexis Uni,News Newspaper,CO,NONCORE,R. William,Symptoms of societal ills?,1996,"October 25, 1996, Friday",The Tampa Tribune (Florida),746 words,Commentary.104Symptoms of societal ills_,Symptoms of societal ills?,5
6,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Hypertension and Race,Nexis Uni,News Newspaper,CO,NONCORE,R. William,Hypertension and Race,1996,"October 25, 1996, Friday, Final Edition",The Washington Post,732 words,Commentary.104Hypertension and Race,Hypertension and Race,6
7,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Study: Discrimination May Cause Hypertension i...,Nexis Uni,News Newspaper,CO,NONCORE,W. P. S. W. David Brown,Study: Discrimination May Cause Hypertension i...,1996,"October 24, 1996, Thursday, Final Edition",The Washington Post,1160 words,CO.105Study_ Discrimination May Cause Hyperten...,Study: Discrimination May Cause Hypertension i...,7
8,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,"STUDY LINKS RACIAL BIAS, BLOOD PRESSURE",Nexis Uni,News Newspaper,CO,NONCORE,T. W. P. David Brown,"STUDY LINKS RACIAL BIAS, BLOOD PRESSURE",1996,"October 24, 1996, Thursday",Pittsburgh Post-Gazette (Pennsylvania),872 words,"CO.105STUDY LINKS RACIAL BIAS, BLOOD PRESSURE","STUDY LINKS RACIAL BIAS, BLOOD PRESSURE",8
9,"{'Content-Type': 'application/pdf', 'Creation-...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,200,Positive Correlation Between Body Weight And F...,Nexis Uni,News Newswire,CO,NONCORE,,Positive Correlation Between Body Weight And F...,1996,"August 1, 1996 Thursday",Reuters Health Medical News,247 words,CO.105Positive Correlation Between Body Weight...,Positive Correlation Between Body Weight And F...,9


In [39]:
## Save dataframe to csv
output_csv_path = r"output/loading/final_df.csv"

## Create the output folder if it does not exist yet, so the open() below cannot fail on it
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

## newline='' disables Python's newline translation on the handle. Without it every '\n'
## written by pandas is rewritten to the platform line ending (e.g. '\r\n' on Windows),
## which silently defeats line_terminator='\n'.
## The with-block closes the file by itself, so no explicit close() call is needed.
with open(output_csv_path, 'w', encoding='utf-8', newline='') as file:
    final_df.to_csv(file, index=True, line_terminator='\n')

In [40]:
## Collect the values of the unique_id column into a plain Python list
identifiers = final_df.loc[:, 'unique_id'].tolist()
print(identifiers)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [41]:
## Save the list using pkl

file_name = "output/loading/identifiers.pkl"

## The context manager closes the file even if pickle.dump raises, so the handle
## can no longer leak the way a manual open()/close() pair could.
with open(file_name, "wb") as open_file:
    pickle.dump(identifiers, open_file, protocol=4)

