## Cleaning Input data 

In [1]:
import pandas as pd
import json 
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim


Loading in the json file with the subjects and corresponding classes from the [OSTI Subject Booklet](https://www.osti.gov/stip.old/system/files/Subject_Categories_Booklet.pdf)

In [9]:
with open('subjects_and_classes.json') as f:
    classes = json.load(f)

print(classes)

{'Energy Storage, Conversion, and Utilization': ['25 Energy Storage National Defense', '29 Energy Planning, Policy, and Economy', '30 Direct Energy Conversion National Defense', '32 Energy Conservation, Consumption, and Utilization', '33 Advanced Propulsion Systems'], 'Environmental Sciences': ['54 Environmental Sciences'], 'Fossil Fuels': ['1 Coal, Lignite, and Peat', '2 Petroleum', '3 Natural Gas', '4 Oil Shales and Tar Sands'], 'Fission and Nuclear Technologies': ['7 Isotope and Radiation Sources', '11 Nuclear Fuel Cycle and Fuel Materials', '12 Management Of Radioactive Wastes, and Non-Radioactive Wastes from Nuclear Facilities', '21 Specific Nuclear Reactors and Associated Plants', '22 General Studies of Nuclear Reactors'], 'Renewable Energy Sources': ['8 Hydrogen', '9 Biomass Fuels', '10 Synthetic Fuels', '13 Hydro Energy', '2 Petroleum', '14 Solar Energy', '15 Geothermal Energy', '4 Oil Shales and Tar Sands', '16 Tidal and Wave Power', '17 Wind Energy'], 'Geosciences': ['58 Geos

Retrieving the data files downloaded by hand in a specific file format and append the files together.

In [14]:
def get_all_files(path):
    all_files = []
    for root, directories, files in os.walk(path):
        for filename in files:
            file_path = os.path.join(root, filename)
            all_files.append(file_path)
    return all_files

In [18]:
folder_path = 'data/Data/'
all_files = get_all_files(folder_path)
print(all_files)

['data/Data/.DS_Store', 'data/Data/17 Wind Energy/OSTI.GOV-metadata (15).csv', 'data/Data/17 Wind Energy/OSTI.GOV-metadata (16).csv', 'data/Data/11 Nuclear Fuel Cycle and Fuel Materials/OSTI.GOV-metadata (21).csv', 'data/Data/11 Nuclear Fuel Cycle and Fuel Materials/OSTI.GOV-metadata (20).csv', 'data/Data/9 Biomass Fuels/OSTI.GOV-metadata (1).csv', 'data/Data/12 Management Of Radioactive Wastes, and/OSTI.GOV-metadata (22).csv', 'data/Data/32 Energy Conservation, Consumption, and/OSTI.GOV-metadata (32).csv', 'data/Data/20 Fossil-Fueled Power Plants/OSTI.GOV-metadata (28).csv', 'data/Data/20 Fossil-Fueled Power Plants/OSTI.GOV-metadata (27).csv', 'data/Data/25 Energy Storage/OSTI.GOV-metadata (29).csv', 'data/Data/21 Specific Nuclear Reactors and Associated/OSTI.GOV-metadata (24).csv', 'data/Data/16 Tidal and Wave Power/OSTI.GOV-metadata (14).csv', 'data/Data/30 Direct Energy Conversion/OSTI.GOV-metadata (31).csv', 'data/Data/3 Natural Gas/OSTI.GOV-metadata (19).csv', 'data/Data/54 Envir

Combining the downloaded CSVs into one pandas dataframe and removing any duplicates or extraneous columns

In [31]:
def combine_csv(list_of_csvs): 
    combined_df = pd.DataFrame()
    for csv_file in list_of_csvs:
        if "data/Data/.DS_Store" not in csv_file:
            df = pd.read_csv(csv_file)
            combined_df = pd.concat([combined_df, df], ignore_index=True)
    
            # Remove duplicates
            data = combined_df.drop_duplicates(subset=['OSTI_IDENTIFIER'])

            # Filter for the columns we want
            df = data[['DESCRIPTION', 'SUBJECT']]

            # Drop NAN if neither has description or subject
            df = df.dropna(subset=['DESCRIPTION', 'SUBJECT'])

            return df

In [32]:
df = combine_csv(all_files)

print(len(df))

847


In [30]:
def get_main_subject(df, sub_dict):
    '''
    Given a list of subjects, will pull the first (and main)
    as the overall subject of the paper

    Inputs:
        df: Dataframe with subject column
    
    Returns:
        df: 
    '''

    df.SUBJECT = df.SUBJECT.apply(lambda x: x.title().split('; '))
    # df.SUBJECT = df.SUBJECT.applymap(lambda x: x.title())
    subjects = []
    
    for values in sub_dict.values():
        # append each value to the new list using the extend() method
        subjects.extend(values)
    
    subjects = [s.title() for s in subjects]

    df['MAIN_SUBJECT'] = df['SUBJECT'].apply(lambda x: next((s for s in x if any(sub in s for sub in subjects)), None))

    # for i, row in df.iterrows():
    #     main_subs = []
    #     for subject in subjects:
    #         if subject in row['SUBJECT']:
    #             main_subs.append(subject)
        
    #     df.at[i, 'MAIN_SUBJECT'] = main_subs

    return df

In [None]:
df = get_main_subject(df, classes)

In [None]:
df.head()

Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT
0,{sup 1}H and {sup 13}C high-resolution liquid-...,"[01 Coal, Lignite, And Peat, Hydrogen 1, Carbo...","01 Coal, Lignite, And Peat"
1,"The effects of blending polyethylene (PE), pol...","[01 Coal, Lignite, And Peat, Bituminous Coal, ...","01 Coal, Lignite, And Peat"
2,High-temperature {sup 1}H NMR and rheometry me...,"[01 Coal, Lignite, And Peat, Coking, Additives...","01 Coal, Lignite, And Peat"
3,Enormous progress has been made in coal pyroly...,"[01 Coal, Lignite, And Peat, 66 Physics, Coal ...","01 Coal, Lignite, And Peat"
4,Although high temperature in-situ {sup 1}H NMR...,"[01 Coal, Lignite, And Peat, 66 Physics, Coal,...","01 Coal, Lignite, And Peat"


Introducing the Class that we want to identify with our models from the subject and classes json.

In [None]:
for key, value in classes.items():
    for v in value:
        mask = df['MAIN_SUBJECT'] == v.title()
        df.loc[mask, 'CLASS'] = key

In [None]:
df.head()

Unnamed: 0,DESCRIPTION,SUBJECT,MAIN_SUBJECT,CLASS
0,{sup 1}H and {sup 13}C high-resolution liquid-...,"[01 Coal, Lignite, And Peat, Hydrogen 1, Carbo...","01 Coal, Lignite, And Peat",Fossil Fuels
1,"The effects of blending polyethylene (PE), pol...","[01 Coal, Lignite, And Peat, Bituminous Coal, ...","01 Coal, Lignite, And Peat",Fossil Fuels
2,High-temperature {sup 1}H NMR and rheometry me...,"[01 Coal, Lignite, And Peat, Coking, Additives...","01 Coal, Lignite, And Peat",Fossil Fuels
3,Enormous progress has been made in coal pyroly...,"[01 Coal, Lignite, And Peat, 66 Physics, Coal ...","01 Coal, Lignite, And Peat",Fossil Fuels
4,Although high temperature in-situ {sup 1}H NMR...,"[01 Coal, Lignite, And Peat, 66 Physics, Coal,...","01 Coal, Lignite, And Peat",Fossil Fuels


One comment we've received is on the fear of imbalanced classes, so the goal of this next block of code is to equitably randomly retrieve data from each class we're looking to match

In [None]:
# retrieve the minimum class and values
min_obs = df['CLASS'].value_counts().min()

# Sample
sample_df = df.groupby('CLASS').apply(pd.DataFrame.sample, n=min_obs, random_state=30255).reset_index(drop=True)

Using the Transformers package and Bert tokenizer from pre trained

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_text(text):
    return tokenizer.encode_plus(text,
                                  add_special_tokens=True,
                                  max_length=256,
                                  padding='max_length',
                                  return_attention_mask=True,
                                  return_tensors='pt')

sample_df['BERT_TOKENIZED'] = sample_df['DESCRIPTION'].apply(tokenize_text)
for i, t in enumerate(sample_df["BERT_TOKENIZED"]):
    print(type(t))
    if i == 1:
        break

Also will include Spacy preprocessed text. To

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space and not token.like_num]
    return " ".join(tokens)

In [None]:
sample_df['SPACY_PREPROCESSED'] = sample_df['DESCRIPTION'].apply(preprocess_text)

In [None]:
if not os.path.exists("./data"):
    os.mkdir("./data")

sample_df.to_csv("data/preprocessed_data.csv", index=False)