# Revised Odden's LDA 
(adapted for the NLP in physics education project)

# 05 - Data Cleaning for Science Education Articles

In [6]:
# Echo the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #default 'last_expr'
# Wider cells: stretch the notebook container to 95% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [7]:
# Imports
import pickle
import pandas as pd
import numpy as np

#Import regular expressions, for data processing
import re

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
# Fetch the NLTK data packages needed by the imports above. quiet=True suppresses
# the download log; each call's return value is still echoed as a cell output.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the installed version.
nltk.download('wordnet',quiet=True)   #WordNet data behind the `wordnet` corpus import
nltk.download('punkt',quiet=True)   #required by word_tokenize method
nltk.download('averaged_perceptron_tagger',quiet=True) #required by pos_tag method

True

True

True

In [8]:
from helpers import find_in_list

In [9]:
# reload any module
'''
import importlib
import helpers
importlib.reload(helpers)
'''

'\nimport importlib\nimport helpers\nimportlib.reload(helpers)\n'

## Reading the datafile

In [10]:
def ML_process(text):
    """Normalise raw article text into lowercase, space-separated words.

    The cleaning steps are applied in this order:
      1. drop '(cid:NNN)'-style extraction artefacts, together with the single
         non-word character on each side of them;
      2. drop all-caps section headers and roman numerals;
      3. turn tabs, newlines, other whitespace controls, digits and the
         U+F0B7 bullet into spaces;
      4. re-join words that were hyphenated across a line break
         ("exam- ple" -> "example");
      5. collapse every run of non-alphanumeric characters into one space;
      6. lowercase everything;
      7. repair known extraction errors ("tlie" -> "the", "per cent" -> "percent").

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    str
        The cleaned text. It contains only the letters a-z and single spaces;
        a leading and/or trailing space may remain.
    """
    #remove 'cid' (raw string so that \W and \d reach the regex engine untouched)
    filt_text = re.sub(r'\W(cid:\d{0,3})\W', '', text) #Symbols such as @
    #remove some words in all-caps
    #Has no effect when the input was already lower-cased upstream (RawTextProcesser_*)
    #'FUTURE REASEARCH' is kept for backward compatibility; the correct spelling is matched too
    filt_text = re.sub(r'(?<=\W)(INTRODUCTION|CONCLUSION[S]?|BACKGROUND|ABSTRACT|ANALYSIS|EXPERIMENTAL|METHOD[S]?|METHODOLOGY|MOTIVATION[S]?|PRELIMINARY|RESULTS|APPLICATIONS|CONCLUDING|IMPLEMENTATION|EVALUATION|REMARKS|DISCUSSION[S]?|ACKNOWLEDGEMENTS|FUTURE PLANS|FUTURE WORK|FUTURE RESEARCH|FUTURE REASEARCH|SUMMARY|FIGURE[S]?|FIG|TABLE|I\.|II|III|IV|VI{0,3}|IX|X|XI{0,3})(?=\W)',
                       '', filt_text)
    #remove newlines, tabs, etc. also remove digits (\d) and bullet points (\uf0b7)
    filt_text = re.sub(r'[\t\n\r\f\v\d\uf0b7]', ' ', filt_text)
    #re-join words split by a hyphen at a line break.
    #This must run BEFORE the special-character stripping below: once hyphens
    #have been turned into spaces there is nothing left to match.
    filt_text = re.sub(r'(?<=[A-Za-z])- +(?=[A-Za-z])', '', filt_text)
    #removes all special characters that aren't numbers or letters
    filt_text = re.sub(r'[^A-Za-z0-9]+', ' ', filt_text)
    #to lower case
    filt_text = filt_text.lower()

    #Word boundaries (instead of literal surrounding spaces) also catch matches
    #at the very start/end of the text and two matches in a row.
    #tlie -> the
    filt_text = re.sub(r'\btlie\b', 'the', filt_text)
    #per cent -> percent
    filt_text = re.sub(r'\bper cent\b', 'percent', filt_text)
    return filt_text

# 06 - Tokenize_MakeBigrams

## Removing stopwords and stemming

Now, we can remove the stopwords and lemmatize the remaining words, leaving us with a list of documents, each of which is essentially a tokenized list of words.

In [11]:
def sent_to_words(list_sentences):
    """Tokenize every entry of `list_sentences` with gensim's simple_preprocess.

    Each entry is converted with str() first. deacc=True is passed so that
    accent marks are removed from the tokens.

    Returns a list with one token list per input entry, in input order.
    """
    tokenized = []
    for sentence in list_sentences:
        words = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        tokenized.append(words)
    return tokenized

def remove_stopwords(tokens):
    """Drop gensim STOPWORDS from every document in `tokens`.

    Each document is turned into a string with str() and tokenized again with
    gensim's simple_preprocess; the words found in STOPWORDS are then skipped.

    Returns a list with one filtered word list per document, in input order.
    """
    filtered_docs = []
    for doc in tokens:
        kept_words = []
        for word in gensim.utils.simple_preprocess(str(doc)):
            if word in STOPWORDS:
                continue
            kept_words.append(word)
        filtered_docs.append(kept_words)
    return filtered_docs

def get_wordnet_pos(word): #Provide a POS tag
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with pos_tag, and the upper-cased first
    letter of the resulting tag selects the constant:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.
    """
    tagged = pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag: NOUN is the default
    return wordnet.NOUN

def lemmatize_token(token):
    """Lemmatize one token with NLTK's WordNetLemmatizer.

    The part of speech handed to the lemmatizer comes from get_wordnet_pos(token).
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    part_of_speech = get_wordnet_pos(token)
    return lemmatizer.lemmatize(token, part_of_speech)

def lemmatize(token_list):
    '''Lemmatize every token of a list, keeping the input order.

    Input example: ["he", "matches", "the", "profile"]
    '''
    lemmas = []
    for token in token_list:
        lemmas.append(lemmatize_token(token))
    return lemmas

# 07 - Choose_noabove_nobelow

In [12]:
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Import usual data analysis tools
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
#np.random.seed(2018)

import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning);

In [15]:
from helpers import plot_freq_dist,get_top_n_words,plot_words_freq

## II. Loading in and filtering the data

The datafile we use for this analysis is a pickle file containing processed versions of PERC papers from 2001 to 2018. We have scraped the available PDFs, then done the following data cleaning on the scraped text:
1. Removed references, acknowledgments, keywords, and PACS 
2. Removed all numbers, symbols, punctuation, special characters, and section headers
3. Removed "stop words" (words like "and", "or", "is", etc. which do not carry specific meaning)
4. Lowercased all words
5. Lemmatized all words, reducing them to their more basic form (for example, reducing "tests", "testing", and "tested" to "test")
6. Created bi-grams: combining commonly-associated words into one (for example, "problem" and "solving" into "problem_solving")
7. Turned the resulting text into a list of individual words, or "tokens"

This processed data was then stored in a datafile, which we now load in:

data_words_bigrams = pd.read_pickle(path_pkl+'scied_words_bigrams_V5.pkl')

### A. Plotting top words in all documents

Now, we will do some investigation and filtering based on word frequency. Our goal is to filter out the words that occur in a large number of documents, which are less likely to carry any distinct meaning for any specific theories, methods, or research traditions in PER. For example, most people in the PER community talk about "physics", "education", and "students" in one form or another. Those words do not carry much meaning, and so should be removed from our dataset in order to make sure that the more interesting, distinct, and meaningful words are prioritized in the analysis.

We start by defining and implementing some functions to plot the word frequency distribution in the dataset.