In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#custom functions 
from projectfunctions import * 

In [3]:
import pandas as pd  
import spacy 
import scattertext as st 
from IPython.core.display import display, HTML

Read in the NGSS and Arizona state standards & apply preprocessing methods including tokenizing, removing stop words to the text. 

In [4]:
#ngss corpus
ngss = open_and_flatten('TXTfiles/ngss')   

#create a nested list
s = ['ngss'] 
s.append(ngss) 

#create a ngss dataframe
ngss_corpi = pd.DataFrame(s, columns=["corpus"]) 
ngss_corpi['state'] = "ngss" 
ngss_corpi = ngss_corpi.drop([0], axis=0)

In [5]:
#state corpus 
texas = open_and_flatten('TXTfiles/texas')  

#create a nested list
a = ['texas'] 
a.append(texas) 

#create a alaska dataframe
state_corpi = pd.DataFrame(a, columns=["corpus"]) 
state_corpi['state'] = "texas" 
state_corpi = state_corpi.drop([0], axis=0)

In [6]:
#join states & ngss
standards_corpi = pd.concat([state_corpi, ngss_corpi], axis=0)  
  

#clearn corpus column
standards_corpi['corpus'] = standards_corpi['corpus'].astype(str)  
standards_corpi['corpus'] = standards_corpi['corpus'].apply(lambda x: 
                                                            x.strip("[")
                                                            .strip("]"))

standards_corpi.head()

Unnamed: 0,corpus,state
1,"'elementary', 'essential', 'knowledge', 'skill...",texas
1,"'topic', 'arrangements', 'next', 'generation',...",ngss


I chose the Arizona state standards because they are heavily aligned with the NGSS standards so should produce a high number of similar points.

In [7]:
#turn text into a Scattertext Corpus 
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(standards_corpi, 
                            category_col='state', 
                            text_col='corpus', 
                            nlp=nlp).build()

In [8]:
print("Corpus Unique Words:")
word = list(corpus.get_scaled_f_scores_vs_background().index[:10]) 
for w in word: 
    print("-", w)

Corpus Unique Words:
- dcis
- crosscutting
- abstractly
- testable
- geosphere
- nonliving
- progresses
- hydrosphere
- delimiting
- quantitatively


In [12]:
#words most associated with aligned states 
print("Texas:")
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Texas'] = corpus.get_scaled_f_scores('texas')
al = list(term_freq_df.sort_values(by='Texas', ascending=False) 
          .index[:20])
for a in al: 
    print("-", a)

Texas:
- international
- concepts student
- student knows
- adopted
- effective
- knows
- student expected
- update
- provisions
- investigation reasoning
- international baccalaureate
- baccalaureate
- scientific investigation
- adopted effective
- provisions adopted
- source provisions
- recommended
- laboratory field
- critical thinking
- field investigations


In [13]:
#words most associated with aligned states 
print("NGSS:")
term_freq_df = corpus.get_term_freq_df()
term_freq_df['NGSS'] = corpus.get_scaled_f_scores('ngss')
al = list(term_freq_df.sort_values(by='NGSS', ascending=False) 
          .index[:20])
for a in al: 
    print("-", a)

NGSS:
- engineering
- builds
- expectations
- performance expectations
- framework education
- core ideas
- clarification statement
- experiences progresses
- disciplinary core
- assessment
- clarification
- boundary
- could include
- progresses
- performance
- statement
- boundary assessment
- assessment boundary
- integrate
- constructing


In [14]:
html = st.produce_scattertext_explorer(corpus, 
                                       category='ngss',
                                       category_name='NGSS',
                                       not_category_name='Texas',
                                       width_in_pixels=1000)
open("TX_NGSS_Visualization.html", 'wb').write(html.encode('utf-8'))

1611141