In [209]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [210]:
#custom functions 
from projectfunctions import * 

In [211]:
import pandas as pd  
import spacy 
import scattertext as st 
from IPython.core.display import display, HTML

Read in the NGSS and Arizona state standards & apply preprocessing methods including tokenizing, removing stop words to the text. 

In [212]:
#ngss corpus
ngss = open_and_flatten('TXTfiles/ngss')   

#create a nested list
s = ['ngss'] 
s.append(ngss) 

#create a ngss dataframe
ngss_corpi = pd.DataFrame(s, columns=["corpus"]) 
ngss_corpi['state'] = "ngss" 
ngss_corpi = ngss_corpi.drop([0], axis=0)

In [213]:
#alaska corpus 
arizona = open_and_flatten('TXTfiles/alaska')  

#create a nested list
a = ['alaska'] 
a.append(arizona) 

#create a alaska dataframe
state_corpi = pd.DataFrame(a, columns=["corpus"]) 
state_corpi['state'] = "alaska" 
state_corpi = state_corpi.drop([0], axis=0)

In [214]:
#join states & ngss
standards_corpi = pd.concat([state_corpi, ngss_corpi], axis=0)  
  

#clearn corpus column
standards_corpi['corpus'] = standards_corpi['corpus'].astype(str)  
standards_corpi['corpus'] = standards_corpi['corpus'].apply(lambda x: 
                                                            x.strip("[")
                                                            .strip("]"))

standards_corpi.head()

Unnamed: 0,corpus,state
1,"'dept', 'education', 'early', 'development', '...",alaska
1,"'topic', 'arrangements', 'next', 'generation',...",ngss


I chose the Arizona state standards because they are heavily aligned with the NGSS standards so should produce a high number of similar points.

In [215]:
#turn text into a Scattertext Corpus 
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(standards_corpi, 
                            category_col='state', 
                            text_col='corpus', 
                            nlp=nlp).build()

In [216]:
print("Corpus Unique Words:")
word = list(corpus.get_scaled_f_scores_vs_background().index[:10]) 
for w in word: 
    print("-", w)

Corpus Unique Words:
- crosscutting
- dcis
- abstractly
- ngss
- geosphere
- clarification
- progresses
- hydrosphere
- embryological
- delimiting


In [217]:
#words most associated with aligned states 
print("Alaska:")
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Alaska'] = corpus.get_scaled_f_scores('alaska')
al = list(term_freq_df.sort_values(by='Alaska', ascending=False) 
          .index[:20])
for a in al: 
    print("-", a)

Alaska:
- concepts constructing
- ideas crosscutting
- understanding develop
- practices disciplinary
- education engineering
- elements framework
- developed using
- following elements
- using following
- expectations developed
- engineering practices
- developed
- following
- crosscutting concepts
- elements
- crosscutting
- demonstrate understanding
- understanding use
- understanding construct
- ice


In [218]:
#words most associated with aligned states 
print("NGSS:")
term_freq_df = corpus.get_term_freq_df()
term_freq_df['NGSS'] = corpus.get_scaled_f_scores('ngss')
al = list(term_freq_df.sort_values(by='NGSS', ascending=False) 
          .index[:20])
for a in al: 
    print("-", a)

NGSS:
- progresses
- experiences progresses
- achieve reserved
- reserved
- national academy
- verbatim framework
- academy sciences
- academy
- permission
- reprinted
- section entitled
- entitled disciplinary
- ideas reproduced
- verbatim
- education practices
- entitled
- ' achieve
- practices cross
- ideas integrated
- integrated reprinted


In [219]:
html = st.produce_scattertext_explorer(corpus, 
                                       category='ngss',
                                       category_name='NGSS',
                                       not_category_name='Alaska',
                                       width_in_pixels=1000)
open("AZ_NGSS_Visualization.html", 'wb').write(html.encode('utf-8'))

2012185