# Data Collection 
For this project I used BeautifulSoup, PyPDF2, and pdftotext to scrape data from the web and to convert PDF data into txt files. My primary focus in collection was the science standards: the NGS standards themselves, state standards aligned to NGS, and unaligned state standards. In addition, I collected the statewide grade 8 science examination (2002-2019) and a broad range of questions used in my own 6th grade classroom to perform classification and question alignment.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#custom functions 
from projectfunctions import *

In [2]:
#webscraping
from bs4 import BeautifulSoup 
import requests   
from time import sleep 
import textract 

#data analysis
import pandas as pd  
import numpy as np 
from random import randint  

#saving files
import pickle 
import PyPDF2 
import sys

# Standards

### Next Generation Science Standards K-12  
The [NGS](https://www.nextgenscience.org/) Standards are redesigned national standards for K-12. In addition, a [book](https://www.nap.edu/read/13165/chapter/1) detailing the reasoning, methodology, and practices behind them was released.

#### Standards & Basic Description Via Website

In [3]:
# Scrape the name and description of each science standard from the NGSS
# "search standards" result pages (page indices 0 through 35).
pages = np.arange(0, 36, 1)
standard_list = []

for i in pages:
    page = requests.get(
        f"https://www.nextgenscience.org/search-standards?keys=&page={i}",
        timeout=30,
    )
    # fail loudly on an HTTP error instead of silently parsing an error page,
    # which would shift the row positions relied on further down
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    standards = soup.find_all('div', class_="col-sm-9")
    # keep only the first two chunks of each result block
    standard_list.extend(
        standard.text.split('\n\n\n\n\n')[:2] for standard in standards
    )
    # pause a random 2-10 seconds between page requests
    sleep(randint(2, 10))

# standards by grade & core concept (dci)
ngs = pd.DataFrame(standard_list, columns=['dci', 'standard'])
ngs['dci'] = ngs['dci'].map(lambda x: x.replace('\n\n\n', '').strip('\t'))

# split the first column at its first space so the tag and the name end up in
# two separate columns; `n` is passed by keyword because pandas 2.0 no longer
# accepts it positionally
ngs[['tag', 'dci']] = ngs["dci"].str.split(" ", n=1, expand=True)
ngs["dci"] = ngs["dci"].str.strip("-")
ngs['tag'] = ngs['tag'].str.strip("\n\r\n")
# NOTE(review): str.strip() treats its argument as a set of characters, not a
# literal prefix/suffix, so this also removes any leading/trailing run of the
# letters G, r, a, d, e and ':' from the name itself — confirm that is intended.
ngs['dci'] = ngs['dci'].str.strip('Grade:\xa0    \n\n')

# remove the DCI arrangements (double listed) by keeping row labels 0..206;
# .copy() makes the result an independent frame rather than a slice of `ngs`
ngs_standards = ngs.loc[:206].copy()

# write the pickle through a context manager so the file handle is closed
with open("ngsstandards.p", "wb") as pickle_file:
    pickle.dump(ngs_standards, pickle_file)

#### Load the Expanded NGS Standards PDF into a TXT File

In [5]:
# convert the expanded NGS standards PDF into a txt file
pdf_to_text(
    filepath='/Users/kristen/Downloads/NGS.pdf',
    filename='ngs',
)

### Aligned NGS State Standards  
As of January 2021, the [following states](https://victoryprd.com/blog/update-on-next-generation-science-standards-ngss/) have aligned their local standards to the NGS Standards.

In [6]:
# Alabama: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/Alabama.pdf',
    filename='alabama',
)

In [7]:
# Alaska: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/alaska.pdf',
    filename='alaska',
)

In [8]:
# Arizona: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/Arizona.pdf',
    filename='arizona',
)

In [9]:
# Colorado: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/colorado.pdf',
    filename='colorado',
)

In [10]:
# Florida: convert the state standards PDF to text
# (the 'flordia' spelling in the path and filename is kept exactly as-is)
pdf_to_text(
    filepath='/Users/kristen/Downloads/flordia.pdf',
    filename='flordia',
)

In [15]:
# Georgia: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/allgeorgia.pdf',
    filename='georgia',
)

In [16]:
# Idaho: convert the state standards PDF to text
# NOTE(review): this cell previously read colorado.pdf (copy-paste slip), so
# the 'idaho' output held Colorado's text. The path below assumes the file
# follows the Downloads/<state>.pdf naming used by the other cells — confirm
# the file exists under this name.
pdf_to_text(
    filepath='/Users/kristen/Downloads/idaho.pdf',
    filename='idaho',
)

In [17]:
# Indiana: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/indiana.pdf',
    filename='indiana',
)

In [18]:
# Louisiana: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/k-12-louisiana-student-standards-for-science 2/louisiana.pdf',
    filename='louisiana',
)

In [19]:
# Massachusetts: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/mass.pdf',
    filename='mass',
)

In [20]:
# Minnesota: convert the state standards PDF to text
# NOTE(review): this cell previously read mass.pdf (copy-paste slip), so the
# 'minnesota' output held the Massachusetts text. The path below assumes the
# file follows the Downloads/<state>.pdf naming used by the other cells —
# confirm the file exists under this name.
pdf_to_text(
    filepath='/Users/kristen/Downloads/minnesota.pdf',
    filename='minnesota',
)

In [21]:
# Mississippi: convert the state standards PDF to text
# NOTE(review): this cell previously read mass.pdf (copy-paste slip), so the
# 'mississippi' output held the Massachusetts text. The path below assumes the
# file follows the Downloads/<state>.pdf naming used by the other cells —
# confirm the file exists under this name.
pdf_to_text(
    filepath='/Users/kristen/Downloads/mississippi.pdf',
    filename='mississippi',
)

In [22]:
# Missouri: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/mo1 2021-01-13 at 5.38.23 PM.pdf',
    filename='missouri',
)

In [23]:
# Montana: convert the state standards PDF to text
# NOTE(review): this cell previously read mass.pdf (copy-paste slip), so the
# 'montana' output held the Massachusetts text. The path below assumes the
# file follows the Downloads/<state>.pdf naming used by the other cells —
# confirm the file exists under this name.
pdf_to_text(
    filepath='/Users/kristen/Downloads/montana.pdf',
    filename='montana',
)

In [24]:
# Nebraska: convert the state standards PDF to text
# NOTE(review): the filename argument is spelled 'nebraksa', while the state
# dictionary in this notebook uses 'nebraska'. It is left unchanged here
# because the name is handed to pdf_to_text — check what relies on it before
# correcting the spelling.
pdf_to_text(
    filepath='/Users/kristen/Downloads/nebraska.pdf',
    filename='nebraksa',
)

In [25]:
# North Dakota: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/northdakota.pdf',
    filename='northdakota',
)

In [26]:
# Oklahoma: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/oklahoma.pdf',
    filename='oklahoma',
)

In [27]:
# South Carolina: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/southcarolina.pdf',
    filename='southcarolina',
)

In [28]:
# South Dakota: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/southdakota.pdf',
    filename='southdakota',
)

In [29]:
# Tennessee: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/tennessee.pdf',
    filename='tennessee',
)

In [30]:
# Utah: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/utah.pdf',
    filename='utah',
)

In [33]:
# West Virginia: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Desktop/westvirginia.pdf',
    filename='westvirginia',
)

In [34]:
# Wisconsin: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/wisconsin.pdf',
    filename='wisconsin',
)

In [35]:
# Wyoming: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/wyoming.pdf',
    filename='wyoming',
)

### Non Aligned State Standards  
As of January 2021, the [following states](https://victoryprd.com/blog/update-on-next-generation-science-standards-ngss/) have not aligned their local standards to the NGS Standards.

In [36]:
# Maine: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/maine.pdf',
    filename='maine',
)

In [37]:
# Michigan: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/michigan.pdf',
    filename='michigan',
)

In [38]:
# North Carolina: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/nc1 2021-01-13 at 5.48.03 PM.pdf',
    filename='northcarolina',
)

In [39]:
# Ohio: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/ohio.pdf',
    filename='ohio',
)

In [40]:
# Pennsylvania: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/pennsylvania.pdf',
    filename='pennsylvania',
)

In [41]:
# Texas: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/texas.pdf',
    filename='texas',
)

In [42]:
# Virginia: convert the state standards PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/virginia.pdf',
    filename='virginia',
)

# Question Sets 

### NY State 8th Grade State Test  
An NGS-aligned state's standardized test.

In [43]:
# New York grade 8 state tests: convert the exam PDF to text
pdf_to_text(
    filepath='/Users/kristen/Downloads/2001 2021-01-13 at 5.51.01 PM.pdf',
    filename='newyorktests',
)

### Classroom Question Repository 
This is a collection of 'real world' questions used in classwork, labs, tests, and quizzes in a 6th grade science classroom.

In [57]:
# read the classroom question repository from its CSV export
classroom_questions_csv = pd.read_csv(
    '/Users/kristen/Desktop/classroom_questions.csv'
)
# re-wrap the parsed CSV in a DataFrame under its working name
classroom_questions = pd.DataFrame(classroom_questions_csv)

# State Dictionaries 

In [5]:
# Most recent science-standards update year and NGS status for every state.
# Each value holds 'update_year' (int) and 'standards', which is one of
# 'adopted', 'aligned' or 'independent'.
# NOTE(review): the keys 'alaskas', 'conneticut' and 'flordia' are misspelled.
# They are left unchanged because the keys become labels of the pickled
# DataFrame built from this dictionary and other code may look them up by
# these exact names.
state_dict = {
    'alabama': {'update_year': 2015, 'standards': 'aligned'},
    'alaskas': {'update_year': 2017, 'standards': 'aligned'},
    'arizona': {'update_year': 2018, 'standards': 'aligned'},
    'arkansas': {'update_year': 2016, 'standards': 'adopted'},
    'california': {'update_year': 2013, 'standards': 'adopted'},
    'colorado': {'update_year': 2020, 'standards': 'aligned'},
    'conneticut': {'update_year': 2015, 'standards': 'adopted'},
    'delaware': {'update_year': 2016, 'standards': 'adopted'},
    'flordia': {'update_year': 2008, 'standards': 'independent'},
    'georgia': {'update_year': 2016, 'standards': 'aligned'},
    'hawaii': {'update_year': 2016, 'standards': 'adopted'},
    'idaho': {'update_year': 2018, 'standards': 'aligned'},
    'illinois': {'update_year': 2014, 'standards': 'adopted'},
    'indiana': {'update_year': 2016, 'standards': 'aligned'},
    'iowa': {'update_year': 2015, 'standards': 'adopted'},
    # (a second, identical 'arkansas' entry used to sit here; a dict literal
    # keeps only one value per key, so the duplicate was removed)
    'kansas': {'update_year': 2013, 'standards': 'adopted'},
    'kentucky': {'update_year': 2015, 'standards': 'adopted'},
    # was 20202 (typo); 2020 assumed — NOTE(review): confirm the actual year
    'louisiana': {'update_year': 2020, 'standards': 'aligned'},
    'maine': {'update_year': 2019, 'standards': 'independent'},
    'maryland': {'update_year': 2013, 'standards': 'adopted'},
    'massachusetts': {'update_year': 2016, 'standards': 'aligned'},
    'michigan': {'update_year': 2015, 'standards': 'independent'},
    'minnesota': {'update_year': 2019, 'standards': 'aligned'},
    'mississippi': {'update_year': 2018, 'standards': 'aligned'},
    'missouri': {'update_year': 2016, 'standards': 'aligned'},
    'montana': {'update_year': 2016, 'standards': 'aligned'},
    'nebraska': {'update_year': 2017, 'standards': 'aligned'},
    'nevada': {'update_year': 2015, 'standards': 'adopted'},
    'new hampshire': {'update_year': 2015, 'standards': 'adopted'},
    'new jersey': {'update_year': 2020, 'standards': 'adopted'},
    'new mexico': {'update_year': 2003, 'standards': 'adopted'},
    'new york': {'update_year': 2016, 'standards': 'adopted'},
    'north carolina': {'update_year': 2004, 'standards': 'independent'},
    'north dakota': {'update_year': 2019, 'standards': 'aligned'},
    'ohio': {'update_year': 2019, 'standards': 'independent'},
    'oklahoma': {'update_year': 2020, 'standards': 'aligned'},
    'oregon': {'update_year': 2014, 'standards': 'adopted'},
    'pennsylvania': {'update_year': 2002, 'standards': 'independent'},
    'rhode island': {'update_year': 2013, 'standards': 'adopted'},
    'south carolina': {'update_year': 2014, 'standards': 'aligned'},
    'south dakota': {'update_year': 2016, 'standards': 'aligned'},
    'tennessee': {'update_year': 2016, 'standards': 'aligned'},
    'texas': {'update_year': 2017, 'standards': 'independent'},
    'utah': {'update_year': 2019, 'standards': 'aligned'},
    'vermont': {'update_year': 2013, 'standards': 'adopted'},
    'virginia': {'update_year': 2018, 'standards': 'independent'},
    'washington': {'update_year': 2013, 'standards': 'adopted'},
    'west virginia': {'update_year': 2015, 'standards': 'adopted'},
    'wisconsin': {'update_year': 2017, 'standards': 'aligned'},
    'wyoming': {'update_year': 2017, 'standards': 'aligned'},
}

In [6]:
# build a DataFrame from the state dictionary; with the (default) 'columns'
# orientation each state key becomes a column
state_df = pd.DataFrame.from_dict(state_dict, orient='columns')

# save the DataFrame to disk as a pickle
state_df.to_pickle("./state_df.pkl")

In [7]:
# Most recent science-standards update year and NGS status, restricted to the
# states whose status is 'aligned' or 'independent' (no 'adopted' entries).
# NOTE(review): the keys 'alaskas' and 'flordia' are misspelled. They are left
# unchanged because the keys become labels of the pickled DataFrame built
# from this dictionary and other code may look them up by these exact names.
no_adopted_state_dict = {
    'alabama': {'update_year': 2015, 'standards': 'aligned'},
    'alaskas': {'update_year': 2017, 'standards': 'aligned'},
    'arizona': {'update_year': 2018, 'standards': 'aligned'},
    'colorado': {'update_year': 2020, 'standards': 'aligned'},
    'flordia': {'update_year': 2008, 'standards': 'independent'},
    'georgia': {'update_year': 2016, 'standards': 'aligned'},
    'idaho': {'update_year': 2018, 'standards': 'aligned'},
    'indiana': {'update_year': 2016, 'standards': 'aligned'},
    # was 20202 (typo); 2020 assumed — NOTE(review): confirm the actual year
    'louisiana': {'update_year': 2020, 'standards': 'aligned'},
    'maine': {'update_year': 2019, 'standards': 'independent'},
    'massachusetts': {'update_year': 2016, 'standards': 'aligned'},
    'michigan': {'update_year': 2015, 'standards': 'independent'},
    'minnesota': {'update_year': 2019, 'standards': 'aligned'},
    'mississippi': {'update_year': 2018, 'standards': 'aligned'},
    'missouri': {'update_year': 2016, 'standards': 'aligned'},
    'montana': {'update_year': 2016, 'standards': 'aligned'},
    'nebraska': {'update_year': 2017, 'standards': 'aligned'},
    'north carolina': {'update_year': 2004, 'standards': 'independent'},
    'north dakota': {'update_year': 2019, 'standards': 'aligned'},
    'ohio': {'update_year': 2019, 'standards': 'independent'},
    'oklahoma': {'update_year': 2020, 'standards': 'aligned'},
    'pennsylvania': {'update_year': 2002, 'standards': 'independent'},
    'south carolina': {'update_year': 2014, 'standards': 'aligned'},
    'south dakota': {'update_year': 2016, 'standards': 'aligned'},
    'tennessee': {'update_year': 2016, 'standards': 'aligned'},
    'texas': {'update_year': 2017, 'standards': 'independent'},
    'utah': {'update_year': 2019, 'standards': 'aligned'},
    'virginia': {'update_year': 2018, 'standards': 'independent'},
    'wisconsin': {'update_year': 2017, 'standards': 'aligned'},
    'wyoming': {'update_year': 2017, 'standards': 'aligned'},
}

In [8]:
# build a DataFrame from the aligned/independent-only dictionary; with the
# (default) 'columns' orientation each state key becomes a column
state_df_no_adopted = pd.DataFrame.from_dict(
    no_adopted_state_dict, orient='columns'
)

# save the DataFrame to disk as a pickle
state_df_no_adopted.to_pickle("./state_df_no_adopted.pkl")