In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import pickle
from math import floor
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import re
from sklearn.manifold import TSNE
import nltk
from nltk.tokenize import TextTilingTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import random
from keras import models, layers, optimizers
from keras.models import load_model

model = load_model('model.h5')
## Random Forest

def initialize():
    '''Loads model and tokenizer.'''

    # load model
    model = load_model('model.h5')

    # load tokenizer
    file = open('tokenizer.pickle', 'rb') 
    tokenizer = pickle.load(file)
    
    return model, tokenizer
def lemmatize(data):
    '''With raw text data passed in as a single array, will return
    each word with each sentence and its punctuation lemmatized'''
    wnl = WordNetLemmatizer()
    processed = ' '.join(data.splitlines())
    tokens = [word for sent in nltk.sent_tokenize(processed) for word in nltk.word_tokenize(sent)]
    lemmas = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            lemmas.append(wnl.lemmatize(token))
        else:
            lemmas.append(token)
    return lemmas
def preprocessing(para):
    '''Loads stopwords to be used, splits data up into managable chunks for the model, 
    lemmatizes input data, and vectorizes data.'''
    
    # loading stopwords (combination of elizabethan stopwords and nltk's english stopwords)
    file = open('stopwords.pickle', 'rb')
    stop_words = pickle.load(file)

    # generating paragraphs to evaluate based on full texts
    tt = TextTilingTokenizer(stopwords=stop_words)

    # ensure data passed in is a string
    para = str(para)
    
    try:
        # splitting data into smaller bits to pass into model
        print('Splitting input.')
        paragraph = tt.tokenize(para)
        # text processing -lemmatizing to pass into model
        print('Processing input.')
        lemmas = lemmatize(paragraph)
    except ValueError:
        # text processing -lemmatizing to pass into model
        print('Processing input.')
        lemmas = lemmatize(para)

    print('Returning data.')
    # vectoring
    one_hot_results= tokenizer.texts_to_matrix([lemmas], mode='tfidf') 

    return one_hot_results
nltk.download('wordnet')
model, tokenizer = initialize()
fake_para = 'Keras is a simple and powerful Python library for deep learning.\nGiven that deep learning models can take hours, days and even weeks to train, it is important to know how to save and load them from disk.\nIn this post, you will discover how you can save your Keras models to file and load them up again to make predictions.'
science_art = 'Scientists have resurrected a purple-blue hue that had been lost to time.\nCalled folium, this watercolor had been used to paint images on the pages of medieval manuscripts. But long ago, it fell out of use. Now scientists have tracked down folium’s source to a plant. They’ve also mapped out the molecule that produces its blue hue./nSuch chemical information can be key to conserving art. “We want to mimic these ancient colors to know how to … preserve them,” explains Maria Melo. She works at Universidade Nova de Lisboa in Caparica, Portugal. There she studies ancient art and how to preserve or restore it. To unmask folium’s identity, her team had to first find out where it came from./nThe pigment hadn’t been used for centuries. Everyone who knew how to prepare it had died long ago. So the researchers turned to books from the 1400’s and found one that described the plant that was its source. That led them on a scavenger hunt to find living specimens of this plant.'
def model_predict(text):
    '''Passes preprocessed text into model and classifies text, returning approximate lexile level.'''
    
    # Load the class labels
    with open('labels.pickle', 'rb') as file:
        classes = pickle.load(file)
    
    # Preprocess the text
    processed_text = preprocessing(text)  # assuming this returns (1, 10000)
    processed_text = np.expand_dims(processed_text, axis=1)  # Now shape is (1, 1, 10000)

    # Use the model to make predictions
    predictions = model.predict(processed_text)

    predictions = predictions.flatten()

    to_post = 'No lexile match'
    for index, value in enumerate(predictions):
        if value > 0.70:
            print('Lexile Found!')
            found_lexile = classes[index]
            if found_lexile == classes[0]:
                to_post = 'This text is suitable for early elementary aged readers. The lexile range for this text is from 0 to 650L.'
            elif found_lexile == classes[3]:
                to_post = 'This text is suitable for late elementary aged readers. The lexile range for this text is from 650L to 1050L.'
            elif found_lexile == classes[1]:
                to_post = 'This text is suitable for middle school aged readers. The lexile range for this text is from 1050L to 1200L.'
            else:
                to_post = 'This text is suitable for high school aged readers. The lexile range for this text is from 1200L to 1400L.'
            break
        elif value > 0.3:
            found_lexile = classes[index]
            if found_lexile == classes[0]:
                to_post = 'This text is most similar to texts suitable for early elementary aged readers. The lexile match is closest to the range 0 to 650L.'
            elif found_lexile == classes[3]:
                to_post = 'This text is most similar to texts suitable for late elementary aged readers. The lexile match is closest to the range 650L to 1050L.'
            elif found_lexile == classes[1]:
                to_post = 'This text is most similar to texts suitable for middle school aged readers. The lexile match is closest to the range 1050L to 1200L.'
            else:
                to_post = 'This text is most similar to texts suitable for high school aged readers. The lexile match is closest to the range 1200L to 1400L.'
            break
        elif index > 3:
            to_post = 'No lexile match'
            break
        else:
            print('Reevaluating.')
    
    return to_post
model_predict(science_art)
more_science_art='They enlisted the help of a botanist, a scientist who studies plants. The team landed on Chrozophora tinctoria (Croh-ZOFF-or-uh Tink-TOR-ee-uh). They found this tiny herb with silvery-green leaves in a village in south Portugal. It was growing along roadsides and in fields after harvest. The team gathered its pebble-sized fruit with care.\nBack in the lab, the scientists extracted the pigment with the help of a medieval text on colors. “It’s very specific,” notes Paula Nabais. She’s a conservation scientist who was part of the research team. “So we were able to use that recipe [and] reproduce it.” Nabais also works at Universidade Nova de Lisboa.\n“That’s pretty cool to have done that work of looking in the historical recipes and traveling back in time,” says Francesca Casadio. She’s a chemist and museum scientist at the Art Institute of Chicago in Illinois. Casadio, who was not part of this study, says the new work is a good example of what’s called experimental archaeology. It recreates an ancient process. By making the dye, the scientists could study its chemistry without experimenting on priceless works of art, she points out.\nThe researchers used many techniques to analyze the dye and identify its chemical structure. They reported how they did it April 17 in Science Advances. They also simulated how light interacts with the candidate molecule. That helped the scientists check whether the structure would give them the blue they desired.\nKnowing a paint’s chemistry helps conservation scientists know how to preserve art that used it. For instance, these data might be used to slow a paint’s degradation. Or if the piece needs to be restored, museum scientists can find compatible pigments. “This is absolutely vital to conservation,” says Mark Clarke. He’s a conservation scientist at Universidade Nova de Lisboa. He was not part of this team but has studied folium before.\nFolium had presented scientists with a difficult chemistry puzzle. “People have been tinkering with [this dye] since the ‘30s and they’ve finally cracked it,” he says.\nThis team succeeded because it brought together experts from fields as diverse as chemistry, botany and medieval literature. In the end, Clarke says, you’ve got “new science from old books.” And, he adds, these very modern things are being used to answer “very old problems.”'
model_predict(science_art + more_science_art)
bubbles = 'Bubbles are everywhere. You just need to know where to look. There’s the obvious place — the soap bubbles in your bath. There are also bubbles in your body. They’re responsible for your cracking knuckles. The gems in a ring might have bubbles, called inclusions. Going farther out, humpback whales use bubbles to hunt. And scientists figured out a way to heal wounds with bubbles.\nBut the best bubbles, at least on a sunny summer day, are probably the bubbles you blow in your own backyard. Scientists have found these bubbles to be alluring, too. They’ve figured out the best way to blow perfect bubbles, and the secret recipe for making huge ones. They’ve also listened in on bubble bursts to figure out the physics that underlie the gentle “pfttt” that accompanies a bubble’s demise.'
model_predict(bubbles)
science_method = "The basic scientific method includes the steps scientists use and follow when trying to solve a problem or prove or disprove a theory. The methods are used by scientists all over the world. This is done so scientists can work together to solve some of the same problems.\nThere are usually five steps which are a part of the scientific method. The steps can occur in any order, but the first step is usually observation. An observation is the use of one or more of the five senses, which include seeing, hearing, feeling, smelling, and tasting. The five senses are used to learn about or identify an event or object the scientist wants to study. For example, while observing a spider a scientist may observe the pattern or size of the spider's web.\nThe second step of the scientific method is the question being researched, the hypothesis. It is the question that is turned into a statement about an event or object the scientist would like to research. A good hypothesis includes three things: The explanation for the observations, it is able to be tested by other scientists, and it will usually predict new outcomes or conclusions. The scientist observing the spider building the web may have a question about the strength of the web. An example of the hypothesis might be: The larger the spider, the stronger the web. This hypothesis includes the explanation for the observation, it can be tested, and new conclusions may be reached.\nThe third step of the scientific method is the experiment. An experiment is a test which will either challenge or support the hypothesis. The hypothesis will then be true or false. Using the spider hypothesis, a scientist may experiment by measuring spider webs in relation to a spider's size. Often, even when a hypothesis is disproved much can still be learned during the experiment. For example, while measuring the strength of spider webs the scientist may discover something new about them.\nThe final step in the scientific method is the conclusion. The conclusion will either clearly support the hypothesis or it will not. If the results support the hypothesis a conclusion can be written. If it does not support the hypothesis, the scientist may choose to change the hypothesis or write a new one based on what was learned during the experiment. In the example, if the scientist proves that larger spiders build stronger webs, then that is the conclusion. If it was not proven, the scientist may change the hypothesis to: The size of a spider does has no bearing on the strength of its web.\nThe scientific method is used for simple experiments students may do in the classroom or very complex or difficult experiments being done all over the world. The spider experiment may be done by any scientist in the world.\nIn summary, the scientific method includes the steps scientists use to solve a problem or to prove or disprove a theory. There are four basic steps involved with the scientific method. The usual steps include observation, hypothesis, experiment, and conclusion. The steps may not always be completed in the same order. Following the four steps, the results of the experiment will either support the hypothesis or will not support the hypothesis. Scientists are always free to change or write a new hypothesis and start the four steps all over again. The scientific method is used for simple experiments or for more difficult experiments."
model_predict(science_method)
evolution = "In 1859, Charles Darwin published convincing evidence that species evolve. He further explained how this process occurs. From that evidence and explanation, we have what scientists and others call today, the Theory of Evolution.\nLike all scientific theories, the theory of evolution has developed through decades of scientific observations and experimentation. Today almost all scientists accept that evolution is the basis for the diversity of life on earth.\nAfter years of research and study, Darwin suggested that by surviving long enough to reproduce, populations have the opportunity to pass on favorable characteristics to offspring. Over time, these characteristics will increase in a population and the nature of that population will gradually change. Darwin called this process by which populations change in response to their environment natural selection.\nDarwin suggested that organisms differ from place to place because their habitats present different challenges to survival and reproduction. As a result, each species has evolved in response to their specific environment. This changing process in response to a particular environment is called adaption. Darwin concluded that the species in a particular place evolved from a species that previously lived there or that migrated from a nearby area.\nDarwin's evidence was based on the idea that in any population, individuals that are best suited to survive and do well in their environment will produce the most offspring. By doing so, the traits of that offspring will be passed on and become more common as each new generation arrives. Traits are the genetic characteristics that may be physical, such as hair color; or behavioral, such as birds building nests.'nScientist now know that genes are responsible for inherited traits. Therefore, certain forms of a trait become more common because more of the species carry the gene that is passed on. In other words, natural selection causes the frequency of genes in a population to increase or decrease over time.\nFossils offer the most direct evidence that evolution takes place. A fossil is the preserved or mineralized remains or imprint of an organism that lived past life-forms. Change over time, or evolution can be seen in the fossils. For example, fossil links have been found between fish and amphibians, between retiles and birds, and between reptiles and mammals. All of which add valuable evidence to the history of vertebrates./nToday, Darwin's theory of evolution is almost universally accepted by scientists as the best available explanation for the biological diversity on earth. Based on this supporting evidence, most scientist agree on the following three major points: 1) Earth is about 4.5 billion years old, 2) Organisms have inhabited earth for most of its history, and 3) All organisms living today evolved from earlier, simpler life-forms.\nIn summary, at age 22, Charles Darwin set off on a journey by the urging of his college professor on the naval voyage of the HMS Beagle that forever changed his life and the way people think of themselves. It was on this journey that evidence was collected to support what is universally accepted today as Darwin's Theory of Evolution."
model_predict(evolution)
corona = "A letter, published in The New England Journal of Medicine (NEJM), warns of new-onset diabetes in COVID-19 patients.\nDr Abd Tahrani, Senior Lecturer in Metabolic Endocrinology and Obesity Medicine and NIHR Clinician Scientist; University of Birmingham, said:\,“The current evidence suggest that diabetes and poor glycaemic control are risk factors for severe COVID-19 and that COVID-19 might precipitate diabetes-related ketoacidosis. However, whether COVID-19 can increase the risk of developing diabetes mellitus remains unknown and the correspondence published at NEJM provides plausible mechanisms for a bi-directional relationship between diabetes and COVID-19. Hence, it is important to examine these complex interactions between diabetes and COVID-19 further and the setting up of the global registry CoviDIAB is an important step in that regards. However, for now and till further evidence is available, it is important that we improve glycaemic control in people with diabetes to reduce the risk of severe COVID-19 as well as working on strategies that prevent the development of Type 2 diabetes.  An essential step in that direction is treating people with obesity and making obesity treatments (from life style interventions to bariatric surgery) available to patients with obesity that require treatment nationally as current treatment option availability is post code lottery.” \nDr Gabriela da Silva Xavier, Senior Lecturer in Cellular Metabolism, University of Birmingham, said:\n“The three papers that were cited to support the need for further research represents one case study describing the case of one patient with new onset diabetes and two papers describing larger cohorts with diabetes where the disease may have been aggravated by COVID-19.  None of the studies cited explicitly describe diabetes brought on by COVID-19.\n“One of the cited papers provides circumstantial evidence for potential damage of the cells that produce insulin, the pancreatic beta cells, by SARS, not COVID-19.\n“Taken together, it is currently unclear whether COVID-19 is causal for diabetes, but there is some correlation with worsening of disease with COVID-19.  It is, therefore, logical to want to investigate whether COVID-19 is causal for or exacerbates diabetes by establishing a registry where patient history of diabetes is systematically logged, as proposed in the letter. \n“In short it would be unfair to take the cited data to indicate that COVID-19 is causal of diabetes and diabetes complications but, given the observations, it is reasonable to propose to look at this carefully, as proposed in the letter.”\nProf Naveed Sattar, Professor of Metabolic Medicine, University of Glasgow, said:\n“This is definitely an important question that many doctors are watching but we need to see what happens to diabetes rates over the next 1-2 years to confirm or refute such risks.  We will certainly get these data as all national datasets are keeping close watch.  In meantime, people should be encouraged to keep active and eat healthily or as best they can to keep their weights stable or to lose a few pounds to lessen their risks of diabetes now (and in the future) and potentially to reduce their risks of developing more severe COVID-19 should then succumb to the infection. That is the best we can do and we need more governmental help to promote and make healthier lifestyles easier as we come out of lockdown.”  \nDr Riyaz Patel, Associate Professor of Cardiology & Consultant Cardiologist, UCLH, said:\n“There is no robust data yet to indicate that COVID-19 causes new diabetes or worsens existing diabetes. Some data however suggests there could be a possible link so researchers are seeking to explore this further.\n“Observational data linking the two may be confounded for a few reasons. For example we know that any stress inducing illness can cause blood sugar levels to temporarily rise and we see this for example with heart attacks. Also people who are more likely to get very sick with covid may be at risk of developing diabetes anyway perhaps because they are overweight. We know that obesity is linked to worse outcomes with COVID.\n“However there are many uncertainties about COVID, how it affects our cells and especially the longer term effects after surviving the illness. The researchers are experts in diabetes and propose to study this in more detail by collecting data at scale internationally. If there is a direct link then that will have important implications on how we treat COVID patients during and after the acute illness.\nProf Lora Heisler, Chair in Human Nutrition, University of Aberdeen, said:\n“Information collected about people who have unfortunately caught highly contagious COVID-19 has shown that some people are at more risk of developing more serious symptoms and particularly upsetting, at greater risk of not surviving if they get COVID-19.  This higher risk group includes older folks and those who already have diseases, including type 2 diabetes.  What isn’t clear is why people who already have type 2 diabetes are at greater risk for more serious symptoms and possible fatality from COVID-19.  Since COVID-19 is so new, scientists and doctors just don’t know.  Some positive news was reported by Hongliang Li earlier this month in the journal Cell Metabolism.  They found that people with type 2 diabetes with better management of their blood sugar had a better outcome with COVID-19 as compared to patients with poorer control of their blood sugar. \n“What is really important about this new initiative is that it is asking a different question – does COVID-19 trigger diabetes, either type 1 or type 2, in people who did not have diabetes before getting COVID-19?  To begin to answer this, these experts are establishing a Global Registry –  or a list – of new cases of diabetes in patients with COVID-19.  This registry is a great first step in trying to answer this question.  A challenge, however, will be trying to figure out whether the diabetes is actually new before COVID-19 because some people may have undiagnosed diabetes.  Even still, this registry will provide very valuable information about the emerging association between these two diseases.”"
model_predict(corona)
perception = "Scientists’ Perception of a “Gap” The notion of a gap between science and the media may not be a valid description of the science–media interface if it is taken to mean a gap separating scientists and journalists, with scientists standing on one side and journalists on the other. As shown later, scientists and journalists seem to get along together quite well. However, as a metaphor to describe a separation of “arenas” of internal scientific and public communication, it captures some important aspects of public science communication. According to this view, scientists are communicators in each of the two arenas,which are structured by different institutions and governed by different rules. Scientists in the “public arena”—an arena still ultimately structured by journalistic mass media—have to adjust to the logic of the media to attract attention.\nSurvey data presented later suggest that the majority of scientists do actually distinguish clearly between the arenas of internal scientific and public communication as far as journalistic mass media are concerned. This distinction has at least two aspects: the exclusion of the public from communication dealing with knowledge creation and validation, and the conceptualization of scientific knowledge as “special knowledge.” Both distinctions lead to a communication pattern that is usually labeled “popularization,” i.e., the use of selected, simplified, sensationalized, and pedagogically tailored messages when addressing the general public. These messages systematically differ from the content of the internal scientific discussion without being completely detached from it (40). Dissemination of “science reconstructed for public consumption” (41) is thus seen as a follow-up step after scientific results have been achieved.\nEmpirically, we find evidence of both aspects of demarcation of public communication from internal scientific communication in the surveys mentioned earlier. More than half of US neuroscientists and more than 60% of German neuroscientists perceive the so-called Ingelfinger rule (42) as still effective. According to that rule, “acceptance of a publication by a scientific journal [is] threatened if the research results have already been reported in the mass media” (Table 1). The data also suggest that this rule is not simply imposed on reluctant scientists by jealous journal editors attempting to protect the exclusivity of the content of their journals, but that it actually conforms to scientific norms, in particular those of the biomedical research community. In the fivecountry study of biomedical researchers in 2005 mentioned earlier, 71% to 83% of the respondents agreed that “scientists should communicate research findings to the general public only after they have been published in a scientific journal.” In another study, leading US nanoresearchers also tended to agree with that statement (35). Approximately half of the neuroscientists and scientists at large surveyed in Germany and the United States in 2011 to 2012 disagree with the demand that scientists, if asked, should “provide information about current research or research that has not yet appeared in scientific publications” (Table 1). Perhaps most relevant as an indicator of a respective norm, 48% of German scientists, 57% of German neuroscientists, and 69% of US neuroscientists think it is an important condition that makes talking to the media about research results acceptable to their peers, namely that these results have been previously published in a scientific journal (Table 1)."
model_predict(perception)
act_passage = "Of all the numerous enemies of the honey-bee, the Bee-Moth (Tinea mellonella), in climates of hot summers, is by far the most to be dreaded. So widespread and fatal have been its ravages in this country that thousands have abandoned the cultivation of bees in despair, and in districts which once produced abundant supplies of the purest honey, bee-keeping has gradually dwindled down into a very insignificant pursuit. Contrivances almost without number have been devised to defend the bees against this invidious foe, but still it continues its desolating inroads, almost unchecked, laughing as it were to scorn at all the so-called “moth-proof” hives, and turning many of the ingenious fixtures designed to entrap or exclude it into actual aids and comforts in its nefarious designs."
model_predict(act_passage)


[nltk_data] Downloading package wordnet to /Users/jonah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Reevaluating.
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Reevaluating.
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Reevaluating.
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Reevaluating.
Lexile Found!
Splitting input.
Processing input.
Returning data.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

'This text is most similar to texts suitable for middle school aged readers. The lexile match is closest to the range 1050L to 1200L.'