In [109]:
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [110]:
df = pd.read_csv('df.csv')

In [111]:
from dotenv import load_dotenv
import os

load_dotenv()

open_ai_key = os.getenv("OPEN_AI_KEY")
google_search_key = os.getenv("GOOGLE_PROGRAMMABLE_SEARCH_KEY")


In [112]:
len(df)

28682

setup

In [113]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)  

KeyboardInterrupt: 

In [None]:
from gensim.models.doc2vec import Doc2Vec
doc2vec_model = Doc2Vec.load("doc2vec_model/doc2vec_wiki_d300_n5_w8_mc50_t12_e10_dbow.model")

In [None]:
import nltk
from nltk import ngrams, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

def generate_ngram_results(set_ids, df, count_threshold = 3):
    # Initialize dictionary to store results
    ngram_results = {i: [] for i in set_ids}

    # Loop through each EssaySet
    for set_id in set_ids:
        # Filter DataFrame for current EssaySet and score1 == 2
        filtered_df = df[(df['EssaySet'] == set_id) & (df['Score1'] == 2)]
        
        # Print progress for each EssaySet
        print(f"Processing EssaySet {set_id} with {len(filtered_df)} essays.")
        
        # Initialize a Counter to count n-grams across all essays in the set
        ngram_counter = Counter()
        
        # Process each EssayText in the filtered_df DataFrame
        for essay in filtered_df['EssayText']:
            tokens = word_tokenize(essay)
            pos_tags = pos_tag(tokens)
            
            # Generate bi-grams, tri-grams, and tetra-grams
            bi_grams = list(ngrams(pos_tags, 2))
            tri_grams = list(ngrams(pos_tags, 3))
            tetra_grams = list(ngrams(pos_tags, 4))
            
            # Concatenate all n-grams into a single list and update the counter
            all_ngrams = bi_grams + tri_grams + tetra_grams
            ngram_counter.update(all_ngrams)
        
        # Filter n-grams that appeared at least 3 times
        frequent_ngrams = [ngram for ngram, count in ngram_counter.items() if count >= count_threshold]
        
        # Store the frequent n-grams in the results dictionary
        ngram_results[set_id] = frequent_ngrams
        
        # Print after processing each set
        print(f"Completed processing EssaySet {set_id}, with {len(frequent_ngrams)} n-grams.")
    
    return ngram_results


In [None]:
ngram_results = generate_ngram_results([3], df, 30)

Processing EssaySet 3 with 699 essays.
Completed processing EssaySet 3, with 403 n-grams.


In [None]:
import os

def load_prompts(set_ids):
    prompts = {}
    for set_id in set_ids:
        file_name = f"prompts/asap_{set_id:02d}.txt"
        if os.path.exists(file_name):
            with open(file_name, 'r') as file:
                prompts[set_id] = file.read().strip()
        else:
            print(f"Prompt file {file_name} not found.")
    return prompts

set_ids = list(range(1, 11))  # Example set ids, you can modify this as needed
prompts = load_prompts(set_ids)


In [126]:
import os

def load_only_prompts(set_ids):
    prompts = {}
    for set_id in set_ids:
        file_name = f"only_prompts/asap_{set_id:02d}.txt"
        if os.path.exists(file_name):
            with open(file_name, 'r') as file:
                prompts[set_id] = file.read().strip()
        else:
            print(f"Prompt file {file_name} not found.")
    return prompts

only_prompts = load_only_prompts(list(range(1, 11)))


In [117]:
import requests
import tiktoken
from bs4 import BeautifulSoup
from openai import OpenAI
client = OpenAI(api_key=open_ai_key)
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')
import json
import concurrent.futures

def extract_articles(keyword):
      url = 'https://customsearch.googleapis.com/customsearch/v1'
      params = {
          'key': "",
          'cx': '',
          'q': keyword,
          'start': 1,
          'num': 10,
          'lr': "lang_en",
          "filter": 1
      }      
      articles = []
      all_links = []
      try:
        for i in range(2):
          response = requests.get(url, params={
              **params,
              "start": i * 10 + 1
          })
          data = response.json()
          links = [item['link'] for item in data['items'] if 'link' in item]
          all_links.extend(links)

        print("All links: ", all_links)
        
        # Function to scrape a single link
        def scrape_link(link):
            try:
                page = requests.get(link, timeout=10)
                soup = BeautifulSoup(page.content, 'html.parser')
                return soup.get_text()
            except Exception as e:
                print(f"Failed to scrape {link}: {e}")
                return None

        # Use ThreadPoolExecutor to scrape links concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(scrape_link, all_links))

        # Filter out None results and extend articles list
        articles.extend([result for result in results if result is not None])

      except Exception as e:
          print(f"An error occurred: {e}")
          # all.extend(data["items"])
          
      return articles

def extract_domain_specific_keywords(prompt, answers):
    # first 50k tokens
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(answers)
    first_50000_tokens = tokens[:50000]
    extracted_text = tokenizer.decode(first_50000_tokens)

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": f"Here is a question: {prompt}. Identify 100 domain-specific keywords that are most relevant to the question from the following set of student answers, and return them in a JSON object with an array field named 'keywords':\n\n{extracted_text}"}
        ]
    )
    
    keywords = json.loads(response.choices[0].message.content)

    return keywords

def extract_tfidf_from_articles(keyword, articles):
    def preprocess(text):
        return text.lower()

    # Preprocess articles
    preprocessed_articles = [preprocess(article) for article in articles]

    # Initialize the vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the articles
    tfidf_matrix = vectorizer.fit_transform(preprocessed_articles)

    # Get the feature names (i.e., the words)
    feature_names = vectorizer.get_feature_names_out()

    # Split the keyword into individual words and lower each
    keywords = keyword.lower().split()

    # Initialize list to store tf-idf values for each word in the keyword
    tfidf_values_list = []

    # Iterate over each word in the keyword
    for word in keywords:
        try:
            # Find the index of the word
            word_index = feature_names.tolist().index(word)
            # Get the tf-idf values for the word across all documents
            tfidf_values = tfidf_matrix[:, word_index].toarray().flatten()
            # Append the average tf-idf value for the word
            tfidf_values_list.append(tfidf_values.mean())
        except ValueError:
            # If the word is not found, append 0.0
            tfidf_values_list.append(0.0)

    # Calculate the average tf-idf value for the entire keyword phrase
    if tfidf_values_list:
        average_tfidf = sum(tfidf_values_list) / len(tfidf_values_list)
    else:
        average_tfidf = 0.0

    return average_tfidf


def extract_weighted_keywords(setnumbers, df, prompts):
  weighted_keywords = {}
  all_articles = {}
  for setno in setnumbers:
    answers = " ".join(df[df['EssaySet'] == setno]['EssayText'].tolist())
    keywords = extract_domain_specific_keywords(prompts[setno], answers)["keywords"]
    # keywords = ['animals', 'Asia', 'Australia', 'Bamboo', 'Bears', 'Biologists', 'Biodiversity', 'Boa constrictor', 'Burmese python', 'Bushnell', 'Cape Coral', 'Carnivores', 'Change', 'China', 'Climates', 'Cobra', 'Cold blooded', 'Creatures', 'Customer', 'Ecosystem', 'Ecophobia', 'Effective threat', 'Endangered species', 'Environment', 'Everglades', 'Exclusive food', 'Exploded', 'Food source', 'Florida', 'Food stability', 'Food variety', 'Generalists', 'Government', 'Habitat', 'Hankering', 'Herbivores', 'Hood spread', 'Households', 'Humans', 'Hurricane Andrew', 'Invasive species', 'Key Largo wood rat', 'Kimberly', 'Koala', 'Lizards', 'Macroherbivores', 'Macinnes', 'Madagascar', 'Mammals', 'Mcinnes', 'Microchip', 'Monitor lizard', 'Natural predators', 'Niche environments', 'Nile monitor lizard', 'Non-native', 'Organisms', 'Panda', 'Pandas', 'Permit', 'Pets', 'Plants', 'Policy adviser', 'Potomac', 'Predators', 'Prey', 'Python', 'Raccoon', 'Radiated tortoises', 'Reed', 'Reptiles', 'Resource-specific', 'RobRoy Macinnes', 'Scales', 'Scorpion', 'Sensitive habitats', 'Skip Snow', 'Specialists', 'Stability', 'Survival', 'Tourists', 'Traditional diet', 'Urban areas', 'U.S. Fish and Wildlife Service', 'U.S. Geological Survey', 'Unique habitat', 'Variety of foods', 'Warm blooded', 'Weeds', 'Wildlife biologist', 'Zoologists']
    
    set_scores = {}

    total_keywords = len(keywords)
    for i, keyword in enumerate(keywords, 1):
       articles = extract_articles(keyword)
       tfidf = extract_tfidf_from_articles(keyword, articles)
       
       set_scores[keyword] = tfidf
       print(f"Progress: {i}/{total_keywords} - keyword: {keyword}, tfidf: {tfidf}")
       
       if setno not in all_articles:
           all_articles[setno] = {}
       all_articles[setno][keyword] = articles
    
    weighted_keywords[setno] = set_scores
  
  return weighted_keywords, all_articles

weighted_keywords, all_articles = extract_weighted_keywords([3], df, prompts)

    
   
   


[nltk_data] Downloading package punkt to /Users/josephtey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


All links:  ['https://en.wikipedia.org/wiki/China', 'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/china-travel-advisory.html', 'https://english.www.gov.cn/', 'https://www.reuters.com/world/china/', 'https://m.airchina.com.cn/', 'https://www.state.gov/countries-areas/china/', 'https://www.stats.gov.cn/english/', 'https://www.china-airlines.com/us/en', 'https://www.fmprc.gov.cn/mfa_eng/', 'https://obamawhitehouse.archives.gov/the-press-office/2014/11/11/us-china-joint-announcement-climate-change', 'https://us.ceair.com/', 'https://www.chinamobileltd.com/', 'https://www.nottingham.edu.cn/en/index.aspx', 'https://www.census.gov/foreign-trade/balance/c5700.html', 'https://www.cia.gov/the-world-factbook/countries/china/', 'https://obamawhitehouse.archives.gov/the-press-office/2015/09/25/us-china-joint-presidential-statement-climate-change', 'https://www.cnsa.gov.cn/english/', 'https://www.scmp.com/asia', 'http://english.customs.gov.cn/', 'https://www.theguardi

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 5/94 - keyword: specialist, tfidf: 0.07720580872908468
All links:  ['https://www.eatsonponce.net/', 'https://www.ubereats.com/', 'https://www.dictionary.com/browse/eats', 'https://www.seriouseats.com/', 'https://www.youtube.com/@KatinaEatsKilos', 'https://eatsparkcity.org/', 'https://www.eatsrestaurantsf.com/', 'https://joanne-eatswellwithothers.com/', 'https://www.facebook.com/katinaeatskilos/', 'https://www.howsweeteats.com/', 'https://www.instagram.com/eats.and.the.city/', 'https://theeatsshow.us.messefrankfurt.com/us/en.html', 'https://www.reddit.com/r/seriouseats/', 'https://www.recipetineats.com/category/quick-and-easy/', 'https://littleeatsandthings.com/', 'https://www.recipetineats.com/', 'https://www.instagram.com/joelle_eats_cake/?hl=en', 'https://manila-eats.com/', 'https://www.instagram.com/dajeneats/?hl=en', 'https://www.tiktok.com/@how.kev.eats?lang=en']
Progress: 6/94 - keyword: eats, tfidf: 0.08731380107729611
All links:  ['https://www.bamboohr.com/', 'https:/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to scrape https://catsexclusively.com/: HTTPSConnectionPool(host='catsexclusively.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1122)')))
Progress: 14/94 - keyword: exclusively, tfidf: 0.07377445231283361
All links:  ['https://www.habitat.org/', 'https://www.habitat.com/', 'https://www.habitat.org/restores', 'https://habitatmetrodenver.org/', 'https://en.wikipedia.org/wiki/Habitat', 'https://www.ashevillehabitat.org/home/', 'https://www.1hotels.com/south-beach/taste/habitat', 'https://www.habitatcincinnati.org/', 'https://www.habitatphiladelphia.org/', 'https://www.habitatla.org/', 'https://community.chef.io/tools/chef-habitat', 'https://habitatportlandregion.org/', 'https://habitatskateboards.com/', 'https://www.cvillehabitat.org/', 'https://www.clevelandhabitat.org/', 'https://www.tchabitat.org/', 'https://habitatforsyt

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 28/94 - keyword: regions, tfidf: 0.06623139374332712
All links:  ['https://www.merriam-webster.com/dictionary/natural', 'https://www.nrcs.usda.gov/', 'https://www.youtube.com/@NaturalWorldFacts', 'https://naturalhistory.si.edu/', 'https://naturalcapitalproject.stanford.edu/', 'https://naturalbridgecaverns.com/', 'https://www.naturalgrocers.com/', 'https://www.naturalearthdata.com/', 'https://www.naturalbalanceinc.com/', 'https://nhm.org/', 'https://www.amnh.org/', 'https://naturalbridgeva.com/', 'https://www.nrdc.org/', 'https://capitalscoalition.org/capitals-approach/natural-capital-protocol/', 'https://cloud.google.com/natural-language', 'https://en.wikipedia.org/wiki/Natural_science', 'https://nhmu.utah.edu/', 'https://github.com/NaturalNode/natural', 'https://www.fda.gov/food/food-labeling-nutrition/use-term-natural-food-labeling', 'https://www.thesaurus.com/browse/natural']
Progress: 29/94 - keyword: natural, tfidf: 0.10603374673595238
All links:  ['https://en.wikipedia.

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 30/94 - keyword: displacement, tfidf: 0.08989628883831943
All links:  ['https://www.nhl.com/predators/', 'https://www.imdb.com/title/tt1424381/', 'https://en.wikipedia.org/wiki/Predators_(film)', 'https://www.espn.com/nhl/team/_/name/nsh/nashville-predators', 'https://www.tennessean.com/sports/predators/', 'https://www.facebook.com/nashvillepredators/', 'https://www.instagram.com/predsnhl/?hl=en', 'https://www.cnn.com/2019/03/07/politics/biden-1993-speech-predators/index.html', 'https://www.orlandopredatorsfootball.com/', 'https://www.mass.gov/news/protect-your-poultry-from-predators', 'https://apnews.com/article/nhl-free-agency-c8524040f3013d2335f900755a0b39bb', 'https://puckpedia.com/team/nashville-predators', 'https://nestwatch.org/learn/all-about-birdhouses/dealing-with-predators/', 'https://www.rottentomatoes.com/m/10012256-predators', 'https://forum.inaturalist.org/t/my-garden-has-too-many-predators-of-monarch-larvae-any-recommendations/34174', 'https://twitter.com/elit

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 33/94 - keyword: threat, tfidf: 0.08259975680621848
All links:  ['https://survivalinternational.org/', 'https://www.iiss.org/en/publications/survival/', 'https://www.redcross.org/get-help/how-to-prepare-for-emergencies/survival-kit-supplies.html', 'https://cran.r-project.org/package=survival', 'https://www.culturalsurvival.org/', 'https://mustangsurvival.com/', 'https://www.merriam-webster.com/dictionary/survival', 'https://www.cancerresearchuk.org/health-professional/cancer-statistics/survival', 'https://www.nybooks.com/online/2016/11/10/trump-election-autocracy-rules-for-survival/', 'https://cpw.state.co.us/learn/Pages/Survival.aspx', 'https://www.iucn.org/our-union/commissions/iucn-species-survival-commission-2021-2025', 'https://youngsurvival.org/', 'https://www.aza.org/species-survival-plan-programs', 'https://www.ready.gov/kit', 'https://www.survivalsystemsinc.com/', 'https://www.tandfonline.com/toc/tsur20/current', 'https://www.youtube.com/channel/UCA7RTjJU42Y3toOoRXiF

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 39/94 - keyword: urbanization, tfidf: 0.06850811787013947
All links:  ['https://climate.com/', 'http://www.climate.gov/', 'https://www.climatecentral.org/', 'https://www.climateweeknyc.org/', 'https://www.theclimategroup.org/', 'https://www.noaa.gov/climate', 'https://www.merriam-webster.com/dictionary/climate', 'https://www.cpc.ncep.noaa.gov/', 'https://www.climaterealityproject.org/', 'https://science.nasa.gov/climate-change/', 'https://www.theclimatepledge.com/', 'https://www.ipcc.ch/', 'https://www.climateactionreserve.org/', 'https://www.climateaction100.org/', 'https://www.un.org/en/climatechange/what-is-climate-change', 'https://citizensclimatelobby.org/', 'https://screeningtool.geoplatform.gov/', 'https://toolkit.climate.gov/content/us-climate-resilience-toolkit', 'https://www.cif.org/', 'https://unfccc.int/process-and-meetings/the-paris-agreement']
Progress: 40/94 - keyword: climate, tfidf: 0.18103328899489737
All links:  ['https://www.merriam-webster.com/dictionary/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to scrape https://ocfs.ny.gov/programs/fostercare/education.php: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 46/94 - keyword: stability, tfidf: 0.07269392539809184
All links:  ['https://www.change.org/', 'https://www.ipcc.ch/', 'https://unfccc.int/', 'https://www.changehealthcare.com/', 'https://www.merriam-webster.com/dictionary/change', 'https://www.un.org/en/climatechange/what-is-climate-change', 'https://colorofchange.org/', 'https://www.uscis.gov/addresschange', 'https://science.nasa.gov/climate-change/', 'https://www.dmv.pa.gov/Driver-Services/Driver-Licensing/pages/change-your-name-or-address.aspx', 'https://apps.ilsos.gov/addrchange/', 'https://www.dmv.virginia.gov/online-services/address-change', 'https://egov.uscis.gov/coa/displayCOAForm.do', 'https://www.southwest.com/air/change/', 'https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/change_event', 'https://climateknowledgeportal.worldbank.org/home', 'https://www.va.gov/education/apply-for-education-benefits/application/1995/', 'https://travel.state.gov/content/travel/en/passports/have-passport/change-correct.htm

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to scrape https://ocfs.ny.gov/programs/fostercare/education.php: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to scrape https://home.treasury.gov/policy-issues/financial-markets-financial-institutions-and-fiscal-service/fsoc: HTTPSConnectionPool(host='home.treasury.gov', port=443): Read timed out. (read timeout=10)
Progress: 49/94 - keyword: stability, tfidf: 0.07313694037192116
All links:  ['https://www.webmd.com/diet/default.htm', 'https://www.who.int/news-room/fact-sheets/detail/healthy-diet', 'https://en.wikipedia.org/wiki/Diet_(nutrition)', 'https://dceg.cancer.gov/research/who-we-study/nih-aarp-diet-health-study', 'https://my.clevelandclinic.org/health/articles/16037-mediterranean-diet', 'https://www.coca-cola.com/us/en/brands/diet-coke', 'https://www.healthline.com/nutrition/best-diet-plans', 'https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/mediterranean-diet/art-20047801', 'https://www.nhlbi.nih.gov/education/dash-eating-plan', 'https://www.psychologytoday.com/us/basics/diet', 'https://www.merriam-webster.com/dictionary/diet', 'https://jamanetw

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 60/94 - keyword: pests, tfidf: 0.05285490212813814
All links:  ['https://www.range.com/', 'https://www.range.co/', 'https://www.merriam-webster.com/dictionary/range', 'https://www.range.net/', 'https://rangechicago.com/', 'https://www.frontrange.edu/index.html', 'https://docs.python.org/3/library/stdtypes.html', 'https://www.rangebank.com/', 'https://docs.python.org/3/library/functions.html', 'https://rangesa.com/', 'https://www.rangedowntown.com/', 'https://www.facebook.com/therangemusic/', 'https://www.php.net/manual/en/function.range.php', 'https://rangecafe.com/', 'https://www.landrover.com/range-rover/range-rover/index.html', 'https://www.landroverusa.com/range-rover/range-rover/index.html', 'http://burrow.com/living-room/seating/range', 'https://rangeusa.com/', 'https://www.dictionary.com/browse/range', 'https://developers.google.com/apps-script/reference/spreadsheet/range']
Progress: 61/94 - keyword: range, tfidf: 0.13915301297213176
All links:  ['https://www.spreadbag

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 67/94 - keyword: variation, tfidf: 0.07388602791729074
All links:  ['https://www.census.gov/popclock/', 'https://www.worldometers.info/world-population/', 'https://data.worldbank.org/indicator/SP.POP.TOTL', 'https://population.un.org/wpp/', 'https://popcouncil.org/', 'https://www.prb.org/', 'https://ourworldindata.org/population-growth', 'https://data.oecd.org/pop/population.htm', 'https://www.un.org/development/desa/pd/', 'https://www.unfpa.org/', 'https://ofm.wa.gov/washington-data-research/population-demographics', 'https://en.wikipedia.org/wiki/Population', 'https://www.populationmedia.org/', 'https://ofm.wa.gov/sites/default/files/public/dataresearch/pop/april1/ofm_april1_population_final.pdf', 'https://populationconnection.org/', 'https://www.osbm.nc.gov/facts-figures/population-demographics/state-demographer/countystate-population-projections', 'https://data.census.gov/', 'https://bhw.hrsa.gov/workforce-shortage-areas/shortage-designation', 'https://opb.georgia.gov/cen

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 68/94 - keyword: population, tfidf: 0.12411888422974915
All links:  ['https://www.ecdc.europa.eu/en/publications-and-data/monitoring/weekly-threats-reports', 'https://www.congress.gov/bill/115th-congress/senate-bill/2836', 'https://www.ecdc.europa.eu/en/publications-data/communicable-disease-threats-report-22-28-june-2024-week-26', 'https://www.justice.gov/opa/blog/justice-department-launches-task-force-combat-threats-against-election-workers-0', 'https://www.fws.gov/library/collections/threats-birds', 'https://myfwc.com/research/wildlife/sea-turtles/threats/', 'http://www.uscp.gov/media-center/press-releases/uscp-threat-assessment-cases-2023', 'https://www.nps.gov/subjects/bats/threats-to-bats.htm', 'https://www.cdc.gov/antimicrobial-resistance/data-research/threats/index.html', 'https://www.nato.int/cps/en/natohq/topics_156338.htm', 'https://training.fema.gov/is/courseoverview.aspx?code=is-915', 'https://www.dni.gov/files/ODNI/documents/assessments/ICA-declass-16MAR21.pdf',

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 69/94 - keyword: threats, tfidf: 0.05279186253227881
All links:  ['https://www.un.org/en/climatechange/science/climate-issues/biodiversity', 'https://www.worldwildlife.org/pages/what-is-biodiversity', 'https://education.nationalgeographic.org/resource/biodiversity/', 'https://en.wikipedia.org/wiki/Biodiversity', 'https://environment.ec.europa.eu/strategy/biodiversity-strategy-2030_en', 'https://www.amnh.org/research/center-for-biodiversity-conservation/what-is-biodiversity', 'https://www.cepf.net/our-work/biodiversity-hotspots', 'https://www.conservation.org/priorities/biodiversity-hotspots', 'https://www.ipbes.net/node/36759', 'https://www.biodiversitylibrary.org/', 'https://www.gbif.org/', 'https://www.cbd.int/sp/targets', 'https://www.financeforbiodiversity.org/', 'https://www.unep.org/news-and-stories/press-release/our-global-food-system-primary-driver-biodiversity-loss', 'https://unbiodiversitylab.org/en/', 'https://nap.nationalacademies.org/catalog/989/biodiversity', 'h

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 77/94 - keyword: feeding habits, tfidf: 0.025538463930587653
All links:  ['https://www.merriam-webster.com/dictionary/reproduce', 'https://oceanservice.noaa.gov/education/tutorial_corals/coral06_reproduction.html', 'https://wyss.harvard.edu/news/team-builds-first-living-robots-that-can-reproduce/', 'https://en.wikipedia.org/wiki/Reproduction', 'https://discuss.dvc.org/t/first-run-of-dvc-getting-a-failed-to-reproduce-error/171', 'https://coral.org/en/coral-reefs-101/how-corals-reproduce/', 'https://www.usda.gov/peoples-garden/pollinators', 'https://support.microsoft.com/en-us/windows/record-steps-to-reproduce-a-problem-46582a9b-620f-2e36-00c9-04e25d784e47', 'https://www.cnn.com/2021/11/29/americas/xenobots-self-replicating-robots-scn/index.html', 'https://www.purdue.edu/newsroom/releases/2013/Q1/scientists-learning-how-multiple-genome-plants-reproduce.html', 'https://authorservices.wiley.com/author-resources/Journal-Authors/Prepare/permissions-material.html', 'https://arxiv.or

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 79/94 - keyword: potentially, tfidf: 0.04025698969680286
All links:  ['https://www.merriam-webster.com/dictionary/endure', 'https://www.amazon.com/Endure-Curiously-Elastic-Limits-Performance/dp/0062499866', 'https://neuroscienceblueprint.nih.gov/training/endure-undergraduate-education', 'https://www.dictionary.com/browse/endure', 'http://www.endure-network.eu/', 'https://dictionary.cambridge.org/dictionary/english/endure', 'https://www.instagram.com/project.endure/', 'https://dictionary.cambridge.org/us/dictionary/english/endure', 'https://pathfinderrucktraining.com/products/pathfinder-endure-intermediate-ruck-training', 'https://www.nytimes.com/2010/01/03/weekinreview/03carr.html', 'https://groundedcuriosity.com/book-review-endure-mind-body-and-the-curiously-elastic-limits-of-human-performance-by-alex-hutchinson/', 'https://endure.wustl.edu/apply/', 'https://www.traceminerals.com/products/endure-performance-electrolyte-drops', 'https://specialinterestno.bandcamp.com/album/en

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 83/94 - keyword: living conditions, tfidf: 0.05974287400783851
All links:  ['https://en.wikipedia.org/wiki/Temperature', 'https://www.britannica.com/science/temperature', 'http://www.climate.gov/news-features/understanding-climate/climate-change-global-temperature', 'https://www.merriam-webster.com/dictionary/temperature', 'https://www.foodsafety.gov/food-safety-charts/safe-minimum-internal-temperatures', 'https://climate.nasa.gov/vital-signs/global-temperature/?intent=121', 'http://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/safe-temperature-chart', 'https://www.oaktreecapital.com/insights/memo/taking-the-temperature', 'https://www.prelectronics.com/products/temperature-transmitters/', 'https://www.cpc.ncep.noaa.gov/', 'https://ugc.berkeley.edu/background-content/temperature/', 'https://www.weather.gov/forecastmaps/', 'https://www.onsetcomp.com/products/data-loggers-sensors/temperature', 'https://www.dictionary.com/browse/temperature',

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 87/94 - keyword: resource dependency, tfidf: 0.04786770143877246
All links:  ['https://www.sciencedirect.com/topics/biochemistry-genetics-and-molecular-biology/species-coexistence', 'https://besjournals.onlinelibrary.wiley.com/doi/10.1111/1365-2656.12879', 'https://www.journals.uchicago.edu/doi/10.1086/282505', 'https://www.jstor.org/stable/2459090', 'https://www.nature.com/articles/s41467-023-43279-y', 'https://www.cell.com/trends/ecology-evolution/fulltext/S0169-5347(20)30339-6', 'https://esajournals.onlinelibrary.wiley.com/doi/abs/10.1890/0012-9658(2001)082[0175%3AAVBIOH]2.0.CO%3B2', 'https://aslopubs.onlinelibrary.wiley.com/doi/abs/10.4319/lo.1978.23.6.1126', 'https://esajournals.onlinelibrary.wiley.com/doi/10.1890/10-0154.1', 'https://scholarworks.umt.edu/cgi/viewcontent.cgi?article=1034&context=wildbio_pubs', 'https://en.wikipedia.org/wiki/Coexistence_theory', 'https://nph.onlinelibrary.wiley.com/doi/abs/10.1111/nph.16865', 'https://academic.oup.com/jmammal/article/83/1

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 88/94 - keyword: coexisting species, tfidf: 0.03709276424597342
All links:  ['https://en.wikipedia.org/wiki/Predation', 'https://www.montecookgames.com/store/product/predation/', 'https://www.sciencedirect.com/topics/agricultural-and-biological-sciences/predation', 'https://www.reddit.com/r/mtgrules/comments/18hw122/ezuris_predation_and_ward/', 'https://www.fisheries.noaa.gov/feature-story/first-direct-evidence-killer-whale-predation-bowhead-whales-us-pacific-arctic', 'https://www.merriam-webster.com/dictionary/predation', 'https://www.nature.com/scitable/knowledge/library/predation-herbivory-and-parasitism-13261134/', 'https://www.ecologyandsociety.org/vol6/iss1/art11/', 'https://pubs.nmsu.edu/_circulars/CR688/', 'https://uapress.arizona.edu/book/landscapes-of-movement-and-predation', 'https://oceanservice.noaa.gov/education/tutorial_corals/coral08_naturalthreats.html', 'https://www.gwct.org.uk/game/research/predation-control/', 'https://www.britannica.com/science/predation'

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Progress: 93/94 - keyword: confinement, tfidf: 0.09750227742822383
All links:  ['https://reliance.com/', 'https://www.reliancetelephone.com/', 'https://www.ril.com/', 'https://reliancebank.bank/', 'https://www.relianceorthodontics.com/', 'https://reliancebankmn.com/', 'https://www.reliancestandard.com/home/', 'https://www.reliancecu.org/', 'https://www.youtube.com/channel/UCIMXKin1fXXCeq2UJePJEog/videos', 'https://en.wikipedia.org/wiki/Reliance_Industries', 'https://www.greif.com/reliance-products/', 'https://ilsr.org/', 'https://www.theparisreview.org/blog/2020/01/15/the-myth-of-self-reliance/', 'https://smartirb.org/reliance/', 'https://getreliancehealth.com/', 'https://research.rutgers.edu/faculty-staff/compliance/human-research-protection/reliance', 'https://www.research.uky.edu/office-research-integrity/single-irb-reliance', 'https://www.irs.gov/newsroom/general-overview-of-taxpayer-reliance-on-guidance-published-in-the-internal-revenue-bulletin-and-faqs', 'https://irbreliance.uco

In [140]:
from utils.feature_extractor import FeatureExtractor

feature_extractor = FeatureExtractor(prompts, only_prompts, model, doc2vec_model, ngram_results, weighted_keywords)

train model

In [142]:
from numpy import hstack

# run this for everythting ... so i can just do it once, and then save it as a csv to re-load ... 
def add_features_to_df(df, feature_extractor, add_word2vec=False, add_doc2vec=False, add_pos=False, add_prompt_overlap=False, add_weighted_keywords=False, add_lexical_overlap=False, add_stylistic_features=False, add_logical_operators=False, add_temporal_features=False):
    if add_word2vec:
        df['word2vec_features'] = None
    if add_doc2vec:
        df['doc2vec_features'] = None
    if add_pos:
        df['pos_features'] = None
    if add_prompt_overlap:
        df['prompt_overlap_features'] = None
    if add_weighted_keywords:
        df['weighted_keywords_features'] = None
    if add_lexical_overlap:
        df['lexical_overlap_features'] = None
    if add_stylistic_features:
        df['stylistic_features'] = None
    if add_logical_operators:
        df['logical_operators_features'] = None
    if add_temporal_features:
        df['temporal_features'] = None

    total_items = len(df)
    print(f"Processing {total_items} items...")

    # Iterate over each row in the training DataFrame
    num = 0
    for index, row in df.iterrows():
        # Extract features using the feature_extractor object
        if add_word2vec:
            word2vec_features = feature_extractor.word2vec(row['EssayText'])
            df.at[index, 'word2vec_features'] = word2vec_features
        if add_doc2vec:
            doc2vec_features = feature_extractor.doc2vec(row['EssayText'])
            df.at[index, 'doc2vec_features'] = doc2vec_features
        if add_pos:
            pos_features = feature_extractor.pos(row['EssaySet'], row['EssayText'])
            df.at[index, 'pos_features'] = pos_features
        if add_prompt_overlap:
            prompt_overlap_features = feature_extractor.prompt_overlap(row['EssaySet'], row['EssayText'])
            df.at[index, 'prompt_overlap_features'] = prompt_overlap_features
        if add_weighted_keywords:
            weighted_keywords_features = feature_extractor.weighted_domain_specific_keywords(row['EssaySet'], row['EssayText'])
            df.at[index, 'weighted_keywords_features'] = weighted_keywords_features
        if add_lexical_overlap:
            lexical_overlap_features = feature_extractor.lexical_overlap(row['EssaySet'], row['EssayText'])
            df.at[index, 'lexical_overlap_features'] = lexical_overlap_features
        if add_stylistic_features:
            stylistic_features = feature_extractor.stylistic_features(row['EssayText'])
            df.at[index, 'stylistic_features'] = stylistic_features
        if add_logical_operators:
            logical_operators_features = feature_extractor.logical_operators(row['EssayText'])
            df.at[index, 'logical_operators_features'] = logical_operators_features
        if add_temporal_features:
            temporal_features = feature_extractor.temporal_features(row['EssayText'])
            df.at[index, 'temporal_features'] = temporal_features

        # Update after every 1000 items
        if (num) % 1000 == 0:
            print(f"Processed {num} items.")
        num += 1

    return df

In [144]:
essay_set = 3

In [145]:
essay_set_df = df[df['EssaySet'] == essay_set].copy(deep=True)

In [161]:
# Save the updated DataFrame to a CSV file
import os
from datetime import datetime

# Execute the function to add features to the DataFrame and display the updated DataFrame
add_features_to_df(essay_set_df, feature_extractor, add_logical_operators=True)

# Create the folder if it doesn't exist
if not os.path.exists('updated_features'):
    os.makedirs('updated_features')

# Get the current timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save the updated DataFrame to a CSV file with the timestamp
essay_set_df.to_csv(f'updated_features/updated_features_{timestamp}.csv', index=False)


Processing 3152 items...
Processed 0 items.
Processed 1000 items.
Processed 2000 items.
Processed 3000 items.


In [162]:
essay_set_df

Unnamed: 0,Id,DataSet,EssaySet,Score1,Score2,EssayText,Word_Count,word2vec_features,doc2vec_features,pos_features,prompt_overlap_features,weighted_keywords_features,lexical_overlap_features,stylistic_features,logical_operators_features,temporal_features
4917,4918,Train,3,1,1,China's panda and Australia's koala are two an...,50,"[0.016498972, 0.031992994, -0.05212467, 0.1381...","[-0.12533814, -0.19425772, -0.11712208, 0.1578...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...",[0.5],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.05],"[57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 1, 0.03508771929824561, 1, 57, 3]","[8, 0, 0, 10, 2, 0, 0]"
4918,4919,Train,3,1,2,Pandas and koalas are similar because they are...,57,"[0.024004769, 0.044829894, -0.07149012, 0.1326...","[-0.054628838, -0.020331863, -0.1796877, 0.262...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...",[0.54],"[0.0, 0.0, 0.6039808046123907, 0.0, 0.0, 0.0, ...",[0.06],"[63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[5, 2, 0.07936507936507936, 1, 63, 2]","[6, 1, 0, 9, 0, 1, 0]"
4919,4920,Train,3,1,1,Pandas in China and Koalas in Australia are si...,33,"[0.026684571, 0.032433067, -0.060349528, 0.149...","[0.030490095, -0.09904233, 0.011396943, 0.2152...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...",[0.42],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.03],"[36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 1, 0.05555555555555555, 0, 36, 3]","[6, 0, 0, 5, 1, 0, 0]"
4920,4921,Train,3,2,1,Pandas in China only eat bamboo and Koalas in ...,32,"[0.006107875, 0.04953003, -0.058074407, 0.1578...","[-0.047954723, -0.00897786, -0.018569963, 0.12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[0.38],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.03],"[34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 0.029411764705882353, 0, 34, 2]","[5, 1, 0, 4, 1, 1, 0]"
4921,4922,Train,3,0,0,Pandas in China and koalas from Australia are ...,27,"[0.0075033437, 0.01469695, -0.046362836, 0.130...","[0.11114751, -0.024824172, -0.093851134, 0.109...","[1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, ...",[0.5],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.03],"[29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 0.034482758620689655, 0, 29, 2]","[4, 0, 0, 4, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8064,8065,Priva,3,2,2,China's panda are similar to koalas because Th...,14,"[-0.020585494, 0.07710405, -0.07188554, 0.1588...","[0.06200168, -0.13997632, -0.08725603, 0.11112...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[0.33],"[0.0, 0.4403533819564889, 0.0, 0.0, 0.0, 0.0, ...",[0.02],"[16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 0.0625, 0, 16, 1]","[3, 0, 0, 3, 0, 0, 0]"
8065,8066,Priva,3,1,1,Pandas from China are similar to koalas in Aus...,25,"[0.009386699, 0.0152493315, -0.07514191, 0.119...","[0.07324394, -0.050994035, -0.07131314, 0.1118...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[0.5],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.03],"[27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0.0, 0, 27, 2]","[2, 0, 0, 2, 0, 0, 0]"
8066,8067,Priva,3,1,1,Koalas and panda are both similar in that they...,49,"[0.019533284, 0.06098972, -0.046831258, 0.1476...","[0.051570136, -0.08533205, -0.05561295, 0.2763...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...",[0.42],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.58742134...",[0.05],"[55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 1, 0.03636363636363636, 0, 55, 4]","[7, 0, 0, 9, 0, 0, 0]"
8067,8068,Priva,3,1,0,Pandas in China and koalas in Australia are si...,53,"[0.013115416, 0.043629017, -0.0131985275, 0.14...","[-0.107658185, -0.012414971, -0.031618986, 0.2...","[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",[0.5],"[0.5367291916595557, 0.4403533819564889, 0.0, ...",[0.05],"[59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[3, 1, 0.05084745762711865, 1, 59, 3]","[9, 0, 0, 11, 0, 0, 0]"


In [163]:
features = ['word2vec_features', 'doc2vec_features', 'pos_features', 'prompt_overlap_features', 'weighted_keywords_features', 'lexical_overlap_features', 'stylistic_features', 'logical_operators_features', 'temporal_features']

In [164]:
# prepare data set 
import numpy as np

training_data = essay_set_df[(essay_set_df['DataSet'] == 'Train')].copy(deep=True)
training_data = training_data.dropna(subset=['word2vec_features', 'doc2vec_features'])
X_train = np.vstack(training_data[features].apply(lambda x: np.hstack(x), axis=1).values)
y_train = training_data['Score1']

In [165]:
# train model; hyper-parameter search
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}
clf = RandomForestClassifier()
rand_search = RandomizedSearchCV(clf, 
                                param_distributions = param_dist, 
                                n_iter=5, 
                                cv=5)
                                

rand_search.fit(X_train, y_train)
best_clf = rand_search.best_estimator_

In [None]:
# faster, more estimates
clf = RandomForestClassifier(max_depth=100, n_estimators=100)
clf.fit(X_train, y_train)

best_clf = clf

test model

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix

val_set = essay_set_df[(essay_set_df['DataSet'] == 'Priva')].copy(deep=True)
val_set = val_set.dropna(subset=['word2vec_features', 'doc2vec_features'])

X_val = np.vstack(val_set[features].apply(lambda x: np.hstack(x), axis=1).values)
y_val = val_set['Score1']

y_pred = best_clf.predict(X_val)

conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

kappa_score = cohen_kappa_score(y_val, y_pred, weights='quadratic')
print(f"Cohen Kappa Score (Weighted): {kappa_score}")


Confusion Matrix:
[[ 24 123   2]
 [ 14 338   8]
 [  6  98  18]]
Cohen Kappa Score (Weighted): 0.1950373198372678
