In [18]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from difflib import SequenceMatcher

In [19]:
def scrape_and_parse(site):
    ''' Grabs prompts from the ETS site.

    Downloads ``site``, parses it with BeautifulSoup's "lxml" parser and
    builds one ``Prompt`` for every ``<div class="divider-50">`` element
    that is followed by a sibling tag.

    Args:
        site: URL of the prompt pool page to download.

    Returns:
        A list of ``Prompt`` objects, in the order the dividers are
        returned by ``find_all``.
    '''
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read instead of being leaked.
    with urlopen(site) as response:
        page = response.read()
    content = BeautifulSoup(page, "lxml")

    prompts = []

    # divider-50 tags separate topics
    for topic_header in content.find_all("div", {"class": "divider-50"}):
        # Topic is contained in <p> tags
        iter_tag = topic_header.find_next_sibling()
        if iter_tag is None:
            # A divider with nothing after it carries no prompt; skipping
            # it avoids dereferencing None further down.
            continue
        topic_string, iter_tag = grab_non_div_tag_content(iter_tag)
        # Skip over div string
        iter_tag = iter_tag.find_next()
        # Grab instruction string
        instruction_string, iter_tag = grab_non_div_tag_content(iter_tag)

        prompts.append(Prompt(topic_string, instruction_string))

    return prompts

def grab_non_div_tag_content(tag):
    ''' Grabs any non-div tag content from a tag.

    Walks forward from ``tag`` with ``find_next()`` until a ``div`` is
    reached (or the document ends) and concatenates the text found on
    the way.

    Args:
        tag: element to start from; ``None`` is accepted and yields an
            empty string.

    Returns:
        A ``(string, tag)`` tuple: the collected text and the element the
        walk stopped on (the ``div``, or ``None`` if the document ended
        first).
    '''
    pieces = []
    # ``tag is not None`` guards against find_next() running off the end
    # of the document, which used to raise AttributeError on ``.name``.
    while tag is not None and tag.name != 'div':
        if tag.string is not None:
            pieces.append(tag.string + '\n')
        # Edge case where italics mess up the .string function
        elif tag.name == 'p' and tag.contents:
            leading = tag.contents[0]
            # Only plain text can be concatenated. A nested tag as first
            # child is skipped here; it is visited by a later find_next()
            # step whenever that walk descends into it.
            if isinstance(leading, str):
                pieces.append(leading)

        tag = tag.find_next()

    return (''.join(pieces), tag)

class Prompt:
    ''' Class for storing topics and instructions.

    Both values are stored as ASCII ``bytes``; characters that cannot be
    encoded as ASCII are replaced with ``?``.
    '''
    def __init__(self, topic, instructions):
        ''' Encodes and stores the two text fields.

        Args:
            topic: topic text (``str``).
            instructions: instruction text (``str``).
        '''
        self.topic = topic.encode('ascii', 'replace')
        self.instructions = instructions.encode('ascii', 'replace')

    def copy(self):
        ''' Returns an independent deep copy of this prompt. '''
        # Imported locally because the notebook never imports ``copy`` at
        # the top level; without it this method raised NameError.
        import copy as _copy
        return _copy.deepcopy(self)

In [20]:
# Download and parse the "issue" prompt pool page (performs a network request).
prompts_issue = scrape_and_parse("https://www.ets.org/gre/revised_general/prepare/analytical_writing/issue/pool")

In [21]:
# Download and parse the "argument" prompt pool page (performs a network request).
prompts_argument = scrape_and_parse("https://www.ets.org/gre/revised_general/prepare/analytical_writing/argument/pool")

In [22]:
# Sanity check: report how many Prompt objects each scrape produced.
print(f'There are {len(prompts_issue)} issue prompts and {len(prompts_argument)} argument prompts')

There are 149 issue prompts and 175 argument prompts


In [23]:
import re

# Map every distinct word to the ascending list of issue-topic indices
# in which it occurs.
track_words = {}
for index, issue in enumerate(prompts_issue):
    raw_topic = issue.topic
    # Prompt stores ASCII bytes. str() on bytes yields the "b'...\\n'" repr,
    # whose escaped newlines leave a stray "n" glued to the following word
    # once non-letters are blanked out, so decode instead.
    topic = raw_topic.decode('ascii') if isinstance(raw_topic, bytes) else str(raw_topic)
    # Every non-letter (newlines included) becomes a word separator.
    topic_clean = re.sub("[^a-zA-Z]", " ", topic)
    # dict.fromkeys drops repeated words within one topic while keeping
    # first-seen order, so each index is recorded at most once per word.
    for word in dict.fromkeys(topic_clean.split()):
        track_words.setdefault(word, []).append(index)

In [24]:
# For every tracked word, the number of topic indices recorded for it.
count_words = {word: len(topic_indices) for word, topic_indices in track_words.items()}

In [25]:
# One letters-only string per issue topic (every non-letter becomes a space).
sentences = []
for issue in prompts_issue:
    raw_topic = issue.topic
    # Decode the stored ASCII bytes rather than slicing the "b'...\\n'" repr:
    # the fixed [2:-2] slice assumed a trailing newline and otherwise cut off
    # the last real letter, and escaped internal newlines left a stray "n"
    # in front of the next word.
    topic = raw_topic.decode('ascii') if isinstance(raw_topic, bytes) else str(raw_topic)
    sentences.append(re.sub("[^a-zA-Z]", " ", topic))
    

In [26]:
def word_tokenizer(text):
    ''' Tokenizes ``text``, drops English stop words and stems the rest.

    Args:
        text: the string to tokenize.

    Returns:
        A list of Porter-stemmed tokens, in their original order, for
        every token that is not in NLTK's English stop-word list.
    '''
    # Build the stop-word set once per call. The original re-evaluated
    # stopwords.words('english') and scanned the resulting list for every
    # single token.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [stemmer.stem(token)
            for token in word_tokenize(text)
            if token not in english_stopwords]


def cluster_sentences(sentences, nb_of_clusters=5, random_state=None):
    ''' Clusters sentences with k-means over their TF-IDF vectors.

    Args:
        sentences: sequence of strings to cluster.
        nb_of_clusters: number of k-means clusters to request.
        random_state: forwarded to ``KMeans``. The default ``None`` keeps
            the previous unseeded behaviour; pass an int to make the
            clustering reproducible between runs.

    Returns:
        A dict mapping each cluster label that was assigned to at least
        one sentence to the list of indices (into ``sentences``) carrying
        that label. Labels with no members are absent from the dict.
    '''
    # NOTE(review): a custom tokenizer that stems is combined with an
    # unstemmed stop_words list; the notebook's captured output suggests
    # scikit-learn warns about this -- confirm whether stop_words is
    # still wanted here.
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.1,
                                       lowercase=True)
    # builds a tf-idf matrix for the sentences
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
    kmeans.fit(tfidf_matrix)
    # Group sentence indices by the label k-means gave them.
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
        clusters[label].append(i)
    return dict(clusters)

In [27]:
# Fetch the NLTK data packages: 'stopwords' backs stopwords.words('english');
# 'punkt' is presumably what word_tokenize needs -- TODO confirm.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/julian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Cluster the cleaned topics and print the members of every cluster.
nclusters = 20
clusters = cluster_sentences(sentences, nclusters)
for cluster in range(nclusters):
    print("cluster {}:".format(cluster))
    # cluster_sentences returns a plain dict that only contains labels with
    # at least one member, so use .get() to print an empty cluster instead
    # of raising KeyError.
    for i, sentence_index in enumerate(clusters.get(cluster, [])):
        print("\tsentence {}: {}".format(i, sentences[sentence_index]))

  'stop_words.' % sorted(inconsistent))


cluster 0:
	sentence 0: Scandals are useful because they focus our attention on problems in ways that no speaker or reformer ever could  
	sentence 1: Laws should be flexible enough to take account of various circumstances  times  and places  
	sentence 2: The best way to teach is to praise positive actions and ignore negative ones  
	sentence 3: As we acquire more knowledge  things do not become more comprehensible  but more complex and mysterious  
	sentence 4: It is primarily through our identification with social groups that we define ourselves  
	sentence 5: The greatness of individuals can be decided only by those who live after them  not by their contemporaries  
	sentence 6: The increasingly rapid pace of life today causes more problems than it solves  
	sentence 7: The best way to teach is to praise positive actions and ignore negative ones  
	sentence 8: If a goal is worthy  then any means taken to attain it are justifiable  
	sentence 9: Many important discoveries or creatio

In [29]:
def _clean_topics(prompts):
    ''' Returns one letters-only string per prompt topic.

    Every character outside a-z / A-Z is replaced with a space.
    '''
    cleaned = []
    for prompt in prompts:
        raw_topic = prompt.topic
        # Decode the stored ASCII bytes rather than slicing the
        # "b'...\\n'" repr: the fixed [2:-2] slice assumed a trailing
        # newline, and escaped internal newlines left a stray "n" in
        # front of the next word.
        topic = raw_topic.decode('ascii') if isinstance(raw_topic, bytes) else str(raw_topic)
        cleaned.append(re.sub("[^a-zA-Z]", " ", topic))
    return cleaned


sentences_issue = _clean_topics(prompts_issue)
sentences_argument = _clean_topics(prompts_argument)

In [31]:
from difflib import SequenceMatcher
def similar(a, b):
    ''' Returns difflib's similarity ratio for the sequences ``a`` and ``b``.

    The value is a float between 0 and 1, where 1.0 means the two
    sequences are identical.
    '''
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

import itertools
def findSimilar(sentences, cutoff):
    ''' Finds pairs of sentences whose similarity falls inside a band.

    Args:
        sentences: sequence of strings; every unordered pair is compared.
        cutoff: indexable pair ``[low, high]``; a pair is kept when
            ``low < similarity <= high``.

    Returns:
        A list of ``(similarity, sentence1, sentence2)`` tuples in the
        order ``itertools.combinations`` produces the pairs.
    '''
    # Lazily score each pair exactly once, then filter on the band.
    scored_pairs = ((similar(left, right), left, right)
                    for left, right in itertools.combinations(sentences, 2))
    return [entry for entry in scored_pairs
            if entry[0] > cutoff[0] and entry[0] <= cutoff[1]]

In [32]:
# Similarity bands as (low, high]; together they tile the range (0.75, 1].
SIMILARITY_BANDS = [(0.95, 1), (0.9, 0.95), (0.85, 0.9), (0.8, 0.85), (0.75, 0.8)]


def _similar_by_band(sentence_list):
    ''' Returns one list of similar pairs per entry of SIMILARITY_BANDS.

    All pairwise ratios are computed in a single findSimilar pass over the
    whole (0.75, 1] range and then split into bands, instead of repeating
    the full pairwise comparison once per band. Each band keeps the pairs
    in the order findSimilar returned them.
    '''
    scored_pairs = findSimilar(sentence_list, [0.75, 1])
    return [[pair for pair in scored_pairs if pair[0] > low and pair[0] <= high]
            for low, high in SIMILARITY_BANDS]


# The lists are passed in directly, so the global ``sentences`` used by the
# clustering cells is no longer rebound here.
(issue_similar_95_100,
 issue_similar_90_95,
 issue_similar_85_90,
 issue_similar_80_85,
 issue_similar_75_80) = _similar_by_band(sentences_issue)

(argument_similar_95_100,
 argument_similar_90_95,
 argument_similar_85_90,
 argument_similar_80_85,
 argument_similar_75_80) = _similar_by_band(sentences_argument)

In [44]:
# Report the size of each similarity band. Every list holds the
# (similarity, sentence1, sentence2) tuples returned by findSimilar, so each
# number printed below is a count of sentence pairs.
print(f'There are {len(issue_similar_95_100)} issue prompts that are 95%+ similar to eachother')
print(f'There are {len(issue_similar_90_95)} issue prompts that are 90-95% similar to eachother')
print(f'There are {len(issue_similar_85_90)} issue prompts that are 85-90% similar to eachother')
print(f'There are {len(issue_similar_80_85)} issue prompts that are 80-85% similar to eachother')
print(f'There are {len(issue_similar_75_80)} issue prompts that are 75-80% similar to eachother')
print("\n")
print(f'There are {len(argument_similar_95_100)} argument prompts that are 95%+ similar to eachother')
print(f'There are {len(argument_similar_90_95)} argument prompts that are 90-95% similar to eachother')
print(f'There are {len(argument_similar_85_90)} argument prompts that are 85-90% similar to eachother')
print(f'There are {len(argument_similar_80_85)} argument prompts that are 80-85% similar to eachother')
print(f'There are {len(argument_similar_75_80)} argument prompts that are 75-80% similar to eachother')

There are 24 issue prompts that are 95%+ similar to eachother
There are 2 issue prompts that are 90-95% similar to eachother
There are 2 issue prompts that are 85-90% similar to eachother
There are 7 issue prompts that are 80-85% similar to eachother
There are 7 issue prompts that are 75-80% similar to eachother


There are 33 argument prompts that are 95%+ similar to eachother
There are 4 argument prompts that are 90-95% similar to eachother
There are 5 argument prompts that are 85-90% similar to eachother
There are 7 argument prompts that are 80-85% similar to eachother
There are 8 argument prompts that are 75-80% similar to eachother


In [45]:
def _group_similar(sentence_list, threshold=0.8):
    ''' Reorders sentences so that near-duplicates sit next to each other.

    Repeatedly takes the first sentence not yet placed as a "root", emits
    it, then emits every remaining sentence whose similarity to the root
    is at least ``threshold``.

    Args:
        sentence_list: sequence of strings to reorder (left unmodified).
        threshold: minimum ``similar()`` ratio for joining a root's group.

    Returns:
        A new list holding every input sentence exactly once per
        occurrence, grouped as described.
    '''
    remaining = list(sentence_list)
    grouped = []
    while remaining:
        root = remaining.pop(0)
        grouped.append(root)
        # Build the leftover list separately instead of removing items from
        # the list being iterated: removing in place skipped the element
        # after every match, which split groups of similar sentences.
        leftover = []
        for candidate in remaining:
            if similar(root, candidate) >= threshold:
                grouped.append(candidate)
            else:
                leftover.append(candidate)
        remaining = leftover
    return grouped


sorted_sentences_issue = _group_similar(sentences_issue)
sorted_sentences_argument = _group_similar(sentences_argument)
                


In [46]:
# Sanity check: print how many sentences each reordered list contains.
print(len(sorted_sentences_issue))
print(len(sorted_sentences_argument))

149
175


In [48]:
# Write each reordered list to its own text file; every sentence is
# followed by the "\n \n" separator.
for out_path, ordered_sentences in (
        ("sorted_prompts_issue.txt", sorted_sentences_issue),
        ("sorted_prompts_argument.txt", sorted_sentences_argument)):
    with open(out_path, "w") as f:
        f.writelines(entry + "\n \n" for entry in ordered_sentences)