In [None]:
# when should we use a list or a dictionary?


In [1]:
# so: how many tweets written by Donald Trump (and only by him!) contain the word "great"?

import json

with open('../datasets/trump.json') as f:
    tweets = json.load(f)

print (len(tweets))

written_by_trump = 0
with_great = 0
for tweet in tweets:
    if "Android" in tweet["source"]:
        text = tweet["text"]
        written_by_trump += 1
        if "great" in text.lower():
            with_great += 1
        
print (with_great, with_great/len(tweets),with_great/written_by_trump)

35018
2236 0.06385287566394426 0.15372980405637676


In [7]:
# we need a few libraries
#os is needed to loop over files in a folder
# codecs is for encoding a file
#BeautifulSoup is needed for parsing the html of a scraped page

import codecs,os
from bs4 import BeautifulSoup

# we prepare an empty list
articles = []

# we loop over a folder
for filename in os.listdir("../datasets/Articles/"):
    # we check if the file is a txt file
    if ".txt" in filename:
        print (filename)
        # we open and read the file
        doc = open("../datasets/Articles/"+filename,"r")
        doc = doc.read()
        
        # look at the original HTML - we remove the internet archive toolbar
        html_page = str(doc).split("<!-- END WAYBACK TOOLBAR INSERT -->")[1]
            
        # we parse the page
        html_page = BeautifulSoup(html_page, "lxml")
        
        
        # we open a new file in writing mode (using codecs) / we need to create the "CleanedArticles" folder if it's not there
        output = codecs.open("../datasets/CleanedArticles/"+filename,"w","utf-8")
        
        # we define a new list, called text
        text = []

        # simply take all the paragraphs - we search for the elements "p"
        for para in html_page.find_all('p'):
            # we remove breaklines, tabs etc
            para = para.text.replace("\n"," ").replace("\t"," ").replace("\r"," ")
            text.append(para)
        
        # we put all paragraphs in a single text
        text = " ".join(text)
        # we write the text to the output file
        output.write(text)
        # we close the output file
        output.close()
        # we add the text to a list of articles
        articles.append(text)

0.txt
1.txt
10.txt
11.txt
12.txt
13.txt
15.txt
16.txt
18.txt
19.txt
2.txt
3.txt
4.txt
5.txt
6.txt
7.txt
8.txt
9.txt


In [8]:
# let's take the first article of the list of articles
article = articles[0]
print (article)

In ways that were once unimaginable, President Trump has discarded the conventions and norms established by his predecessors. Will that change the institution permanently? By PETER BAKERDEC. 31, 2017  WASHINGTON — When President Trump meets with aides to discuss policy or prepare for a speech, he may ask about the pros and cons of a new proposal. He may inquire about its possible effect. He may explore the best way to frame his case. But there is one thing he almost never does. “He very seldom asks how other presidents did this,” said John F. Kelly, the White House chief of staff. Mr. Trump is the 45th president of the United States, but he has spent much of his first year in office defying the conventions and norms established by the previous 44, and transforming the presidency in ways that were once unimaginable. Under Mr. Trump, it has become a blunt instrument to advance personal, policy and political goals. He has revolutionized the way presidents deal with the world beyond 1600 P

In [None]:
# we need nltk - one of the most used text processing library in python

import nltk # --> documentation: http://www.nltk.org/

# you will also need this
nltk.download('punkt')

In [11]:
# we start by dividing the text into sentences
sentences = nltk.sent_tokenize(article) # <-- documentation for this command: http://www.nltk.org/_modules/nltk/tokenize.html

# let's print all the sentences, so we can exam the quality of the output
for sentence in sentences:
    print (sentence)
    print (" ")


In ways that were once unimaginable, President Trump has discarded the conventions and norms established by his predecessors.
 
Will that change the institution permanently?
 
By PETER BAKERDEC.
 
31, 2017  WASHINGTON — When President Trump meets with aides to discuss policy or prepare for a speech, he may ask about the pros and cons of a new proposal.
 
He may inquire about its possible effect.
 
He may explore the best way to frame his case.
 
But there is one thing he almost never does.
 
“He very seldom asks how other presidents did this,” said John F. Kelly, the White House chief of staff.
 
Mr. Trump is the 45th president of the United States, but he has spent much of his first year in office defying the conventions and norms established by the previous 44, and transforming the presidency in ways that were once unimaginable.
 
Under Mr. Trump, it has become a blunt instrument to advance personal, policy and political goals.
 
He has revolutionized the way presidents deal with the

In [14]:
# let's consider a single sentence - how do we do that? 

sentence = sentences[0]
print (sentence)

In ways that were once unimaginable, President Trump has discarded the conventions and norms established by his predecessors.


In [15]:
# let's divide the sentence in tokens (aka single words)
tokenized_sentence = nltk.word_tokenize(sentence)

print (tokenized_sentence)

['In', 'ways', 'that', 'were', 'once', 'unimaginable', ',', 'President', 'Trump', 'has', 'discarded', 'the', 'conventions', 'and', 'norms', 'established', 'by', 'his', 'predecessors', '.']


In [38]:
# lower-casing the sentence
# without_capital_letters = [word.lower() for word in tokenized_sentence]

# print (type(without_capital_letters))

# homework: write a for-loop for doing the same thing

without_capital_letters = []
# print (type(without_capital_letters))

for sentence in sentences:
    tokenized_sentence = nltk.word_tokenize(sentence)
    without_capital_letters = [word.lower() for word in tokenized_sentence]
    # print(tokenized_sentence)
    print(without_capital_letters)
    break

['in', 'ways', 'that', 'were', 'once', 'unimaginable', ',', 'president', 'trump', 'has', 'discarded', 'the', 'conventions', 'and', 'norms', 'established', 'by', 'his', 'predecessors', '.']


In [35]:
# removing stopwords

# homework: download stopwords <- google it out
# import nltk
# nltk.download()

from nltk.corpus import stopwords

stop = stopwords.words('english')

# what is "stop" ?

print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [39]:
without_stop_words = [word for word in without_capital_letters if word not in stop]

print (without_stop_words)

['ways', 'unimaginable', ',', 'president', 'trump', 'discarded', 'conventions', 'norms', 'established', 'predecessors', '.']


In [51]:
# homework: how do we exclude punctuation? and numbers?

without_punc = words.translate(None, string.punctuation) for words in without_stop_words

for words in without_stop_words:
    words.translate(None, string.punctuation)

# without_punc = without_stop_words.punctuation


SyntaxError: invalid syntax (<ipython-input-51-02e27ac1f87e>, line 3)