In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score

import nltk
from nltk import *
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import sys
import codecs
import string
import random
import re

import requests
import urllib
import bs4
from bs4 import BeautifulSoup
from readability.readability import Document # https://github.com/buriy/python-readability. Tried Goose, Newspaper (python libraries on Github). Bad results.
from http.cookiejar import CookieJar # 

# Scraping the Full Article Texts
First, populate the "article" column with the full text of the article at each URL (these texts will be cleaned up later).<br> <b>NOTE:</b> The scraping cell below takes a long time to run. To skip it, load the saved results directly with the commented-out `pd.read_csv` lines in the checkpoint cell further down.

In [2]:
# Browser-like User-Agent header sent with every search request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

# First results page (start=0); the remaining pages differ only in the `start` offset.
urls = ["https://www.google.com/search?q=Alice+Corp.+v.+CLS+Bank&tbm=nws&ei=naW2W-jGDLL85gKO9LbADg&start=0&sa=N&biw=1531&bih=814&dpr=2"]

# Article links collected from all result pages, in page order.
ahrefs = []

# Build the pages at offsets 10, 20, ..., 90 by rewriting the existing `start`
# parameter rather than appending a second, conflicting `&start=` to the URL.
for i in range(10, 100, 10):
    urls.append(urls[0].replace('&start=0', '&start=' + str(i)))

for url in urls:
    # The timeout keeps a single stalled connection from hanging the whole cell.
    req = requests.get(url, headers=headers, timeout=30)
    soup = bs4.BeautifulSoup(req.text, "html.parser")

    # No need to process the surrounding div container: go straight to the
    # headline <a> tags (class "lLrAF") and read their "href" attribute.
    ahrefs.extend(a["href"] for a in soup.find_all("a", class_="lLrAF"))

print(ahrefs[0:10])
print(len(ahrefs))

['http://www.ipwatchdog.com/2018/09/08/time-retire-benson/id=100812/', 'http://www.ipwatchdog.com/2018/09/16/smartflash-files-petition-writ-supreme-court-challenge-ptab-appointments-clause/id=101366/', 'https://www.lexology.com/library/detail.aspx?g=21baf439-98d9-437b-b2e6-ba3d5f56c634', 'http://www.jdsupra.com/legalnews/gust-inc-v-alphacap-ventures-llc-fed-86330/', 'http://www.ipwatchdog.com/2018/09/17/capitol-hill-roundup/id=101431/', 'https://www.law360.com/articles/1081603/fed-circ-upholds-ax-of-online-security-ip-in-usaa-case', 'http://www.capitalpress.com/Business/20180912/farm-victim-of-patent-troll-lawsuit', 'https://www.ipwatchdog.com/2018/09/18/alice-age-four-grow-up/id=101447/', 'http://knowledge.wharton.upenn.edu/article/the-limitations-of-lean-startup-principles/', 'http://www.jdsupra.com/legalnews/claiming-artificial-intelligence-ai-71727/']
100


In [4]:
# Column layout of the articles table. Only title, articleUrl and article are
# filled in here; the remaining columns start out with placeholder values.
ARTICLE_COLUMNS = ['title', 'articleUrl', 'article', 'keyword_score', 'author', 'date',
                   'source', 'sourceUrl', 'readability_score', 'sentiment_polarity']

# Collect one dict per successfully scraped article and build the frame once at
# the end: DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.
records = []

for i, url in enumerate(ahrefs):
    try:
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7')]
        # Close the response as soon as it is read; the timeout stops one
        # unresponsive site from blocking the rest of the run.
        with opener.open(url, timeout=30) as response:
            html = response.read()
        soup = BeautifulSoup(html, "lxml")
        title = soup.title.string
        # readability's Document extracts the main article markup; a second
        # BeautifulSoup pass strips the tags and leaves plain text.
        readable_article = Document(html).summary()
        soup = BeautifulSoup(readable_article, "lxml")
        text = soup.get_text()
    except Exception as exc:
        # Best effort: report the failing link (and why) and move on. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt stop the cell.
        print('error', str(i) +':', url, '-', repr(exc))
        continue
    records.append({'title': title, 'articleUrl': url, 'article': text, 'keyword_score':0, 'author':'', 'date':'',
                    'source':'', 'sourceUrl':'', 'readability_score':0.0,
                    'sentiment_polarity':0.0})

# Passing `columns` fixes the column order and still yields the expected
# (empty) frame when every download failed.
df = pd.DataFrame(records, columns=ARTICLE_COLUMNS)

df.head()

error 78: https://www.i-programmer.info/news/99-professional/10935-alice-under-threat.html


Unnamed: 0,title,articleUrl,article,keyword_score,author,date,source,sourceUrl,readability_score,sentiment_polarity
0,"At Age 46, it’s Time to Retire Benson - IPWatc...",http://www.ipwatchdog.com/2018/09/08/time-reti...,\nThere Are Two Types of § 101 Rejections\nThe...,0,,,,,0.0,0.0
1,Smartflash Petitions Supreme Court to Challeng...,http://www.ipwatchdog.com/2018/09/16/smartflas...,"\nIn early August, patent owner Smartflash fil...",0,,,,,0.0,0.0
2,Berkheimer Increases Applicants’ Ability to Ov...,https://www.lexology.com/library/detail.aspx?g...,\rWe use cookies to customise content for your...,0,,,,,0.0,0.0
3,"Gust, Inc. v. AlphaCap Ventures, LLC (Fed. Cir...",http://www.jdsupra.com/legalnews/gust-inc-v-al...,"\n\n\n\tAlmost two years ago, we covered a dis...",0,,,,,0.0,0.0
4,Capitol Hill Roundup - IPWatchdog.com | Patent...,http://www.ipwatchdog.com/2018/09/17/capitol-h...,"\nThis week in Capitol Hill hearings, it will ...",0,,,,,0.0,0.0


# Cleaning
Web scraping is very messy, so we have to clean the "article" column. We will keep the '\n' characters to be able to split the text into paragraphs (without the use of NLTK).

In [8]:
# Scraped text contains non-breaking spaces (U+00A0); swap them for ordinary spaces.
nbsp_to_space = {'\xa0': ' '}
df = df.replace(nbsp_to_space, regex=True)



### Keyword Score

In [10]:
special_keywords = ["startup", "venture", "alpha", "beta", "test", "launch", "release"]

# keyword_score = number of distinct keywords that occur in the article text.
# Matching is a case-sensitive substring test, so each keyword counts at most once.
for row in range(len(df)):
    article_text = df['article'][row]
    matched = [kw for kw in special_keywords if kw in article_text]
    df.loc[row, 'keyword_score'] = len(matched)

df.head()

Unnamed: 0,title,articleUrl,article,keyword_score,author,date,source,sourceUrl,readability_score,sentiment_polarity
0,"At Age 46, it’s Time to Retire Benson - IPWatc...",http://www.ipwatchdog.com/2018/09/08/time-reti...,\nThere Are Two Types of § 101 Rejections\nThe...,1,,,,,0.0,0.0
1,Smartflash Petitions Supreme Court to Challeng...,http://www.ipwatchdog.com/2018/09/16/smartflas...,"\nIn early August, patent owner Smartflash fil...",1,,,,,0.0,0.0
2,Berkheimer Increases Applicants’ Ability to Ov...,https://www.lexology.com/library/detail.aspx?g...,\rWe use cookies to customise content for your...,0,,,,,0.0,0.0
3,"Gust, Inc. v. AlphaCap Ventures, LLC (Fed. Cir...",http://www.jdsupra.com/legalnews/gust-inc-v-al...,"\n\n\n\tAlmost two years ago, we covered a dis...",1,,,,,0.0,0.0
4,Capitol Hill Roundup - IPWatchdog.com | Patent...,http://www.ipwatchdog.com/2018/09/17/capitol-h...,"\nThis week in Capitol Hill hearings, it will ...",0,,,,,0.0,0.0


In [17]:
# Checkpoint: write the current frame to CSV (UTF-8, without the index column).
df.to_csv('patent_decision_news.csv', index=False, encoding='utf-8')
# To resume from the checkpoint instead of re-running the cells above, uncomment:
# df = pd.read_csv('patent_decision_news.csv')
# df.head()

In [18]:
# Show the current contents of the frame as this cell's output.
df

Unnamed: 0,title,articleUrl,article,keyword_score,author,date,source,sourceUrl,readability_score,sentiment_polarity
0,"At Age 46, it’s Time to Retire Benson - IPWatc...",http://www.ipwatchdog.com/2018/09/08/time-reti...,\nThere Are Two Types of § 101 Rejections\nThe...,1,,,,,0.0,0.0
1,Smartflash Petitions Supreme Court to Challeng...,http://www.ipwatchdog.com/2018/09/16/smartflas...,"\nIn early August, patent owner Smartflash fil...",1,,,,,0.0,0.0
2,Berkheimer Increases Applicants’ Ability to Ov...,https://www.lexology.com/library/detail.aspx?g...,\rWe use cookies to customise content for your...,0,,,,,0.0,0.0
3,"Gust, Inc. v. AlphaCap Ventures, LLC (Fed. Cir...",http://www.jdsupra.com/legalnews/gust-inc-v-al...,"\n\n\n\tAlmost two years ago, we covered a dis...",1,,,,,0.0,0.0
4,Capitol Hill Roundup - IPWatchdog.com | Patent...,http://www.ipwatchdog.com/2018/09/17/capitol-h...,"\nThis week in Capitol Hill hearings, it will ...",0,,,,,0.0,0.0
5,Fed. Circ. Upholds Ax Of Online Security IP In...,https://www.law360.com/articles/1081603/fed-ci...,\n\n\n By Kevin Penton\n\n\n\n\t\t\t\t\t\tL...,0,,,,,0.0,0.0
6,Farm victim of ‘patent troll’ lawsuit - Busine...,http://www.capitalpress.com/Business/20180912/...,An Oregon farm that sells organic foods and ot...,0,,,,,0.0,0.0
7,Alice at Age Four: Time to Grow Up - IPWatchdo...,https://www.ipwatchdog.com/2018/09/18/alice-ag...,"\nTo be successful, patent practitioners must ...",2,,,,,0.0,0.0
8,The Limitations of Lean Startup Principles,http://knowledge.wharton.upenn.edu/article/the...,\nThe Lean Startup is a bestselling business b...,4,,,,,0.0,0.0
9,Claiming Artificial Intelligence: AI-related P...,http://www.jdsupra.com/legalnews/claiming-arti...,\n\n\n\tThe field of artificial intelligence (...,2,,,,,0.0,0.0


### Stopwords, stemming, and tokenization to clean the text even more
tokenize_and_stem: tokenizes the text and also stems each token.<br>
tokenize_only: only tokenizes the text (no stemming), so that stemmed tokens can later be mapped back to their full words.<br>

In [None]:
# Snowball stemmer for English, shared by the tokenizer helpers defined below.
stemmer = SnowballStemmer("english")
# NLTK's English stopword list.
# NOTE: this rebinds `stopwords` (imported at the top as the nltk.corpus.stopwords
# reader) to a plain list of words; from here on the name refers to that list.
stopwords = nltk.corpus.stopwords.words('english')

# Tokenizer + stemmer: returns the list of stems of the text's word tokens.
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of the tokens that are kept.

    A token is kept only if it is longer than one character and consists
    solely of alphabetic characters; everything else (numbers, punctuation,
    single letters such as "a") is dropped. Each kept token is passed through
    the module-level ``stemmer``.

    Returns a list of stems, in the order the tokens appear in ``text``.
    """
    # Split into sentences first, then into words, so that punctuation ends up
    # as separate tokens.
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    kept = [w for w in words if len(w) > 1 and w.isalpha()]
    return [stemmer.stem(w) for w in kept]

def tokenize_only(text):
    """Tokenize ``text`` into lower-cased words without stemming them.

    Applies the same filter as the stemming tokenizer: a token is kept only if
    it is longer than one character and consists solely of alphabetic
    characters, so numbers, punctuation and single letters are dropped.

    Returns a list of lower-cased tokens, in the order they appear in ``text``.
    """
    # Split into sentences first, then into words, so that punctuation ends up
    # as separate tokens; lower-case each word as it is collected.
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    return [tok for tok in lowered if len(tok) > 1 and tok.isalpha()]