In [1]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [2]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
  if token.is_stop:
    print(token)

We
just
our
the
part
is


In [6]:
def preprocess(text):
  doc = nlp(text)

  no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
  return " ".join(no_stop_words)


In [7]:
preprocess("We just opened our wings, the flying part is coming soon")

['opened', 'wings', 'flying', 'coming', 'soon']

In [8]:
preprocess("Don't hate the play, hate the game.")

['hate', 'play', 'hate', 'game']

In [9]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [10]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"

In [11]:
%cd "/content/drive/MyDrive/kaggle"

/content/drive/MyDrive/kaggle


In [12]:
!kaggle datasets download -d jbencina/department-of-justice-20092018-press-releases

Downloading department-of-justice-20092018-press-releases.zip to /content/drive/MyDrive/kaggle
  0% 0.00/13.8M [00:00<?, ?B/s] 29% 4.00M/13.8M [00:00<00:00, 40.7MB/s] 65% 9.00M/13.8M [00:00<00:00, 44.6MB/s]
100% 13.8M/13.8M [00:00<00:00, 53.7MB/s]


In [13]:
!unzip \*.zip && rm *.zip

Archive:  department-of-justice-20092018-press-releases.zip
  inflating: combined.json           


In [14]:
import pandas as pd

df = pd.read_json("/content/drive/MyDrive/kaggle/combined.json", lines=True)

df.shape

(13087, 6)

In [15]:
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [16]:
type(df.topics[0])

list

In [17]:
df.info

<bound method DataFrame.info of             id                                              title  \
0         None       Convicted Bomb Plotter Sentenced to 30 Years   
1      12-919   $1 Million in Restitution Payments Announced t...   
2      11-1002  $1 Million Settlement Reached for Natural Reso...   
3       10-015  10 Las Vegas Men Indicted \r\nfor Falsifying V...   
4       18-898  $100 Million Settlement Will Speed Cleanup Wor...   
...        ...                                                ...   
13082   16-735  Yuengling to Upgrade Environmental Measures to...   
13083   10-473  Zarein Ahmedzay Pleads Guilty to Terror Violat...   
13084   17-045  Zimmer Biomet Holdings Inc. Agrees to Pay $17....   
13085   17-252  ZTE Corporation Agrees to Plead Guilty and Pay...   
13086   17-304  ZTE Corporation Pleads Guilty for  Violating U...   

                                                contents  \
0      PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...   
1        WASHINGTON

In [18]:
df.describe()

Unnamed: 0,id,title,contents,date,topics,components
count,12810,13087,13087,13087,13087,13087
unique,12672,12887,13080,2400,253,810
top,13-526,Northern California Real Estate Investor Agree...,"WASHINGTON – ING Bank N.V., a financial inst...",2018-04-13T00:00:00-04:00,[],[Criminal Division]
freq,3,8,2,20,8399,2680


In [25]:
df = df[ df["topics"].str.len()!=0 ]
df.head()

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [22]:
df.shape

(4688, 6)

In [23]:
df = df.head(100)
df.shape

(100, 6)

In [27]:
len(df['contents'].iloc[4])

5504

In [29]:
df["contents_new"] = df["contents"].apply(preprocess)
df.head()

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],"[U.S., Department, Justice, U.S., Environmenta..."
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],"[131, count, criminal, indictment, unsealed, t..."
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...","[United, States, Attorney, Office, Middle, Dis..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],"[21st, Century, Oncology, LLC, agreed, pay, $,..."
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]","[21st, Century, Oncology, Inc., certain, subsi..."


In [30]:
len(df["contents"].iloc[4])

5504

In [32]:
len(df["contents_new"].iloc[4])

530