In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk import wordnet, pos_tag, WordNetLemmatizer

In [2]:
from multiprocess import Pool

In [3]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
sw_eng = set(stopwords.words('english'))

In [5]:
df = pd.read_csv('articles3.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,103459,151908,Alton Sterling’s son: ’Everyone needs to prote...,Guardian,Jessica Glenza,2016-07-13,2016.0,7.0,https://www.theguardian.com/us-news/2016/jul/1...,The son of a Louisiana man whose father was sh...
1,103460,151909,Shakespeare’s first four folios sell at auctio...,Guardian,,2016-05-25,2016.0,5.0,https://www.theguardian.com/culture/2016/may/2...,Copies of William Shakespeare’s first four boo...
2,103461,151910,My grandmother’s death saved me from a life of...,Guardian,Robert Pendry,2016-10-31,2016.0,10.0,https://www.theguardian.com/commentisfree/2016...,"Debt: $20, 000, Source: College, credit cards,..."
3,103462,151911,I feared my life lacked meaning. Cancer pushed...,Guardian,Bradford Frost,2016-11-26,2016.0,11.0,https://www.theguardian.com/commentisfree/2016...,"It was late. I was drunk, nearing my 35th birt..."
4,103463,151912,Texas man serving life sentence innocent of do...,Guardian,,2016-08-20,2016.0,8.0,https://www.theguardian.com/us-news/2016/aug/2...,A central Texas man serving a life sentence fo...


In [7]:
df.isna().sum()

Unnamed: 0       0
id               0
title            1
publication      0
author         972
date            15
year            15
month           15
url              0
content          0
dtype: int64

In [8]:
df = df.dropna()

In [9]:
def f(f):
  f = f.lower()
  f = re.sub('[^a-z]', ' ', f)
  f = re.sub('\s+', ' ', f)
  f = ' '.join([word for word in f.split() if not word in sw_eng])
  return f

In [10]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [11]:
def pf(a):
    a = f(a)
    a = my_lemmatizer(a)
    return a

In [12]:
def norm(df):
    with Pool(8) as pool:
        a = list(pool.imap(pf, df))
    return a

In [13]:
df = df.drop(columns=['Unnamed: 0', 'date', 'url', 'id', 'year', 'month', 'author', 'publication'])

In [14]:
df.head()

Unnamed: 0,title,content
0,Alton Sterling’s son: ’Everyone needs to prote...,The son of a Louisiana man whose father was sh...
2,My grandmother’s death saved me from a life of...,"Debt: $20, 000, Source: College, credit cards,..."
3,I feared my life lacked meaning. Cancer pushed...,"It was late. I was drunk, nearing my 35th birt..."
5,My dad’s Reagan protests inspire me to stand u...,I have been battling depression and sleeplessn...
6,Flatmates of gay Syrian refugee beheaded in Tu...,Three flatmates of a gay Syrian refugee behead...


In [15]:
df['title1'] = norm(df['title'])

In [16]:
df['content1'] = norm(df['content'])

In [17]:
df.head()

Unnamed: 0,title,content,title1,content1
0,Alton Sterling’s son: ’Everyone needs to prote...,The son of a Louisiana man whose father was sh...,alton sterling son everyone need protest right...,son louisiana man whose father shot kill range...
2,My grandmother’s death saved me from a life of...,"Debt: $20, 000, Source: College, credit cards,...",grandmother death save life debt,debt source college credit card estimate time ...
3,I feared my life lacked meaning. Cancer pushed...,"It was late. I was drunk, nearing my 35th birt...",fear life lack meaning cancer push find,late drunk near th birthday past may alone dan...
5,My dad’s Reagan protests inspire me to stand u...,I have been battling depression and sleeplessn...,dad reagan protest inspire stand donald trump,battle depression sleeplessness think fight do...
6,Flatmates of gay Syrian refugee beheaded in Tu...,Three flatmates of a gay Syrian refugee behead...,flatmate gay syrian refugee behead turkey fear...,three flatmate gay syrian refugee behead homop...


In [18]:
df.to_csv('info_search.csv', index = False)