# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP model to use for topic extraction: RAKE (rapid automatic keyword extraction)
2. Preprocess the data
3. Get results
4. Documentation

In [1]:
# import dependencies
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
import gensim  # necessary?
from gensim import corpora
import spacy
# ! pip install rake-nltk
from rake_nltk import Rake
import nltk
import warnings
warnings.filterwarnings('ignore')

### Data Pre-processing

In [2]:
# Large dataset model:
# from sklearn.model_selection import train_test_split

# # read in data
# df_1 = pd.read_csv('Data/articles1.csv')['content'].to_frame()  # only get content-column
# df_2 = pd.read_csv('Data/articles2.csv')['content'].to_frame()
# df_3 = pd.read_csv('Data/articles3.csv')['content'].to_frame()
# df = df_1.append(df_2).append(df_3)
# print('\nData set, shape:', df.shape)
# print(df.head(5))

# # check for missing data
# print(df.isna().sum())  # shows no null values in content-column

# # split data into ~67% training and ~33% testing
# train, test = train_test_split(df, test_size=0.33, random_state=1)
# print('\nTraining data set, shape:', train.shape)
# print('Testing data set, shape:', test.shape)

# # reset indices
# train = train.reset_index(drop=True)
# test = test.reset_index(drop=True)
# print(train.head(5))
# print('\n', test.head(5))

# Small dataset model:
# read in data
df = pd.read_csv('Data/articles1.csv')['content'].to_frame()
df.drop(df.index[0:49999],0,inplace=True)  # drop a few rows to make dataset smaller and more manageable
print('\nData set, shape:', df.shape)
# check for missing data
print(df.isna().sum())  # shows no null values in content-column

pd.set_option('display.max_colwidth', None)
print(df.head(1))

num_articles = df.shape[0]


Data set, shape: (1, 1)
content    0
dtype: int64
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

### Pre-process the Data

In [3]:
# load spacy nlp pre-processing pipeline to use for lemmatization
nlp = spacy.load('en_core_web_sm')

In [4]:
# create filter for gensim nlp pre-processing pipeline to include all steps except stemmatization
CUSTOM_FILTERS = [lambda x: x.lower(),  # lowercase
                  strip_tags,
                  strip_punctuation,  # replace punctuation with whitespace
                  strip_multiple_whitespaces,  # remove repeating whitespaces
                  strip_numeric,  # remove numbers
                  remove_stopwords,  # remove stopwords
                  strip_short,  # remove words with less than 3 characters
                  #  stem_text  # return porter-stemmed text,
                 ]

In [5]:
sample = "Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?"
print(sample)

Hello, my name is something you'll never guess, Kim! ...But I wrote my signature. Right! My parents called me this, what can I say?


In [6]:
# test sample string without filtered pipeline, i.e., with stemmatizer
test_a = preprocess_string(sample)
print(test_a)

['hello', 'guess', 'kim', 'wrote', 'signatur', 'right', 'parent', 'call']


In [7]:
# test sample string with filtered pipeline and lemmatizer
test_b = ' '.join(preprocess_string(sample, CUSTOM_FILTERS))  # pre-process without stemmatizing
lem = [token.lemma_ for token in nlp(test_b)]  # lemmatize
print(lem)

['hello', 'guess', 'kim', 'write', 'signature', 'right', 'parent', 'call']


In [8]:
def preprocess_articles(x):
    prep = ' '.join(preprocess_string(x, CUSTOM_FILTERS))
    return [token.lemma_ for token in nlp(prep)]

In [9]:
# apply final pipeline to all data
df['preprocessed'] = df['content'].apply(preprocess_articles)

In [10]:
# print head of preprocessed df
print(df['preprocessed'].head(1))

49999    [force, gravity, describe, number, metaphor, it, ’s, glue, hold, solar, system, galaxy, anchor, keep, ground, slingshot, send, spacecraft, deep, solar, case, gravity, thief, astronomer, center, astrophysic, say, wednesday, discover, distant, star, milky, way, galaxy, actually, come, sagittarius, dwarf, galaxy, dozen, small, galaxy, surround, sagittarius, pass, milky, way, ’, gravitational, tide, pull, galaxy, star, interloper, farthest, know, star, milky, way, locate, stretch, star, outside, galaxy, ’s, spiral, disk, astronomer, determine, origin, simulation, simulate, movement, sagittarius, milky, way, galaxy, course, billion, year, test, show, time, sagittarius, lose, star, star, scientist, whisk, away, milky, way, settle, stream, galaxy, ’s, edge, observe, position, velocity, ...]
Name: preprocessed, dtype: object


In [11]:
# nltk.download('stopwords')
r = Rake()
r.extract_keywords_from_text(df['content'].iloc[0])
r.get_ranked_phrases_with_scores()

[(21.5, 'considered “ island universes ,”'),
 (16.0, 'milky way galaxy actually came'),
 (14.666666666666666, 'origin using computer simulations'),
 (14.181818181818182, 'stolen stars also suggests'),
 (9.0, 'sends spacecraft deeper'),
 (9.0, 'cosmic theft may'),
 (9.0, 'astrophysics said wednesday'),
 (9.0, '8 billion years'),
 (8.666666666666666, 'elements like iron'),
 (8.5, 'holds solar systems'),
 (8.4, 'smaller galaxies surrounding'),
 (8.4, 'closely orbiting galaxies'),
 (8.333333333333334, 'another satellite galaxy'),
 (8.181818181818182, 'farthest known stars'),
 (8.066666666666666, 'milky way galaxies'),
 (8.0, 'small magellanic cloud'),
 (8.0, 'large magellanic cloud'),
 (8.0, 'gravitational tides pulled'),
 (7.848484848484848, 'lot like stars'),
 (7.333333333333334, 'milky way ’'),
 (7.133333333333333, 'sagittarius dwarf galaxy'),
 (5.666666666666667, 'milky way'),
 (5.666666666666667, 'milky way'),
 (5.666666666666667, 'milky way'),
 (5.5, 'island ”'),
 (5.0, 'gravitationa