# NLP Project
## Topic extraction
The purpose of this project is to extract topics from news articles.

### Step-by-step Process
1. Find a suitable NLP technique for topic extraction: named-entity recognition (NER)
2. Preprocess the data
3. Get results
4. Documentation

In [16]:
# import dependencies
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter

### Data Pre-processing

In [17]:
# read in data: only the 'content' column is used, so the other columns are not parsed
df = pd.read_csv('Data/articles1.csv', usecols=['content'])
print('\nData set, shape:', df.shape)
print(df.head(1))


Data set, shape: (50000, 1)
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [18]:
# check for missing data: count the null entries per column
null_counts = df.isna().sum()
print(null_counts)  # shows no null values in content-column

content    0
dtype: int64


### Loading the Pipeline

In [19]:
# load the spaCy pipeline once; the resulting `nlp` object is called on each article text
nlp = spacy.load(name="en_core_web_sm")

In [20]:
# entity labels that count as keywords
# (a wider, earlier version of this list also contained 'PERSON' and 'NORP')
useful_entities = [
    'ORG',
    'GPE',
    'LOC',
    'PRODUCT',
    'EVENT',
    'WORK_OF_ART',
]

# remove the column-width limit so long cell values are displayed in full
pd.set_option('display.max_colwidth', None)

In [21]:
# define function to apply to each article: gets a list of the most common keywords
# (3 by default) based on the types of entities defined in useful_entities above
def add_keywords(x, top_n=3):
    """Return the most frequent named-entity lemmas found in one article.

    Args:
        x: Text of a single article; it is passed to the ``nlp`` pipeline.
        top_n: Maximum number of keywords to return. Defaults to 3, which
            matches the previous hard-coded behaviour.

    Returns:
        A list of at most ``top_n`` entity lemmas, most frequent first.
        Only entities whose label is listed in ``useful_entities`` are
        counted; the list is empty when no such entity is found.
    """
    doc = nlp(x)  # run article through pipeline
    # count the lemma of every entity whose label is one of the wanted types
    lemma_counts = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ in useful_entities
    )
    return [lemma for lemma, _ in lemma_counts.most_common(top_n)]

### Testing and Running the Pipeline

In [22]:
# demonstration on the first article: print its keywords, then render the
# article text with the recognised entities highlighted
first_article = df['content'].iloc[0]
first_keywords = add_keywords(first_article)
print('Keywords:', first_keywords)
first_doc = nlp(first_article)
displacy.render(first_doc, style="ent", jupyter=True)

Keywords: ['House', 'Congress', 'Trump']


In [None]:
# compute the keywords of every article, then store them in a new 'keywords' column
keywords_per_article = df['content'].apply(add_keywords)
df['keywords'] = keywords_per_article

In [None]:
# write the df, including the new keywords column, to a csv file (row index left out)
output_path = 'Data/articles_keywords.csv'
df.to_csv(output_path, index=False)

# show the keywords of the first 10 articles
first_ten_keywords = df['keywords'].head(10)
print(first_ten_keywords)