# Import libraries

In [1]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.decomposition import NMF
import numpy as np

# Read in data

In [2]:
df = pd.read_csv('raw_data.csv',index_col=[0])

# Preprocessing Data

In [4]:
df_copy = df.copy()

In [5]:
#this function takes in text and removes the punctuation
def punctuation_removal(text):
    punctuationfree= "".join([i for i in text if i not in string.punctuation])
    return punctuationfree 

In [6]:
#turn all the letters to lowercase
df_copy.articles = df_copy.articles.str.lower()

In [7]:
df_copy.articles = df_copy.articles.map(lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', str(x)))

In [8]:
df_copy.articles = df_copy.articles.apply(lambda x: punctuation_removal(str(x)))

# Create Vectorizer

In [10]:
#initializes a count vectorizer
vectorizer = CountVectorizer(stop_words={'english'},max_df=.70,min_df=2,token_pattern=r'(?u)\b[A-Za-z]+\b')

In [11]:
#creating a function that tokenizes the text
def tokenize_text(articles):
    output = articles.split(' ')
    return output

In [12]:
df_copy['tokenized'] = df_copy['articles'].apply(lambda x: tokenize_text(x))

In [13]:
#initialize lemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
nltk.download('wordnet')

#this function lemmatizes each word and returns it into a list
def lemma(text):
    output = [lemmatizer.lemmatize(word) for word in text]
    return output


df_copy['lemmatized'] = df_copy['tokenized'].apply(lambda x: lemma(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lmoran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
#creates a string from a tokenized text
def creating_strings(text):
    empty = ''
    for i in text:
        empty = empty + ' '+ i
    return empty

In [17]:
df_copy['back_to_strings'] = df_copy['lemmatized'].apply(lambda x: creating_strings(x))

In [18]:
df_copy['back_to_strings'] = df_copy['back_to_strings'].str.lower()

In [19]:
df_copy['back_to_strings']

0        to the editor regarding daphne merkin’s revie...
1        along a dirt covered road deep in texas farm ...
2        with roe v  wade on thin ice  state legislatu...
3        phoenix — when the 99 day work stoppage in ma...
4         russia is teetering on the edge of a possibl...
                              ...                        
1023     johnny grier  who became the first black refe...
1024     sometimes lately  when he hasn’t been rehears...
1025     tijuana  mexico — the frontier shape this met...
1026     until recent event at the oscar  the film sea...
1027     lisette coly and anastasia damalas are at a c...
Name: back_to_strings, Length: 1028, dtype: object

In [20]:
X = vectorizer.fit_transform(df_copy.back_to_strings)
terms = vectorizer.get_feature_names()

In [21]:
nmf_10 = NMF(n_components=10)
doc_topic = nmf_10.fit_transform(X)

In [23]:
#function from Metis
def get_top_terms(topic, n_terms, nmf=nmf_10, terms=terms):
    # get the topic components (i.e., term weights)
    components = nmf.components_[topic, :]

    # get term indices, sorted (descending) by topic weights
    top_term_indices = components.argsort()[-n_terms:]
    
    # use the `terms` array to get the actual top terms
    top_terms = np.array(terms)[top_term_indices]
    
    return top_terms.tolist()

In [24]:
topics = ['-'.join(get_top_terms(i,10)) for i in range(10)]
topics

['political-white-election-biden-former-house-president-him-trump-mr',
 'u-where-me-my-mother-told-charo-woman-her-she',
 're-just-people-do-me-your-can-what-my-you',
 'c-player-season-n-no-first-tournament-team-point-game',
 'u-united-military-country-ukrainian-putin-war-ukraine-russian-russia',
 'map-election-bill-democrat-right-republican-law-court-abortion-state',
 'senator-school-case-justice-law-black-republican-court-jackson-judge',
 'group-first-because-family-work-she-company-her-people-m',
 'inflation-market-oil-rate-energy-will-percent-gas-price-company',
 'pandemic-school-york-many-vaccine-covid-health-people-dr-city']

In [25]:
get_top_terms(0, 10, nmf_10, terms)

['political',
 'white',
 'election',
 'biden',
 'former',
 'house',
 'president',
 'him',
 'trump',
 'mr']

# Test Different Values of max_df

In [26]:
vectorizer2 = CountVectorizer(stop_words={'english'},max_df=.8,min_df=2)

In [27]:
X = vectorizer.fit_transform(df_copy.back_to_strings)
terms2 = vectorizer.get_feature_names()

In [28]:
nmf_10_2= NMF(n_components=5)
doc_topic = nmf_10_2.fit_transform(X)

In [29]:
get_top_terms(0,10,nmf_10_2,terms2)

['them', 'could', 'dr', 'many', 'will', 'city', 'what', 'can', 'people', 'you']

In [30]:
topics = ['-'.join(get_top_terms(i,10)) for i in range(10)]
topics

['political-white-election-biden-former-house-president-him-trump-mr',
 'u-where-me-my-mother-told-charo-woman-her-she',
 're-just-people-do-me-your-can-what-my-you',
 'c-player-season-n-no-first-tournament-team-point-game',
 'u-united-military-country-ukrainian-putin-war-ukraine-russian-russia',
 'map-election-bill-democrat-right-republican-law-court-abortion-state',
 'senator-school-case-justice-law-black-republican-court-jackson-judge',
 'group-first-because-family-work-she-company-her-people-m',
 'inflation-market-oil-rate-energy-will-percent-gas-price-company',
 'pandemic-school-york-many-vaccine-covid-health-people-dr-city']

Through initial results, the most important hyperparameter going forward will be the max_df. When we had it set at .9 we received too many generic words in our topics. So we lowered it to .8. We will try lowering it even further with future work.

The topics that we see with 10 topics are 
1. Pandemic and New York City
2. Unclear, ends with feminine pronouns
3. US politics
4. Unclear, ends with masculine pronouns
5. Sports
6. Ukraine-Russia Conflict
7. Pronouns, could be editirials?
8. ketanji brown jackson supreme court
9. Covid and vaccines
10. Gas prices rising

In [32]:
#creates year column for dynamic topic modeling
df_copy['year'] = df_copy.url.apply(lambda x : x[1:11])

In [36]:
df_copy.drop(df_copy[df_copy['year'] == 'live/2022/'].index, inplace = True)

In [39]:
df_copy['year'] = pd.to_datetime(df_copy["year"])

In [40]:
df_copy.to_csv('converted_data.csv')