<a href="https://colab.research.google.com/github/khuloodnasher/Non-negative-matrix-factorization-/blob/main/Non_negative_matrix_factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the NPR articles CSV from the mounted Drive into a DataFrame and preview its first rows.
import pandas as pd

npr_csv_path = '/content/drive/My Drive/npr.csv'
npr = pd.read_csv(npr_csv_path)
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Peek at the last 15 rows of the articles frame.
npr.iloc[-15:]

Unnamed: 0,Article
11977,"Elections aren’t exactly cozy, even in the bes..."
11978,"Although her oldest child, Ben, is 10 years ol..."
11979,"When a political scandal explodes in France, t..."
11980,The darkest moment for American police this ye...
11981,Russia was ordered to vacate two compounds it ...
11982,A North Carolina judge is temporarily blocking...
11983,China’s police are under fire this week as cit...
11984,"Before the virus overwhelmed Puerto Rico, Zika..."
11985,"It seems everything today has a flavor wheel, ..."
11986,In the final days of a year that has become kn...


In [5]:
# Turn the raw article text into a TF-IDF document-term matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that appear in fewer than 2 documents
    max_df=0.95,           # ignore terms that appear in more than 95% of documents
)
article_texts = npr['Article']
dtm = tfidf.fit_transform(article_texts)
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [6]:
# Fit a 7-topic NMF model on the TF-IDF matrix; random_state pins the initialisation
# so repeated runs give the same factorisation.
from sklearn.decomposition import NMF

nmf_model = NMF(random_state=42, n_components=7).fit(dtm)  # fit() returns the estimator itself
nmf_model

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [7]:
# Size of the learned vocabulary (one entry per column of dtm).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
len(feature_names)

54777

In [8]:
# Print 10 randomly chosen vocabulary terms (drawn with replacement) as a sanity check.
import random

# Resolve the vocabulary once instead of rebuilding it on every iteration.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

for i in range(10):
    # Bound the draw by the actual vocabulary size rather than a hard-coded maximum index.
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

replayed
shovelful
airborne
hierarchies
wobbly
ageism
luddites
gasser
nowruz
commenters


In [9]:
# Number of rows in the components matrix, i.e. one row per fitted topic.
nmf_model.components_.shape[0]

7

In [10]:
# Length of a single topic row of components_: one weight per vocabulary term.
len(nmf_model.components_[0])

54777

In [11]:
# Weight vector of the first topic (row 0 of the fitted NMF components).
single_topic = nmf_model.components_[0]

# argsort() returns the indices that would sort the weights in ascending order,
# so the last entries are the indices of the highest-weighted terms.
single_topic.argsort()

array([    0, 27208, 27206, ..., 36283, 54692, 42993])

In [12]:
# Number of weights in the topic vector (one per vocabulary term).
single_topic.shape[0]

54777

In [13]:
# Weight of the term at vocabulary index 18302 within this topic.
single_topic[18302]

0.0

In [14]:
# Weight at index 42993, which was the last entry of the ascending argsort in the
# recorded run, i.e. the highest-weighted term of this topic.
single_topic[42993]

2.005055165418588

In [15]:
# Indices of the 10 largest weights in this topic (ascending order, strongest last).
single_topic.argsort()[-10:]

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993])

In [16]:
# Indices of the 10 highest-weighted terms of the topic (ascending, strongest last).
top_word_indices = single_topic.argsort()[-10:]

# Resolve the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [17]:
# viewing top 15 words in each topic out of 7 topics

# Resolve the vocabulary once; the original rebuilt it for every word of every topic.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 indices are the strongest terms (strongest last).
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')



THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

In [18]:
# Number of articles (rows) in the frame.
npr.shape[0]

11992

In [19]:
# Project every document of the TF-IDF matrix onto the fitted NMF topics.
topic_results = nmf_model.transform(dtm)

# One row per document, one column per topic.
topic_results.shape

(11992, 7)

In [20]:
# Topic weights of the first article, rounded to 2 decimals for readability.
topic_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [21]:
# Index of the highest-weighted topic for the first article.
topic_results[0].argmax()

1

In [22]:
# Show the first articles again, to compare their text with the topic indices computed for them.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [23]:
# Dominant topic index for every article: argmax across each row's topic weights.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [24]:
# Assign each article to its dominant topic: the column index of the largest
# NMF weight in that article's row of topic_results.
npr['Topic'] = topic_results.argmax(axis=1)

# Human-readable names for the 7 topic ids
# (presumably chosen from the top words printed per topic; revisit if the model changes).
topicdict  = {0:'health',1:'election',2:'legis',3:'policy',4:'candidates',5:'music',6:'education'}

# Translate the numeric topic id into its label; ids missing from topicdict would map to NaN.
npr['Topic Label'] = npr['Topic'].map(topicdict)

npr.head(10)

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,policy
4,"From photography, illustration and video, to d...",6,educaion
5,I did not want to join yoga class. I hated tho...,5,music
6,With a who has publicly supported the debunk...,0,health
7,"I was standing by the airport exit, debating w...",0,health
8,"If movies were trying to be more realistic, pe...",0,health
9,"Eighteen years ago, on New Year’s Eve, David F...",5,music
