In [1]:
import numpy as np
from sklearn.decomposition import NMF
import pandas as pd

In [2]:
# Users: one label per column of the ratings matrix.
columns = ['u1', 'u2', 'u3', 'u4']

# Movies: one label per row of the ratings matrix, in the same order as `data`.
index = [
    'Titanic',
    'Tiffany',
    'Terminator',
    'Star Trek',
    'Star Wars',
    'Shrek',
]

# Ratings: each inner list is one movie, each position in it is one user.
data = [
    [5, 3, 0, 1],
    [4, 2, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
    [3, 3, 3, 3],
]

### Creating a DataFrame

In [3]:
# Label the raw rating rows with movie titles and the columns with user ids.
ratings = pd.DataFrame(
    data,
    index=index,
    columns=columns,
)

In [18]:
# Show the first six rows, which here is the whole table.
ratings.head(n=6)

Unnamed: 0,u1,u2,u3,u4
Titanic,5,3,0,1
Tiffany,4,2,0,1
Terminator,1,1,0,5
Star Trek,1,0,0,4
Star Wars,0,1,5,4
Shrek,3,3,3,3


In [5]:
# Extract the ratings as a plain NumPy array (rows = movies, columns = users)
# to feed into the scikit-learn estimator.
X = ratings.values

In [6]:
# Display the matrix that will be factorized.
X

array([[5, 3, 0, 1],
       [4, 2, 0, 1],
       [1, 1, 0, 5],
       [1, 0, 0, 4],
       [0, 1, 5, 4],
       [3, 3, 3, 3]])

In [7]:
# NMF approximates the non-negative matrix X by a product of two non-negative
# factors. With the names used in the following cells this is X ~ Q.P, where
# Q = model.transform(X) and P = model.components_.
# n_components=3 asks for three latent features; random_state pins the
# random initialization selected by init='random'.
model = NMF(n_components=3, init='random', random_state=10)


In [8]:
# Display the estimator's configuration (it has not been fitted yet at this point).
model

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0,
  max_iter=200, n_components=3, random_state=10, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [9]:
# Learn the factorization from the ratings matrix. fit() hands back the
# estimator itself, which is why its repr is shown as the cell output.
model.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0,
  max_iter=200, n_components=3, random_state=10, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [10]:
# H in the usual X ~ W.H notation: one row per latent feature, one column per
# user (the columns of X are users).
P = model.components_

In [11]:
P # Strength of the association between each latent feature (rows) and each user (columns)
  # Here: 3 features x 4 users

array([[0.        , 0.30119006, 1.23059224, 0.        ],
       [1.14187221, 0.68290275, 0.        , 0.22828247],
       [0.        , 0.        , 0.        , 1.08951129]])

In [12]:
# W in the usual X ~ W.H notation: one row per movie, one column per latent
# feature (the rows of X are movies).
Q = model.transform(X)      # association strength between movie and feature


In [13]:
# Display the movie-by-feature weights.
Q

array([[1.37113653e-03, 4.38231312e+00, 0.00000000e+00],
       [0.00000000e+00, 3.35171135e+00, 2.15567331e-01],
       [5.63885328e-02, 1.02426274e+00, 4.37460246e+00],
       [0.00000000e+00, 6.45042660e-01, 3.53621675e+00],
       [4.02109588e+00, 0.00000000e+00, 3.67137086e+00],
       [2.50401168e+00, 2.80149849e+00, 2.16653743e+00]])

In [14]:
# Reconstruction error stored on the fitted model. The estimator was built with
# the default beta_loss='frobenius', so this is the Frobenius norm of the
# difference between X and its approximation Q.P.
print(model.reconstruction_err_)

0.8336106077106371


In [15]:
# Multiply the two factors back together to obtain the model's approximation
# of the original ratings matrix (movies x users).
nR = Q.dot(P)
print(nR)

[[5.00404157e+00 2.99310667e+00 1.68730997e-03 1.00040528e+00]
 [3.82722604e+00 2.28889291e+00 0.00000000e+00 1.00000000e+00]
 [1.16957716e+00 7.16455511e-01 6.93912907e-02 5.00000000e+00]
 [7.36556287e-01 4.40501409e-01 0.00000000e+00 4.00000000e+00]
 [0.00000000e+00 1.21111413e+00 4.94832937e+00 4.00000000e+00]
 [3.19895328e+00 2.66733448e+00 3.08141733e+00 3.00000000e+00]]


In [36]:
# Score every movie for a new user from a 0/1 vector over the movies.
# `new` has one flag per movie, in the same order as the rows of Q
# (presumably 1 marks a movie the new user has already seen -- confirm).
new = np.array([0, 0, 1, 0, 1, 1])
segments = new * Q.T  # broadcast: keep the feature weights of flagged movies, zero out the rest
user_cat = segments.sum(axis=1)  # total weight per latent feature = the new user's feature profile
movie_sums = user_cat * Q  # scale each movie's feature weights by the user's profile
movie_sums.sum(axis=1)  # one score per movie
# TODO: rank the scores and output the best-scoring movie that the user hasn't seen yet

array([16.77470778, 15.02433103, 48.96538031, 38.58143078, 63.95874122,
       49.32379428])

## Topic Modelling

### Importing Data

In [44]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups posts with headers, footers and quoted replies
# stripped, shuffled with a fixed seed so the document order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Raw post texts; this is what gets vectorized.
documents = dataset.data

### Preprocessing Data

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [49]:
no_features = 1000  # cap on the vocabulary size (passed to the vectorizer as max_features), not the number of topics

In [None]:
# TF-IDF vectorizer over a pruned vocabulary:
#   max_df=0.95    -> ignore terms that occur in more than 95% of the documents
#   min_df=2       -> ignore terms that occur in fewer than 2 documents
#   max_features   -> keep at most `no_features` terms
#   stop_words     -> drop the built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)

In [None]:
# Learn the vocabulary from the posts and build the document-term TF-IDF
# matrix in one pass (one row per document, one column per kept term).
tfidf = tfidf_vectorizer.fit_transform(documents) 

In [52]:
# Vocabulary terms, ordered by column index of the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` where it exists and fall back to the old
# method on older installations so this cell runs on either.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # The newer API returns an ndarray; convert so the result stays a plain
    # list of strings, as the old method returned.
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [79]:
from sklearn.decomposition import NMF 

### Applying Matrix Factorization

In [80]:

no_topics = 30  # number of NMF components, i.e. topics to extract

# Factorize the TF-IDF matrix into `no_topics` components. The `.fit` call is
# chained onto the constructor so the cell ends in an assignment and shows no
# output.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `alpha_W` / `alpha_H`; this call needs an older release.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)


In [81]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every component of a fitted model.

    For each row of ``model.components_`` a header line with the row number is
    printed, followed by one line holding the top terms separated by spaces,
    strongest first.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays that
        support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    no_top_words : int
        How many terms to print per component.
    """
    for component_no, weights in enumerate(model.components_):
        print("Topic % d:" % (component_no))
        # argsort is ascending, so walk it backwards from the end to get the
        # `no_top_words` largest weights in descending order.
        strongest_first = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[column] for column in strongest_first]
        print(" ".join(top_terms))

In [82]:
no_top_words = 10  # how many of the highest-weighted terms to print for each topic

In [83]:
# Print the top terms of every NMF topic so each one can be inspected and labelled by hand.
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic  0:
people right government said say make way point believe law
Topic  1:
problem problems using error apple screen fine work try having
Topic  2:
god jesus bible christ faith believe christian christians sin lord
Topic  3:
game team year games season players play hockey win league
Topic  4:
used software 10 need sale data offer available using mail
Topic  5:
thanks advance mail hi looking info appreciated help know anybody
Topic  6:
windows dos ms version running using drivers os run driver
Topic  7:
edu soon cs university article internet ftp email david pub
Topic  8:
key chip clipper keys encryption escrow government public algorithm nsa
Topic  9:
drive scsi drives hard disk ide controller floppy cd tape
Topic  10:
just ll thought tell work little oh wanted maybe mean
Topic  11:
does know anybody mean help say work doesn exist program
Topic  12:
card video monitor drivers cards bus vga driver color mode
Topic  13:
like sounds looks look things lot sound really thing doesn
Topi