In [29]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups

In [30]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doranala493\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data


In [32]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class',  ])

In [33]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [34]:
X[0]
print X[0]

  (0, 51460)	0.0734463096625
  (0, 189431)	0.0734463096625
  (0, 28954)	0.0734463096625
  (0, 146556)	0.0734463096625
  (0, 64407)	0.0734463096625
  (0, 77917)	0.0734463096625
  (0, 133158)	0.0734463096625
  (0, 103424)	0.0734463096625
  (0, 66793)	0.0734463096625
  (0, 114520)	0.0734463096625
  (0, 163261)	0.0734463096625
  (0, 181100)	0.0734463096625
  (0, 119624)	0.0734463096625
  (0, 64543)	0.0734463096625
  (0, 103918)	0.0734463096625
  (0, 95532)	0.0734463096625
  (0, 133073)	0.0734463096625
  (0, 143720)	0.0734463096625
  (0, 98126)	0.0734463096625
  (0, 115161)	0.0734463096625
  (0, 165620)	0.0734463096625
  (0, 59664)	0.0734463096625
  (0, 51500)	0.0734463096625
  (0, 35721)	0.0734463096625
  (0, 64533)	0.0734463096625
  :	:
  (0, 143954)	0.0551931192048
  (0, 122291)	0.11038623841
  (0, 37510)	0.0989842910659
  (0, 40680)	0.104524831508
  (0, 16529)	0.0734463096625
  (0, 94943)	0.11038623841
  (0, 25779)	0.0160794683389
  (0, 100805)	0.010248916482
  (0, 163850)	0.06068403802

In [35]:
X.shape

(994, 190159)

In [36]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [37]:
lsa.components_[0]

array([ 0.01601607,  0.00499136,  0.0008138 , ...,  0.00102587,
        0.00102587,  0.00102587])

In [38]:
import sys
print (sys.version)

2.7.11 |Anaconda 2.5.0 (64-bit)| (default, Jan 29 2016, 14:26:21) [MSC v.1500 64 bit (AMD64)]


In [39]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
edu
com
year
writes
team
would
game
article
re
cs
 
Concept 1:
year
organization
00 00 baltimore
really
00 00 chicago
would
think
distribution
last
jays
 
Concept 2:
aix
writes
go
like
roger
10
ibm
team
would
kingston
 
Concept 3:
edu
team
last year
like
two
runs
nntp posting
game
jays
cs
 
Concept 4:
think
00
also
subject re
baseball
one
years
big
00 00
get
 
Concept 5:
hit
year
edu
lines
play
win
00 00 american
ball
first
see
 
Concept 6:
00
00 00
00 00 00
00 00 chicago
last
first
00 00 01
like
cs
games
 
Concept 7:
game
would
games
player
think
even
series
second
00 00 baltimore
gant
 
Concept 8:
like
well
game
00 00 000
even
alomar
year
would
home
00 00 american
 
Concept 9:
university
like
back
host
pitcher
good
win
one
first
baseball
 
Concept 10:
university
organization
subject
article
edu
subject re
lines
writes
re
cs
 
Concept 11:
team
know
morris
lines
00
game
john
people
might
would
 
Concept 12:
com
re
00 00 baltimore
subject
much
lines
player
00 00 00
games
even

In [40]:
lsa.components_

array([[ 0.01601607,  0.00499136,  0.0008138 , ...,  0.00102587,
         0.00102587,  0.00102587],
       [ 0.00860166,  0.00250451,  0.05211497, ..., -0.00119403,
        -0.00119403, -0.00119403],
       [ 0.01734904,  0.00914593,  0.01823258, ...,  0.00038301,
         0.00038301,  0.00038301],
       ..., 
       [-0.01458689,  0.01366209,  0.0772346 , ..., -0.00183074,
        -0.00183074, -0.00183074],
       [-0.02065688, -0.01332887,  0.0925238 , ...,  0.00207076,
         0.00207076,  0.00207076],
       [-0.07795267, -0.02435029,  0.20074849, ..., -0.00048796,
        -0.00048796, -0.00048796]])