In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [29]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter',
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title',
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class', 'nntp', '00',
                'posting', '@', 'edu', 'com', '000', 'cs', 'net', 'from', 'subject', 'organization', 'uiuc', 'morris',
                'tc', 'rose', 'jhunix', 'rose', 'duke', 'hulman', 'hcf', 'cc', 'bob', 'cornell', 'stanford', 'hp', 'ca',
                'netcom', 'williams', 'university', 'ted', 'aix', 'ibm', 'scott', 'roger', 'vb30', 'lafibm', '\n'])

In [30]:
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [31]:
X[0]

<1x186441 sparse matrix of type '<class 'numpy.float64'>'
	with 215 stored elements in Compressed Sparse Row format>

In [43]:
print(X[0])

  (0, 51333)	0.0742541584225
  (0, 187193)	0.0742541584225
  (0, 29035)	0.0742541584225
  (0, 144367)	0.0742541584225
  (0, 62630)	0.0742541584225
  (0, 76217)	0.0742541584225
  (0, 131561)	0.0742541584225
  (0, 101515)	0.0742541584225
  (0, 65020)	0.0742541584225
  (0, 112745)	0.0742541584225
  (0, 161266)	0.0742541584225
  (0, 179185)	0.0742541584225
  (0, 117938)	0.0742541584225
  (0, 62770)	0.0742541584225
  (0, 102007)	0.0742541584225
  (0, 93837)	0.0742541584225
  (0, 131476)	0.0742541584225
  (0, 141507)	0.0742541584225
  (0, 96457)	0.0742541584225
  (0, 113386)	0.0742541584225
  (0, 163663)	0.0742541584225
  (0, 59258)	0.0742541584225
  (0, 51373)	0.0742541584225
  (0, 35846)	0.0742541584225
  (0, 62756)	0.0742541584225
  :	:
  (0, 82873)	0.0346001351527
  (0, 183717)	0.0157169729261
  (0, 141755)	0.0558001979418
  (0, 120623)	0.111600395884
  (0, 37644)	0.100073036534
  (0, 40876)	0.105674518346
  (0, 16587)	0.0742541584225
  (0, 93223)	0.111600395884
  (0, 25839)	0.0162563292

In [32]:
X.shape

(994, 186441)

In [33]:
lsa = TruncatedSVD(n_components=100, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=100,
       random_state=None, tol=0.0)

In [34]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
year
team
would
game
writes
article
baseball
players
games
one
 
Concept 1:
jewish
jewish baseball
baseball players
jewish baseball players
come
lafayette
baseball
players
play
lowenstein
 
Concept 2:
clutch
hit
sabo
samuel
performance
year
hitter
situations
may
well
 
Concept 3:
03
02
games
lost
04
really
hitter
05
01
colorado
 
Concept 4:
fame
dave
sox
well
hall
kingman
04
hall fame
clutch
pitcher
 
Concept 5:
gant
hirschbeck
hall
bonds
teams
two
hall fame
kingman
right
want
 
Concept 6:
anyone
let
would
pitcher
hall
game
players
hirschbeck
credit
gant
 
Concept 7:
runs
time
anyone
well
defensive
rather
year
ball
001 100
last
 
Concept 8:
team
bonds
best
series
lopez
win
fan
performance
giants
david
 
Concept 9:
alomar
let
news
bad
look
go
mets
home
runs
young
 
Concept 10:
year
smith
gant
toronto
could
ball
team
better
pitching
way
 
Concept 11:
league
got
mail
believe
right
want
come
teams
alomar
mark
 
Concept 12:
might
could
play
game
david
pitch
long
people
average
ma

In [28]:
corpus[40]

"From: dansmith@mcopn2.dseg.ti.com (Danny Smith)\nSubject: Braves win opener\nNntp-Posting-Host: s355.dseg.ti.com\nReply-To: dansmith@mcopn2.dseg.ti.com\nOrganization: Texas Instruments, Inc.\nLines: 15\n\nWell, Maddux looked excellent as the Braves shutout the Cubs 1 - 0.\nJustice drove in the only run with an RBI single in the first. Get\nready for him to have a monster year. He is now hitting the ball to\nthe opposite field with a lot of power to go with his natural \npower to right field and his good batting eye. If he stays healthy\nwhich he should (his back is full strength this year) he should\nget over 100 RBI and close to 30 HR.\n\nIn another note, the Marlins got off to a good start beating the\nDodgers. I believe the score was 6 - 3 but I'm not sure. I wish\nthem and the Rockies well this year. Hell. I think it would be\nfunny to watch the Dodgers hit the cellar again this year.\n\nDan Smith\n\n"