In [3]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
news = fetch_20newsgroups(categories=categories, shuffle=False, random_state=42)
X = news.data
y = news.target

In [4]:
X[0]

"From: keith@cco.caltech.edu (Keith Allan Schneider)\nSubject: Re: <Political Atheists?\nOrganization: California Institute of Technology, Pasadena\nLines: 14\nNNTP-Posting-Host: lloyd.caltech.edu\n\nbobbe@vice.ICO.TEK.COM (Robert Beauchaine) writes:\n\n>To show that the examples I and others\n>have provided are *not* counter examples of your supposed inherent\n>moral hypothesis, you have to successfully argue that\n>domestication removes or alters this morality.\n\nI think that domestication will change behavior to a large degree.\nDomesticated animals exhibit behaviors not found in the wild.  I\ndon't think that they can be viewed as good representatives of the\nwild animal kingdom, since they have been bred for thousands of years\nto produce certain behaviors, etc.\n\nkeith\n"

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)

In [6]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>','', text)        
    text = re.sub('[\W]+',' ', text)         
    text = text.lower()             
    return text

In [7]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def tokenizer_stemmer(text):
    text_tokens = word_tokenize(text)
    return [stemmer.stem(word) for word in text_tokens]


In [8]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kocan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,       
                        tokenizer=tokenizer_stemmer,      
                        stop_words=stop,     
                        min_df=10,
                        max_df=0.1
                       )
X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)



In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', verbose=1)
lr.fit(X_train_vector, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


In [11]:
print(f'train accuracy = {lr.score(X_train_vector, y_train)}')
print(f'test accuracy = {lr.score(X_test_vector, y_test)}')

train accuracy = 0.9911336288790373
test accuracy = 0.9542772861356932


In [33]:
import numpy as np
max_val = np.max(lr.coef_, axis=1)
idx = np.where(np.max(max_val, axis=0) == max_val)
print(f'most important term : {tfidf.get_feature_names_out()[np.where(lr.coef_[idx[0][0]] ==  np.max(lr.coef_[idx[0][0]], axis = 0))][0]}')

most important term : graphic


In [34]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_train_vector, y_train)

In [35]:
print(f'train accuracy = {tree.score(X_train_vector, y_train)}')
print(f'test accuracy = {tree.score(X_test_vector, y_test)}')

train accuracy = 0.8176060797973401
test accuracy = 0.7389380530973452


In [36]:
importances = tree.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(10):
    print("%2d. %-30s %f" % (f+1, 
                             [w for w, n in tfidf.vocabulary_.items() if n == indices[f]],
                             importances[indices[f]]))

 1. ['graphic']                    0.136507
 2. ['christ']                     0.092242
 3. ['keith']                      0.078496
 4. ['islam']                      0.064112
 5. ['church']                     0.062738
 6. ['file']                       0.053615
 7. ['pitt']                       0.048363
 8. ['doctor']                     0.041248
 9. ['atheism']                    0.038496
10. ['faith']                      0.034632


In [37]:
tweets = ['The outbreak was declared a global pandemic by the World Health Organization (WHO) on 11 March.',
          'Today, computer graphics is a core technology in digitalphotography, film, video games, cell phone and computer displays,and many specialized applications.',
          'Arguments for atheism range from philosophical to social and historical approaches.',
          'The Bible is a compilation of many shorter books written at different times by a variety of authors, and later assembled into the biblical canon.'
]


tweets_tfidf = tfidf.transform(tweets)


y_pred = lr.predict(tweets_tfidf)

for i in range(len(tweets)):
    if y_pred[i] == 0:
        print(tweets[i], "--> Negative")
    else:
        print(tweets[i], "--> Positive")


The outbreak was declared a global pandemic by the World Health Organization (WHO) on 11 March. --> Positive
Today, computer graphics is a core technology in digitalphotography, film, video games, cell phone and computer displays,and many specialized applications. --> Positive
Arguments for atheism range from philosophical to social and historical approaches. --> Negative
The Bible is a compilation of many shorter books written at different times by a variety of authors, and later assembled into the biblical canon. --> Positive


In [38]:
tweets = ['The outbreak was declared a global pandemic by the World Health Organization (WHO) on 11 March.',
          'Today, computer graphics is a core technology in digitalphotography, film, video games, cell phone and computer displays,and many specialized applications.',
          'Arguments for atheism range from philosophical to social and historical approaches.',
          'The Bible is a compilation of many shorter books written at different times by a variety of authors, and later assembled into the biblical canon.'
]

tweets_tfidf = tfidf.transform(tweets)

y_pred = tree.predict(tweets_tfidf)

for i in range(len(tweets)):
    if y_pred[i] == 0:
        print(tweets[i], "--> Negative")
    else:
        print(tweets[i], "--> Positive")

The outbreak was declared a global pandemic by the World Health Organization (WHO) on 11 March. --> Positive
Today, computer graphics is a core technology in digitalphotography, film, video games, cell phone and computer displays,and many specialized applications. --> Positive
Arguments for atheism range from philosophical to social and historical approaches. --> Negative
The Bible is a compilation of many shorter books written at different times by a variety of authors, and later assembled into the biblical canon. --> Positive
