In [16]:
import sklearn as sk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import seaborn as sns

<h1> Preprocessing </h1>

In [17]:
# Fetch the training split of the 20 Newsgroups corpus with headers, footers
# and quoted replies stripped from every post.
# NOTE: `remove` is documented as a tuple. The previous `(['headers', ...])`
# was only a parenthesised list, which recent scikit-learn releases reject
# when they validate parameter types; a real tuple works on old and new versions.
news_groups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

In [18]:
# Initialize the count vectorizer with its default settings
count_vect = CountVectorizer()

In [19]:
# Fit the vectorizer on the training documents and convert them
# into the count matrix used by the following cells
X_train_counts = count_vect.fit_transform(news_groups.data)

In [20]:
#tf (term frequency): for each document, how often each word occurs in that document.
#idf (inverse document frequency): words that appear in many documents are less informative,
#so they are given lower weights, which ideally makes the resulting features more discriminative.

In [27]:
# Re-weight the raw count matrix as tf-idf values.
# The transformer is kept in `tfidf_transformer` because later cells reuse
# the fitted object to transform new documents.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tf_idf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tf_idf.shape

(11314, 101631)

<h1> Logistic Regression Classifier </h1>

In [28]:
# clf = classifier.
# Configure the logistic regression model first, then fit it on the
# tf-idf training matrix and the newsgroup labels.
clf = LogisticRegression(
    random_state=0,
    multi_class='auto',
    solver='liblinear',
)
clf = clf.fit(X_train_tf_idf, news_groups.target)

In [29]:
# Smoke test: classify two hand-written sentences with the fitted
# logistic regression model, reusing the already-fitted vectorizer and
# tf-idf transformer so the features match the training vocabulary.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    # `category` is an integer class index; look up its newsgroup name.
    label = news_groups.target_names[category]
    print('%r => %s' % (doc, label))



'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [30]:
# Bundle vectorizer -> tf-idf transformer -> classifier into one pipeline
# so raw text can be passed straight to fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        LogisticRegression(
            random_state=0,
            multi_class='auto',
            solver='liblinear',
        ),
    ),
])

In [31]:
# Fit the whole logistic-regression pipeline on the raw training documents
# and their labels; the fitted pipeline's repr is shown as the cell output.
text_clf.fit(news_groups.data, news_groups.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
  

In [32]:
# Evaluate the logistic-regression pipeline on the test split.
# The test split is loaded with the same preprocessing options as the
# training split. `remove` is passed as a real tuple: the earlier
# `(['headers', ...])` was a parenthesised list, which recent scikit-learn
# releases reject during parameter validation.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)


0.6775092936802974

<h1> Decision Tree Classifier </h1>

In [35]:
# Configure the decision tree classifier, then fit it on the same
# tf-idf training matrix and labels used for logistic regression.
clf_DTC = DecisionTreeClassifier(random_state=0)
clf_DTC = clf_DTC.fit(X_train_tf_idf, news_groups.target)

In [36]:
# Smoke test: classify the same two hand-written sentences with the
# fitted decision tree, reusing the fitted vectorizer and tf-idf transformer.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf_DTC.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    # `category` is an integer class index; look up its newsgroup name.
    label = news_groups.target_names[category]
    print('%r => %s' % (doc, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.os.ms-windows.misc


In [42]:
# Bundle vectorizer -> tf-idf transformer -> decision tree into one pipeline
# so raw text can be passed straight to fit/predict.
text_clf_DTC = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_DTC', DecisionTreeClassifier(random_state=0)),
])

In [43]:
# Fit the whole decision-tree pipeline on the raw training documents
# and their labels; the fitted pipeline's repr is shown as the cell output.
text_clf_DTC.fit(news_groups.data, news_groups.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_DTC',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
                        

In [44]:
# Evaluate the decision-tree pipeline on the test split.
# The test split is loaded with the same preprocessing options as the
# training split. `remove` is passed as a real tuple: the earlier
# `(['headers', ...])` was a parenthesised list, which recent scikit-learn
# releases reject during parameter validation.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_DTC.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)


0.40480616038236855

<h1> Support Vector Machine Classifier </h1> 

<h1> AdaBoost Classifier </h1>

<h1> Random Forest Classifier </h1> 