In [None]:
import sklearn as sk
import re
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import seaborn as sns

<h1> Preprocessing </h1>

In [None]:
# Fetch the training split of the 20 Newsgroups corpus.
# `remove` is passed as a tuple: that is its documented type, and newer
# scikit-learn releases validate parameter types, so a list may be rejected.
# A fixed random_state keeps the shuffled order reproducible.
news_groups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

In [None]:
# Create the bag-of-words vectorizer (default settings) that is shared by the
# stand-alone classifiers below.
count_vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training documents and turn them into a
# document-term count matrix.
X_train_counts = count_vect.fit_transform(raw_documents=news_groups.data)

In [None]:
# tf (term frequency): for each document, measures how often each word occurs in that document.
# idf (inverse document frequency): accounts for the fact that words appearing in many documents
# are less useful for telling documents apart, so it gives them lower weights (ideally making the weighting more accurate).

In [None]:
# Convert the raw term counts into tf-idf weights.
# The transformer is kept in its own variable because later cells reuse it to
# transform new documents.
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tf_idf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tf_idf.shape

In [None]:
# Build the custom stop-word list from ./function_words.txt.
# Each line has its digits removed and surrounding whitespace stripped; lines
# that end up empty are skipped so no empty string lands in the stop-word list.
stopping_words = []
with open('./function_words.txt', 'r') as f:
    for line in f:
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        word = re.sub(r'\d', '', line).strip()
        if word:
            stopping_words.append(word)
# Longest entries first (same ordering as before).
stopping_words = sorted(stopping_words, key=len, reverse=True)

<h1> Logistic Regression Classifier </h1>

In [None]:
# clf = classifier.
# Train a logistic regression model (liblinear solver, fixed seed) on the
# tf-idf features of the training split.
clf = LogisticRegression(
    random_state=0,
    multi_class='auto',
    solver='liblinear',
).fit(X_train_tf_idf, news_groups.target)

In [None]:
# Smoke test: classify two hand-written sentences with the logistic regression
# model, reusing the vectorizer and tf-idf transformer fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Show each sentence next to the name of its predicted newsgroup.
for text, label_id in zip(docs_new, predicted):
    print(f'{text!r} => {news_groups.target_names[label_id]}')



In [None]:
# Chain vectorizer -> tf-idf transformer -> logistic regression into a single
# estimator so raw text can be passed straight to fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=0, multi_class='auto', solver='liblinear')),
])

In [None]:
# Train the logistic regression pipeline on the raw training documents.
text_clf.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the logistic regression pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)


<h1> Decision Tree Classifier </h1>

In [None]:
# Train a decision tree (fixed seed) on the tf-idf features of the training split.
clf_DTC = DecisionTreeClassifier(
    random_state=0,
).fit(X_train_tf_idf, news_groups.target)

In [None]:
# Smoke test: classify two hand-written sentences with the decision tree,
# reusing the vectorizer and tf-idf transformer fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf_DTC.predict(X_new_tfidf)

# Show each sentence next to the name of its predicted newsgroup.
for text, label_id in zip(docs_new, predicted):
    print(f'{text!r} => {news_groups.target_names[label_id]}')

In [None]:
# Chain vectorizer -> tf-idf transformer -> decision tree into a single
# estimator so raw text can be passed straight to fit/predict.
# The tree uses the gini criterion and is capped at a depth of 500.
text_clf_DTC = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf_DTC', DecisionTreeClassifier(random_state=0, criterion='gini', max_depth=500)),
])

In [None]:
# Train the decision tree pipeline on the raw training documents.
text_clf_DTC.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the decision tree pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_DTC.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)


<h1> Support Vector Machine Classifier </h1> 

In [None]:
# Train a linear SVM (hinge loss, L2 penalty, C=10, fixed seed) on the tf-idf
# features of the training split.
clf_SVM = LinearSVC(
    penalty='l2',
    loss='hinge',
    tol=1e-4,
    C=10,
    random_state=0,
)
clf_SVM.fit(X_train_tf_idf, news_groups.target)

In [None]:
# Smoke test: classify two hand-written sentences with the linear SVM,
# reusing the vectorizer and tf-idf transformer fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf_SVM.predict(X_new_tfidf)

# Show each sentence next to the name of its predicted newsgroup.
for text, label_id in zip(docs_new, predicted):
    print(f'{text!r} => {news_groups.target_names[label_id]}')

In [None]:
# Chain vectorizer -> tf-idf transformer -> linear SVM into a single estimator
# so raw text can be passed straight to fit/predict.
# random_state is pinned (as for the stand-alone SVM above) so repeated runs of
# the notebook give the same model and the same reported accuracy.
text_clf_SVM = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=0, tol=1e-4, loss='hinge', penalty='l2', C=1)),
])

In [None]:
# Train the linear SVM pipeline on the raw training documents.
text_clf_SVM.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the linear SVM pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_SVM.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)

<h1> AdaBoost Classifier </h1>

In [None]:
# Train an AdaBoost ensemble (100 estimators, fixed seed) on the tf-idf
# features of the training split.
clf_ADA = AdaBoostClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train_tf_idf, news_groups.target)

In [None]:
# Smoke test: classify two hand-written sentences with the AdaBoost model,
# reusing the vectorizer and tf-idf transformer fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf_ADA.predict(X_new_tfidf)

# Show each sentence next to the name of its predicted newsgroup.
for text, label_id in zip(docs_new, predicted):
    print(f'{text!r} => {news_groups.target_names[label_id]}')

In [None]:
# Chain vectorizer -> tf-idf transformer -> AdaBoost into a single estimator so
# raw text can be passed straight to fit/predict.
# The final step is named 'clf' (it was previously labelled 'clf_DTC', a
# leftover from the decision tree pipeline), so parameter paths such as
# 'clf__n_estimators' describe the estimator they actually address.
text_clf_ADA = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', AdaBoostClassifier(n_estimators=100, random_state=0)),
])

In [None]:
# Train the AdaBoost pipeline on the raw training documents.
text_clf_ADA.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the AdaBoost pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_ADA.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)

<h1> Random Forest Classifier </h1> 

In [None]:
# Train a random forest on the tf-idf features of the training split.
# random_state is pinned, matching the other classifiers in this notebook, so
# the forest (and its predictions) are the same on every run.
clf_RF = RandomForestClassifier(random_state=0)
clf_RF.fit(X_train_tf_idf, news_groups.target)

In [None]:
# Smoke test: classify two hand-written sentences with the random forest,
# reusing the vectorizer and tf-idf transformer fitted on the training data.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf_RF.predict(X_new_tfidf)

# Show each sentence next to the name of its predicted newsgroup.
for text, label_id in zip(docs_new, predicted):
    print(f'{text!r} => {news_groups.target_names[label_id]}')

In [None]:
# Chain vectorizer -> tf-idf transformer -> random forest into a single
# estimator so raw text can be passed straight to fit/predict.
# The vectorizer uses the custom stop-word list loaded from function_words.txt.
# random_state is pinned so the reported test accuracy is reproducible.
text_clf_RF = Pipeline([
    ('vect', CountVectorizer(stop_words=stopping_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=0)),
])


In [None]:
# Train the random forest pipeline on the raw training documents.
text_clf_RF.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the random forest pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_RF.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)

<h1> Multinomial Naive Bayes </h1>


In [None]:
# Chain vectorizer -> tf-idf transformer -> multinomial naive Bayes into a
# single estimator; the vectorizer uses the custom stop-word list.
text_clf_MNB = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words=stopping_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyper-parameter grid, keyed as '<step name>__<parameter>':
# unigrams vs. unigrams+bigrams, idf weighting on/off, two smoothing strengths.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

# Exhaustive cross-validated search over the grid, using all available cores.
gs_clf = GridSearchCV(estimator=text_clf_MNB, param_grid=parameters, n_jobs=-1)
gs_clf.fit(news_groups.data, news_groups.target)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Apply the hyper-parameters selected by the grid search before the final fit.
# Without this the pipeline is trained (and later evaluated) with its default
# settings, and the outcome of the search above is never used.
text_clf_MNB.set_params(**gs_clf.best_params_)
text_clf_MNB.fit(news_groups.data, news_groups.target)

In [None]:
# Evaluate the multinomial naive Bayes pipeline on the held-out test split.
# `remove` is passed as a tuple (its documented type) so the call also works on
# scikit-learn releases that validate parameter types.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_MNB.predict(docs_test)
# Accuracy: fraction of test documents whose predicted label matches the target.
np.mean(predicted == twenty_test.target)