In [None]:
import sklearn as sk
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import seaborn as sns

<h1> Preprocessing </h1>

In [None]:
# Fetch the 20 Newsgroups training split with headers, footers and quoted
# replies stripped from every post.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
news_groups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

In [None]:
# Initialise a bag-of-words vectorizer with scikit-learn's default settings.
count_vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training documents and encode them as a
# document-term matrix of token counts.
X_train_counts = count_vect.fit_transform(news_groups.data)

In [None]:
#tf (term frequency): for each document, how often each word occurs in that document.
#idf (inverse document frequency): words that appear in many documents carry less information,
#so they are given lower weights, which ideally makes the resulting features more discriminative.

In [None]:
# Fit the idf weights on the training counts, then re-weight those same counts
# into tf-idf features. The final expression displays the matrix shape.
tfidf_transformer = TfidfTransformer(use_idf=True)
tfidf_transformer.fit(X_train_counts)
X_train_tf_idf = tfidf_transformer.transform(X_train_counts)
X_train_tf_idf.shape

<h1> Logistic Regression Classifier </h1>

In [None]:
# Grid-search the regularisation strength (C) and stopping tolerance (tol) of
# logistic regression on the tf-idf features, using 3-fold cross-validation.
parameters = dict(
    C=[0.5, 1.0, 1.5, 2],
    tol=[1e-3, 1e-4, 1e-5],
)
gs_clf = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parameters,
    n_jobs=-1,
    verbose=10,
    cv=3,
)
gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Chain vectorizer -> tf-idf transformer -> logistic regression so raw text can
# be passed straight to fit/predict.
# NOTE(review): C=8 lies outside the C values searched in the grid above
# (0.5-2) - confirm it was tuned separately.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(random_state=0, solver='saga', C=8)),
    ]
)

In [None]:
# Train the logistic-regression pipeline on the raw training posts and labels.
text_clf.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted logistic-regression pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)


<h1> Decision Tree Classifier </h1>

In [None]:
# Grid-search the cost-complexity pruning strength (ccp_alpha) of a decision
# tree on the tf-idf features.
# The tree is seeded so repeated runs give the same scores, in line with the
# seeded tree (random_state=0) used in the decision-tree pipeline.
parameters = {
    'ccp_alpha': [0.0, .01, .02, .03, .04, .05, .06, .07, .08, .09],
}
gs_clf = GridSearchCV(DecisionTreeClassifier(random_state=0), parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Chain vectorizer -> tf-idf transformer -> decision tree so raw text can be
# passed straight to fit/predict.
# NOTE(review): max_depth and min_samples_split were not part of the ccp_alpha
# grid search above - confirm how these values were chosen.
text_clf_DTC = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        (
            'clf_DTC',
            DecisionTreeClassifier(random_state=0, max_depth=500, min_samples_split=170),
        ),
    ]
)

In [None]:
# Train the decision-tree pipeline on the raw training posts and labels.
text_clf_DTC.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted decision-tree pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_DTC.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)


<h1> Support Vector Machine Classifier </h1> 

In [None]:
# Grid-search the stopping tolerance (tol) and regularisation strength (C) of a
# linear SVM on the tf-idf features.
# The estimator is seeded because LinearSVC's dual coordinate-descent solver
# shuffles the data, so unseeded runs can report different scores.
parameters = {
    'tol': [1, 1e-1, 1e-2, 1e-3, 1e-4],
    'C': [1, 2, 4],
}
gs_clf = GridSearchCV(LinearSVC(random_state=0), parameters, n_jobs=-1, verbose=10)
gs_clf = gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Chain vectorizer (unigrams + bigrams) -> tf-idf transformer -> linear SVM so
# raw text can be passed straight to fit/predict.
# The SVM is seeded (random_state=0) like the other seeded pipelines in this
# notebook, so the reported test accuracy is reproducible between runs.
text_clf_SVM = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(tol=1e-1, loss='hinge', penalty='l2', C=1, random_state=0)),
])

In [None]:
# Train the linear-SVM pipeline on the raw training posts and labels.
text_clf_SVM.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted linear-SVM pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_SVM.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

<h1> AdaBoost Classifier </h1>

In [None]:
# Grid-search the number of boosting rounds for AdaBoost on the tf-idf features.
# The estimator is seeded so repeated runs give the same scores, in line with
# the seeded AdaBoost model (random_state=0) used in the AdaBoost pipeline.
# NOTE(review): the candidates are the three literal values 100, 200 and 5; if
# a range such as range(100, 200, 5) was intended, this list needs changing.
parameters = {
    'n_estimators': [100, 200, 5],
}
gs_clf = GridSearchCV(AdaBoostClassifier(random_state=0), parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Chain vectorizer -> tf-idf transformer -> AdaBoost so raw text can be passed
# straight to fit/predict.
# The final step is named 'clf_ADA'; it was previously labelled 'clf_DTC', a
# leftover from the decision-tree pipeline that mislabelled this estimator.
text_clf_ADA = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_ADA', AdaBoostClassifier(random_state=0, n_estimators=150)),
])

In [None]:
# Train the AdaBoost pipeline on the raw training posts and labels.
text_clf_ADA.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted AdaBoost pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_ADA.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

<h1> Random Forest Classifier </h1> 

In [None]:
# Grid-search tree depth, split criterion and forest size for a random forest
# on the tf-idf features.
# - The forest is seeded so repeated runs give the same scores.
# - n_jobs=-1 uses however many cores the machine has, like the other grid
#   searches in this notebook, instead of a hard-coded worker count of 20.
parameters = {
    'max_depth': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 500, 1000],
}
gs_clf = GridSearchCV(
    RandomForestClassifier(random_state=0),
    parameters,
    n_jobs=-1,
    verbose=10,
)
gs_clf = gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# List the hyperparameter names a DecisionTreeClassifier accepts.
# NOTE(review): this sits in the random-forest section - presumably a leftover
# exploration cell; confirm whether RandomForestClassifier was intended.
DecisionTreeClassifier().get_params().keys()

In [None]:
# Chain vectorizer -> tf-idf transformer -> random forest so raw text can be
# passed straight to fit/predict.
# The forest is seeded (random_state=0) like the other seeded pipelines in this
# notebook; without a seed the reported test accuracy changes between runs.
# NOTE(review): max_depth=600 lies outside the depths searched in the grid
# above (10-100) - confirm it was tuned separately.
text_clf_RF = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=600, random_state=0)),
])


In [None]:
# Train the random-forest pipeline on the raw training posts and labels.
text_clf_RF.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted random-forest pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_RF.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

<h1> Multinomial Naive Bayes </h1>


In [None]:
# Grid-search the smoothing strength (alpha) and whether class priors are
# learned (fit_prior) for multinomial naive Bayes on the tf-idf features.
parameters = dict(
    alpha=(1e-1, 1e-2, 1e-3, 1e-4, 1),
    fit_prior=(True, False),
)
gs_clf = GridSearchCV(estimator=MultinomialNB(), param_grid=parameters, n_jobs=-1)
gs_clf.fit(X_train_tf_idf, news_groups.target)
# Report the best mean cross-validated score and the settings that produced it.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Chain vectorizer (English stop words removed, unigrams + bigrams) -> tf-idf
# transformer -> multinomial naive Bayes so raw text can be passed straight to
# fit/predict.
text_clf_MNB = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False)),
    ]
)

In [None]:
# Train the naive-Bayes pipeline on the raw training posts and labels.
text_clf_MNB.fit(X=news_groups.data, y=news_groups.target)

In [None]:
# Evaluate the fitted naive-Bayes pipeline on the held-out test split.
# `remove` is passed as a tuple, which is the type scikit-learn documents for
# it; newer releases validate the argument and may reject a list.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf_MNB.predict(docs_test)
# Accuracy: fraction of test posts whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)