**(1) Original Analysis**

In [116]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#get the train data with following categories 
#remove the metadata information 
category = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories = category, remove= ('headers','footers', 'quotes'))


In [0]:
#vectorize by TfidfVectorizer with the vocabulary in the train data 

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data) #fit.transform vectorizes vocab with the train data 


In [0]:
from sklearn.naive_bayes import MultinomialNB #since there are several categories 
from sklearn import metrics 

In [0]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=category, remove= ('headers','footers', 'quotes'))
vector_test = vectorizer.transform(newsgroups_test.data) #use transform not fit_transform to generate vectors related to train data vectors 

In [121]:
clf = MultinomialNB(alpha =0.1)
clf.fit(vectors, newsgroups_train.target) #fit the model with the train data 
pred = clf.predict(vector_test)
metrics.f1_score(newsgroups_test.target, pred, average= 'macro') 

0.7564535630493667

**(2) Additional pre-processing:deleting stop words**

In [122]:
#look at the vocaulary dictionary that has been processed 
#vectorizer.vocabulary_ #dictionary which shows all the words with index as the item (key: word, item: index of word )
#look at the feature names of vocab vectors
vectorizer.get_feature_names() #feature names of dic(=key)
len(vectorizer.get_feature_names()) #26879 words inside the vocab vector

26879

In [123]:
newsgroups_train['data'][0] #the data is inside the dictionary['data] with sentences to lists 

"Hi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation.  But if you save\nto a .PRJ file their positions/orientation are preserved.  Does anyone\nknow why this information is not stored in the .3DS file?  Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych"

In [0]:
#preperation for removing stop words : 
newsgroups_train['filenames'] #an array of the data -not needed  
newsgroups_train['target_names']
newsgroups_train['target']
newsgroups_train['DESCR'] #not needed 

data = newsgroups_train['data']  #includes the news data 
target_name =newsgroups_train['target_names']  #includes the news data target_names
target = newsgroups_train['target'] #includes each data's target 


In [125]:
#delete stop words in the trainset :takes long time since goes through every sentence, token 

clean = []
for sentence in data:
  #print(sentence)
  cleanwords = []
  words = nltk.word_tokenize(sentence)
  for w in words:
    w = w.lower()
    if w not in stopwords.words('english'):
      cleanwords.append(w)
  clean.append(cleanwords)

len(clean)

2034

In [0]:
#made useful dataset only for inlcuding needed data 

train_data = {'data': clean, 'target_name': target_name, 'target': target} 



In [127]:
type(newsgroups_train['data']) # type is a list of each news words 
train_data['data'] #list
train_data['target'] #array

array([1, 3, 2, ..., 1, 0, 1])

In [0]:
#The words need to be not in a list, but in strings (should be list of words, not list of lists )
data=[]
element = ''
for lists in train_data['data']:
  for words in lists:
    element=' '.join(lists)
  data.append(element)


In [129]:
data[0]

"hi , 've noticed save model ( mapping planes positioned carefully ) .3ds file reload restarting 3ds , given default position orientation . save .prj file positions/orientation preserved . anyone know information stored .3ds file ? nothing explicitly said manual saving texture rules .prj file . 'd like able read texture rule information , anyone format .prj file ? .cel file format available somewhere ? rych"

In [130]:
data_array = np.array(data)
#train = np.concatenate((data_array, train_data['target']), axis = 1)
np_data =np.column_stack((data_array,train_data['target']))
np_data.shape #2034 list of words with  2 columns 

(2034, 2)

In [0]:
clean_vector = vectorizer.fit_transform(np_data[:,0]) #fit.transform vectorizes vocab with the train data with stop words deleted 

In [132]:
#do not need to do the same thing for the text data : since vectorize according to the train data 
clean_vector_test = vectorizer.transform(newsgroups_test.data)
clf2 = MultinomialNB(alpha =0.1)
clf2.fit(clean_vector, train_data['target']) #fit the model with the train data
pred2 = clf2.predict(clean_vector_test)
metrics.f1_score(newsgroups_test.target, pred2, average= 'macro') 

0.6902730699679155

In [133]:
#There was an easier way to just delete stop words by a parameter: 
vectorizer = TfidfVectorizer(stop_words='english')
replaced_vector = vectorizer.fit_transform(third_try)
replaced_vector_test = vectorizer.transform(newsgroups_test.data)
clf3 = MultinomialNB(alpha =0.1)
clf3.fit(replaced_vector, train_data['target']) #fit the model with the train data
pred10 = clf3.predict(replaced_vector_test)
metrics.f1_score(newsgroups_test.target, pred3, average= 'macro')  #use this version 

0.7609971497415258

In [134]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))


Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258


Deleting stop words is has a better result: use new vectorizer 

**(3) Try other preprocessing : deleting the email address**


In [135]:
#a string that includes a email address
newsgroups_train['data'] [30]

'\n  Actually, my atheism is based on ignorance.  Ignorance of the\n  existence of any god.  Don\'t fall into the "atheists don\'t believe\n  because of their pride" mistake.\n\n\n/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\ \n\nBob Beauchaine bobbe@vice.ICO.TEK.COM \n\nThey said that Queens could stay, they blew the Bronx away,\nand sank Manhattan out at sea.'

In [136]:
#try deleting the address 
def delete(sent):
  sent = re.sub(r'[\w]+\@[\w]+.[\w]+\.+[\w]+\.+[\w]+','',sent)
  sent = re.sub(r'[\w]+\@[\w]+.[\w]+\.*?[\w]*?\.*?[\w]*?','',sent)
  sent = re.sub(r'\b[\W]\b',' ',sent) #delete non-word
  sent = re.sub(r'\b\d+?\b','',sent) #delete not needed digits 
  #sent = re.sub(r'\b\d+?[\w]*\b','',sent) #delete weird digit-word 
  sent = re.sub(r'\b\_+[\w]*\b','',sent)
  sent = re.sub(r'\b[\d]*\_+\b','',sent)
  return sent

delete(newsgroups_train['data'][40]) #works
delete(newsgroups_train['data'][30]) #works

'\n  Actually, my atheism is based on ignorance.  Ignorance of the\n  existence of any god.  Don t fall into the "atheists don t believe\n  because of their pride" mistake.\n\n\n/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\/\\ \n\nBob Beauchaine  \n\nThey said that Queens could stay, they blew the Bronx away,\nand sank Manhattan out at sea.'

In [137]:
third_try =[]

for sentence in newsgroups_train['data']:
  third_try.append(delete(sentence))
third_try[40] #now train-data 


'As I was created in the image of Gaea, therefore I must be the pinnacle of creation, She which Creates, She which Births, She which Continues.\n\nOr, to cut all the religious crap, I m a woman, thanks.\nAnd it s sexism that started me on the road to atheism.\n\n-- \nMaddi Hausmann                       \nCentigram Communications Corp        San Jose California    '

In [138]:
train_data['target'] #same target data 
len(third_try) #2034 elements 

2034

In [139]:
vectorizer = TfidfVectorizer(stop_words='english')
replaced_vector = vectorizer.fit_transform(third_try)
replaced_vector_test = vectorizer.transform(newsgroups_test.data)
clf3 = MultinomialNB(alpha =0.1)
clf3.fit(replaced_vector, train_data['target']) #fit the model with the train data
pred3 = clf3.predict(replaced_vector_test)
metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') 

0.7609971497415258

In [140]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))

Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258


deleting emails is appropriate than leaving them , so ues new dataset 




**(4) Try other model: TF**


In [141]:
from sklearn.feature_extraction.text import CountVectorizer

tfvectorizer = CountVectorizer(stop_words='english')
tf_vect = tfvectorizer.fit_transform(third_try) #used email deleted data 

tf_vect_test = tfvectorizer.transform(newsgroups_test.data)
clf4 = MultinomialNB(alpha =0.1)
clf4.fit(tf_vect, train_data['target']) #fit the model with the train data
pred4 = clf4.predict(tf_vect_test)
metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') 


0.767115194879272

In [142]:
tfvectorizer2 = CountVectorizer(binary=True, stop_words='english') #unigram/bigram 

tf_vect2 = tfvectorizer2.fit_transform(third_try) #used email deleted data 

tf_vect_test2 = tfvectorizer.transform(newsgroups_test.data)
clf5= MultinomialNB(alpha =0.1)
clf5.fit(tf_vect2, train_data['target']) #fit the model with the train data
pred5 = clf5.predict(tf_vect_test2)
metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') 


0.7654156497733925

In [143]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))
print('Using TF: '+ str(metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') ))
print('Using TF binary encoding (one hot encoding): '+ str(metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') ))

Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258
Using TF: 0.767115194879272
Using TF binary encoding (one hot encoding): 0.7654156497733925


**(5) NEW MODEL: Logistic Regression**


In [0]:
#Cleaning the datasets with nice names
new_test = newsgroups_test.data
new_train = third_try #train data (list of words)
vector_train = tf_vect #vector made by train set using tf: appearance of word 
vector_test = tf_vect_test #vector made by test set using replaced_vector 
new_train_y = train_data['target'] #labels of train data 
new_test_y =newsgroups_test.target #label of test data 


In [145]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(vector_train, new_train_y)
predictions = model.predict(vector_test)
metrics.f1_score(new_test_y, predictions, average= 'macro') 

0.7105273272621087

In [146]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))
print('Using TF: '+ str(metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') ))
print('Using TF binary encoding (one hot encoding): '+ str(metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') ))
print('------Now using, TF vector matrix for modeling-------')
print('Using logistic regression : '+ str(metrics.f1_score(newsgroups_test.target, predictions, average= 'macro') ))

Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258
Using TF: 0.767115194879272
Using TF binary encoding (one hot encoding): 0.7654156497733925
------Now using, TF vector matrix for modeling-------
Using logistic regression : 0.7105273272621087


**(6) NEW MODEL: KNN**

In [147]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)


model.fit(vector_train,new_train_y)


predicted= model.predict(vector_test) 
metrics.f1_score(new_test_y, predicted, average= 'macro') #has bad f1 score 


0.3540099646415944

In [148]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))
print('Using TF: '+ str(metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') ))
print('Using TF binary encoding (one hot encoding): '+ str(metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') ))
print('------Now using, TF vector matrix for modeling-------')
print('Using logistic regression : '+ str(metrics.f1_score(newsgroups_test.target, predictions, average= 'macro') ))
print('Using KNN : '+ str(metrics.f1_score(newsgroups_test.target, predicted, average= 'macro') ))

Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258
Using TF: 0.767115194879272
Using TF binary encoding (one hot encoding): 0.7654156497733925
------Now using, TF vector matrix for modeling-------
Using logistic regression : 0.7105273272621087
Using KNN : 0.3540099646415944


In [149]:
from sklearn import svm


SVM = svm.SVC()
SVM.fit(vector_train,new_train_y)

predicts= SVM.predict(vector_test) 
metrics.f1_score(new_test_y, predicts, average= 'macro') #has not good f1 score 


0.5510312967096778

In [150]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))
print('Using TF: '+ str(metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') ))
print('Using TF binary encoding (one hot encoding): '+ str(metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') ))
print('------Now using, TF vector matrix for modeling-------')
print('Using logistic regression : '+ str(metrics.f1_score(newsgroups_test.target, predictions, average= 'macro') ))
print('Using KNN : '+ str(metrics.f1_score(newsgroups_test.target, predicted, average= 'macro') ))
print('Using SVM : '+ str(metrics.f1_score(newsgroups_test.target, predicts, average= 'macro') ))

Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258
Using TF: 0.767115194879272
Using TF binary encoding (one hot encoding): 0.7654156497733925
------Now using, TF vector matrix for modeling-------
Using logistic regression : 0.7105273272621087
Using KNN : 0.3540099646415944
Using SVM : 0.5510312967096778


**(7) NEW MODEL: TREE CLASSIFIER **

In [151]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
dt = DecisionTreeClassifier()
dt.fit(vector_train,new_train_y)

predict= dt.predict(vector_test) 
metrics.f1_score(new_test_y, predict, average= 'macro') #has not good f1 score 

0.5722461332824256

In [152]:
print('Original version: ' +str(metrics.f1_score(newsgroups_test.target, pred, average= 'macro') ))
print('Deleting stop words: ' + str(metrics.f1_score(newsgroups_test.target, pred10, average= 'macro') ))
print('Deleting email, non-words: ' + str(metrics.f1_score(newsgroups_test.target, pred3, average= 'macro') ))
print('Using TF: '+ str(metrics.f1_score(newsgroups_test.target, pred4, average= 'macro') ))
print('Using TF binary encoding (one hot encoding): '+ str(metrics.f1_score(newsgroups_test.target, pred5, average= 'macro') ))
print('------Now using, TF vector matrix for modeling-------')
print('Using logistic regression : '+ str(metrics.f1_score(newsgroups_test.target, predictions, average= 'macro') ))
print('Using KNN : '+ str(metrics.f1_score(newsgroups_test.target, predicted, average= 'macro') ))
print('Using SVM : '+ str(metrics.f1_score(newsgroups_test.target, predicts, average= 'macro') ))
print('Using Decision-Tree  : '+ str(metrics.f1_score(newsgroups_test.target, predict, average= 'macro') ))


Original version: 0.7564535630493667
Deleting stop words: 0.7609971497415258
Deleting email, non-words: 0.7609971497415258
Using TF: 0.767115194879272
Using TF binary encoding (one hot encoding): 0.7654156497733925
------Now using, TF vector matrix for modeling-------
Using logistic regression : 0.7105273272621087
Using KNN : 0.3540099646415944
Using SVM : 0.5510312967096778
Using Decision-Tree  : 0.5722461332824256


When using better parameters, or doing some greed search each model may get better f-1 scores
(In this example, I only used the default values for the modeling)
