In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
twenty_train.target_names
len(twenty_train.data)

2257

In [8]:
item_num=1
print(twenty_train.data[item_num])
print(twenty_train.target_names[twenty_train.target[item_num]])


From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences
Lines: 28



	Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv domain, i.e  the mesh is a 
	mapping of a 3d Bezier patch into 2d. The area in this domain
	which is inside a trimming loop had to be rendered. The trimming
	loop is a set of 2d Bezier curve segments.
	For the sake of notation: the mesh is made up of cells.

	My problem is this :
	The trimming area has to be split up into individual smaller
	cells bounded by the trimming curve segments. If a cell
	is wholly inside the area...then it is output as a whole ,
	else it is trivially rejected. 

	Does any body know how thiss can be done, or is there any algo. 
	somewhere for doing this.

	Any help would be appreciated.

	Thanks, 
	Ani.
-- 
To get irritated is human, to stay cool, divi

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [10]:
X_train_counts

<2257x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 365886 stored elements in Compressed Sparse Row format>

In [18]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [19]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [26]:
docs_new = ['God is love', 'OpenGL on the GPU is fast',"symptoms are critical","who did this"]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'symptoms are critical' => sci.med
'who did this' => soc.religion.christian


In [27]:
for t in twenty_train.target[:50]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
soc.religion.christian
comp.graphics
alt.atheism
alt.atheism
comp.graphics
comp.graphics
sci.med
alt.atheism
soc.religion.christian
alt.atheism
soc.religion.christian
alt.atheism
soc.religion.christian
comp.graphics
comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med
soc.religion.christian
sci.med
soc.religion.christian
sci.med
soc.religion.christian
alt.atheism
alt.atheism
alt.atheism
comp.graphics
soc.religion.christian
alt.atheism
comp.graphics
comp.graphics
sci.med
alt.atheism
soc.religion.christian
soc.religion.christian
comp.graphics
sci.med


In [28]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [29]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [30]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.83488681757656458

In [31]:
len(predicted)

1502

In [32]:
predicted[:10]


array([2, 2, 3, 0, 3, 0, 1, 3, 2, 3], dtype=int64)

In [33]:
twenty_test.target[:10]

array([2, 2, 2, 0, 3, 0, 1, 3, 2, 2], dtype=int64)

In [34]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(twenty_train.data, twenty_train.target)  

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target) 

0.9127829560585885

In [37]:
predicted[:10]


array([2, 2, 2, 0, 3, 0, 1, 3, 1, 2], dtype=int64)

In [38]:
twenty_test.target[:10]

array([2, 2, 2, 0, 3, 0, 1, 3, 2, 2], dtype=int64)

In [39]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [40]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]], dtype=int64)

In [41]:
fopen=open("myfile.txt","w")
fopen.write("John Smith")
fopen.close()

In [42]:
fopen=open("myfile.txt")
content=fopen.read()
fopen.close()
content

'John Smith'

In [43]:
fopen=open("myfile.txt","a")
fopen.write("Sarah Smith")
fopen.close()

In [44]:
fopen=open("myfile.txt")
content=fopen.read()
fopen.close()
content

'John SmithSarah Smith'

In [45]:
fopen=open("myfile1.txt","a")
fopen.write("John Smith\n")
fopen.close()

In [46]:
fopen=open("myfile1.txt","a")
fopen.write("Sarah Smith\n")
fopen.close()

In [47]:
fopen=open("myfile1.txt","a")
fopen.write("Noah Smith\n")
fopen.close()

In [49]:
fopen=open("myfile1.txt")
content=fopen.read()
fopen.close()
print(content)

John Smith
Sarah Smith
Noah Smith



In [50]:
fopen=open("myfile1.txt")
list_att=fopen.readlines()
fopen.close()
print(list_att)

['John Smith\n', 'Sarah Smith\n', 'Noah Smith\n']


In [51]:
list_att.append("Kale Smith")

In [57]:
list_att=['John Smith\n', 'Sarah Smith\n', 'Noah Smith\n', 'Kale Smith']

In [67]:
fopen=open("myfile1.txt","w")
#new_content="\n".join([v.strip("\n") for v in list_att])
fopen.write('John Smith\nSarah Smith\nNoah Smith\nKale Smith')
fopen.close()
    

In [69]:
fopen=open("myfile1.txt")
list_att=fopen.readlines()
fopen.close()
print([v.strip("\n") for v in list_att])

['John Smith', 'Sarah Smith', 'Noah Smith', 'Kale Smith']


In [66]:
new_content

'John Smith\n\nSarah Smith\n\nNoah Smith\n\nKale Smith'

In [70]:
fopen=open("myfile1.txt")
list_att=fopen.readlines()
list_att=[v.strip("\n") for v in list_att]
fopen.close()
print(list_att)

['John Smith', 'Sarah Smith', 'Noah Smith', 'Kale Smith']


In [71]:
dir(list_att)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [72]:
list_att

['John Smith', 'Sarah Smith', 'Noah Smith', 'Kale Smith']

In [73]:
list_att.pop(2)

'Noah Smith'

In [74]:
list_att

['John Smith', 'Sarah Smith', 'Kale Smith']

In [75]:
new_file_content="\n".join(list_att)

In [76]:
new_file_content

'John Smith\nSarah Smith\nKale Smith'

In [77]:
fopen=open("myfile.txt","w")
fopen.write(new_file_content)
fopen.close()

In [78]:
fopen=open("myfile1.txt")
new_list_att=fopen.readlines()
new_list_att=[v.strip("\n") for v in list_att]
fopen.close()
print(new_list_att)

['John Smith', 'Sarah Smith', 'Kale Smith']


In [79]:
info=["John Smith","Company","NY","email@email.com"]
info_line="\t".join(info)

In [80]:
info_line

'John Smith\tCompany\tNY\temail@email.com'

In [83]:
def rec_func(input_str):
    if input_str=="": return
    print(input_str)
    new_str=input_str[1:-1]
    rec_func(new_str)
our_str="A man, a plan, a canal, Panama"
rec_func(our_str)
    
    

A man, a plan, a canal, Panama
 man, a plan, a canal, Panam
man, a plan, a canal, Pana
an, a plan, a canal, Pan
n, a plan, a canal, Pa
, a plan, a canal, P
 a plan, a canal, 
a plan, a canal,
 plan, a canal
plan, a cana
lan, a can
an, a ca
n, a c
, a 
 a
