In [32]:
import pandas as pd


df = pd.read_table('Chineseshairo.txt',  
                   sep='\t', 
                   header=None,
                   names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,yes,Chinese Beijing Chinese
1,yes,Chinese Chinese Shanghai
2,yes,Chinese Macao
3,yes,Tokyo Japan Chinese
4,yes,Taipei Taiwan


In [33]:
df['label'] = df.label.map({'yes': 0, 'no': 1})
df['message'] = df.message.map(lambda x: x.lower())
df['message'] = df.message.str.replace('[^\w\s]', '')
df.head()

Unnamed: 0,label,message
0,0,chinese beijing chinese
1,0,chinese chinese shanghai
2,0,chinese macao
3,0,tokyo japan chinese
4,0,taipei taiwan


In [34]:
# https://www.nltk.org/ Natural Language Toolkit
# Punkt Sentence Tokenizer https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk
nltk.download('punkt')

df['message'] = df['message'].apply(nltk.word_tokenize)
df.head()

[nltk_data] Downloading package punkt to C:\Users\Jairo
[nltk_data]     Castrellón\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,0,"[chinese, beijing, chinese]"
1,0,"[chinese, chinese, shanghai]"
2,0,"[chinese, macao]"
3,0,"[tokyo, japan, chinese]"
4,0,"[taipei, taiwan]"


In [35]:
# https://www.nltk.org/api/nltk.stem.html
#https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x]) 
df.head()

Unnamed: 0,label,message
0,0,"[chines, beij, chines]"
1,0,"[chines, chines, shanghai]"
2,0,"[chines, macao]"
3,0,"[tokyo, japan, chines]"
4,0,"[taipei, taiwan]"


In [36]:
# Converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,label,message
0,0,chines beij chines
1,0,chines chines shanghai
2,0,chines macao
3,0,tokyo japan chines
4,0,taipei taiwan


In [37]:
# Convert a collection of text documents to a matrix of token counts
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# to allow one letter words count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b")
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(token_pattern = r"(?u)\b\w+\b") 
counts = count_vect.fit_transform(df['message'])  
print (counts)

  (0, 0)	1
  (0, 1)	2
  (1, 6)	1
  (1, 1)	2
  (2, 3)	1
  (2, 1)	1
  (3, 2)	1
  (3, 9)	1
  (3, 1)	1
  (4, 8)	1
  (4, 7)	1
  (5, 8)	1
  (5, 3)	1
  (5, 6)	1
  (6, 5)	1
  (6, 2)	1
  (7, 4)	1
  (7, 5)	1
  (7, 8)	1
  (8, 5)	1
  (8, 8)	2


In [38]:
counts.shape

(9, 10)

In [39]:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=29) 

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)  

import numpy as np
predicted = model.predict(X_test)
print(np.mean(predicted == y_test))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predicted))

1.0
[[1]]


In [40]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print ("average perfromance")
print (per/10.0)

1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
average perfromance
1.0


In [42]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.2) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print ("average perfromance")
print (per/10.0)

1.0
[[1 0]
 [0 1]]
0.5
[[1 1]
 [0 0]]
1.0
[[1 0]
 [0 1]]
0.5
[[0 1]
 [0 1]]
1.0
[[1 0]
 [0 1]]
0.5
[[1 1]
 [0 0]]
0.5
[[1 1]
 [0 0]]
0.5
[[1 1]
 [0 0]]
0.5
[[0 0]
 [1 1]]
0.5
[[0 0]
 [1 1]]
average perfromance
0.65


In [43]:
per = 0
for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.5) 
    model = MultinomialNB().fit(X_train, y_train)  

    import numpy as np
    predicted = model.predict(X_test)
    print(np.mean(predicted == y_test))
    per += np.mean(predicted == y_test)

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, predicted))

print ("average perfromance")
print (per/10.0)

0.6
[[3 0]
 [2 0]]
1.0
[[3 0]
 [0 2]]
0.8
[[3 1]
 [0 1]]
0.8
[[3 0]
 [1 1]]
0.6
[[3 0]
 [2 0]]
0.4
[[2 0]
 [3 0]]
1.0
[[3 0]
 [0 2]]
0.8
[[3 1]
 [0 1]]
0.6
[[3 0]
 [2 0]]
1.0
[[3 0]
 [0 2]]
average perfromance
0.76


  self.class_log_prior_ = (np.log(self.class_count_) -
