# Text Mining with Scikit-learn

* Create your own sample data
    * Vectorization
        * Term Occurrence, also called Bag-of-Words
    
* Sentiment analysis on SMS data

## Create your own sample data

In [None]:
import numpy as np

# Five toy sentences that serve as the training corpus.
X_train = [
    "my name is apple who love eating an orange",
    "orange is my favorite fruit",
    "our orange tree is planted on the yard",
    "farmer grows an apple tree",
    "farmer keep apple seeds in a barn",
]

# One label per sentence, in the same order: 1 = agricultural, 0 = not.
y_train = np.array([0, 0, 1, 1, 1])

### Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the toy corpus; fit() returns the vectorizer
# itself, so it can be chained onto the constructor.
vectorizer = CountVectorizer().fit(X_train)

# Dense document-term count matrix: one row per sentence, one column per
# vocabulary entry.
document_term_counts = vectorizer.transform(X_train)
X_vector = document_term_counts.toarray()
print(X_vector)

In [None]:
# Column order of X_vector. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
print(vectorizer.get_feature_names_out())

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state is pinned because scikit-learn breaks ties between equally
# good splits at random; on a data set this small such ties are likely, so
# an unseeded tree can change between runs.
# (Pass criterion="entropy" to split on information gain instead of Gini.)
model = DecisionTreeClassifier(random_state=0)
model.fit(X_vector, y_train)

In [None]:
import os

from io import StringIO
import pydotplus
from IPython.display import Image
from sklearn.tree import export_graphviz

In [None]:
# Label the tree nodes with the vocabulary words. get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() returns an ndarray,
# so convert it to a plain list of str before handing it to export_graphviz.
feature_names = vectorizer.get_feature_names_out().tolist()
class_names = ['0','1']

# export_graphviz writes the DOT source into this in-memory text buffer.
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
                feature_names=feature_names,
                class_names=class_names,
                filled=True, rounded=True,
                special_characters=True)

# Turn the DOT source into a PNG and display it as the cell output.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png(), retina=True)

## Sentiment analysis on SMS data

In [None]:
# Input files for the SMS survey; paths are relative to the notebook's
# working directory.

# Labels used for sentiment analysis.
target_sentiment_filename = 'datasets/sms_survey/sms_sentiment.txt'
# Alternative labels, used for categorization instead of sentiment:
#target_class_filename = 'datasets/sms_survey/sms_class.txt'

# The SMS messages themselves.
text_filename = 'datasets/sms_survey/sms.txt'

In [None]:
# Read the whole SMS file as UTF-8 and keep one list entry per line.
with open(text_filename, encoding="utf8") as sms_file:
    sms_text = sms_file.read()
X_train = sms_text.splitlines()

# Peek at the first three messages.
print(X_train[:3])

In [None]:
# Read one sentiment label per line. The encoding is given explicitly, as it
# is for the SMS text file, so decoding does not depend on the platform's
# default locale encoding.
with open(target_sentiment_filename, encoding="utf8") as f:
    y_train = f.read().splitlines()

# Peek at the first three labels.
print(y_train[:3])

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Fixed vocabulary: only these Thai keywords are used as feature columns.
vocabulary=['ล่าช้า', 'บริการ', 'พนักงาน', 'ดี', 'เวลา', 'คิว', 'เคาน์เตอร์', 
            'เปิด', 'ช่อง' , 'ลูกค้า', 'หน้าตา', 'ยิ้มแย้ม']


def tok(x):
    """Split one pre-segmented message into its tokens.

    The sample messages in this notebook look like ``'[a|b|c|]'``: tokens
    separated by ``|`` and wrapped in square brackets. A bare
    ``x.split('|')`` would return ``'[a'`` as the first token and ``']'`` as
    the last one, so the first word of a message could never match the
    vocabulary. The enclosing brackets are therefore removed first and empty
    tokens are dropped. Messages without brackets are split on ``|`` as
    before.

    Parameters
    ----------
    x : str
        One message with ``|``-separated tokens.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    return [token for token in x.strip().strip('[]').split('|') if token]
# token_pattern=None: the custom tokenizer replaces the default regex, and
# leaving the default pattern in place makes scikit-learn warn that it is
# unused.
vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=tok,
                             token_pattern=None)

vectorizer.fit(X_train)

# Dense count matrix: one row per SMS, one column per vocabulary keyword.
X_vector = vectorizer.transform(X_train).toarray()

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Shallow entropy-based tree so the rendered diagram stays readable.
# random_state is pinned because scikit-learn breaks ties between equally
# good splits at random; without it the tree can change between runs.
model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)
model.fit(X_vector, y_train)

In [None]:
# Column order of X_vector. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
print(vectorizer.get_feature_names_out())

In [None]:
# English glosses for the Thai keywords, listed in the same order as
# `vocabulary`, so the rendered tree is readable without Thai fonts.
# NOTE(review): 'boost' is the gloss given for 'ช่อง' — confirm the intended
# translation.
feature_names = [
    'late', 'service', 'staff', 'good', 'time', 'q',
    'counter', 'open', 'boost', 'customer', 'face', 'smile',
]

# NOTE(review): export_graphviz expects class names in ascending order of
# the target labels — confirm that this order matches model.classes_.
class_names = ['positive', 'negative']

# Collect the DOT description of the fitted tree in an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    model,
    out_file=dot_data,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

In [None]:
# Parse the DOT source collected in dot_data, then display the tree as a PNG.
dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)
Image(graph.create_png(), retina=True)

In [None]:
# Score one hand-written message: vectorize it with the fitted vectorizer,
# then ask the tree for its label.
test = ['[บริการ|ดี|ยิ้มแย้ม|]']
test_vec = vectorizer.transform(test).toarray()
predicted = model.predict(test_vec)

# Show the keyword counts first, then the predicted label.
for shown in (test_vec, predicted):
    print(shown)

In [None]:
# Score a second hand-written message; this one also contains tokens
# ('จริง', 'ๆ') that are not in the fixed vocabulary.
test = ['[บริการ|ล่าช้า|ดี|จริง|ๆ|]']
test_vec = vectorizer.transform(test).toarray()
predicted = model.predict(test_vec)

# Show the keyword counts first, then the predicted label.
for shown in (test_vec, predicted):
    print(shown)