# Preprocessing

#### step taken
- 문장 부호 및 기호 제거
- 불용어 제거
- 다른 단어 다듬은 후 토큰화.
- 개별 리뷰로부터 벡터로 만들기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
dataset = pd.read_csv("../data/PN_dataset/dataset_1600.csv")
dataset = dataset[["review", "PN"]]
dataset.head()

Unnamed: 0,review,PN
0,So people just go nuts for this place and I'm ...,0
1,Always part of our itinerary whenever in KR. t...,1
2,Very disappointed. Went there on a Saturday ni...,0
3,I just had a horrible experience at Genghis Gr...,0
4,Absolutely great place for a good pint and a r...,1


- def clean_string(review) 함수 => 데이터 전처리 & stopwords 사용

In [8]:
def clean_string(review):
    review = re.sub('[^a-zA-Z]',' ', review)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words("english"))]
    review = ' '.join(review)
    return review

In [9]:
corpus = dataset["review"].apply(clean_string)

In [10]:
cv = CountVectorizer(max_features = 1500)

- x 에는 함수를 적용시킨 리뷰를 벡터로 변형해 어레이로 담기
- y 에는 두번째 컬럼의 값들을 전부 담기

In [11]:
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1].values

- train_test_split 사용해 나누기

In [31]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size =0.20, random_state = 0)

In [32]:
X_train.shape, y_train.shape

((1280, 1500), (1280,))

# 값들을 정리해서 보여주는 함수 정의

In [14]:
from sklearn.metrics import confusion_matrix

In [36]:
def describe_performance(model_name, y_train, y_pred):
    cm = confusion_matrix(y_test,y_pred)
    TP = cm[0][0]
    FP = cm[0][1]
    TN = cm[1][1]
    FN = cm[1][0]
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = (2*precision*recall)/(precision+recall)
    print("Summary of ", model_name)
    print("the confusion matrix is :")
    print(cm)
    print("accuracy is " , accuracy)
    print("precision is ", precision)
    print("recall is ", recall)
    print("F1 Score is ", f1)

# Gaussian Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

In [33]:
gaussNB_classifier = GaussianNB()
gaussNB_classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
y_pred = gaussNB_classifier.predict(X_test)

In [37]:
describe_performance("Gaussian Naive Bayes", y_test, y_pred)

Summary of  Gaussian Naive Bayes
the confusion matrix is :
[[ 95  54]
 [ 27 144]]
accuracy is  0.746875
precision is  0.6375838926174496
recall is  0.7786885245901639
F1 Score is  0.7011070110701106


# Decision Tree Classifier

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
DTclassifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
DTclassifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [40]:
y_pred = DTclassifier.predict(X_test)

In [41]:
describe_performance("Decision Tree", y_test, y_pred)

Summary of  Decision Tree
the confusion matrix is :
[[108  41]
 [ 35 136]]
accuracy is  0.7625
precision is  0.7248322147651006
recall is  0.7552447552447552
F1 Score is  0.7397260273972601


# Random Forest Classifier

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
RFclassifier = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
RFclassifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [45]:
y_pred = RFclassifier.predict(X_test)

In [46]:
describe_performance("Random Forest Classifier", y_test, y_pred)

Summary of  Random Forest Classifier
the confusion matrix is :
[[136  13]
 [ 37 134]]
accuracy is  0.84375
precision is  0.912751677852349
recall is  0.7861271676300579
F1 Score is  0.84472049689441
