# Data Mining Project

In [540]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

[nltk_data] Downloading package punkt to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### READING DOCUMENTS

In [455]:
def read_documents(doc_type,path):
    """Load every regular file directly inside ``path`` as one labelled document.

    Parameters
    ----------
    doc_type : str
        Label stored under the ``'type'`` key of every returned record.
    path : pathlib.Path
        Directory whose files are read (non-recursive).

    Returns
    -------
    list of dict
        One ``{'type': doc_type, 'text': <file contents>}`` record per file,
        ordered by file path.
    """
    document_arr = []

    # iterdir() yields entries in arbitrary, OS-dependent order; sorting keeps
    # the row order (and therefore any seeded shuffling built on it) stable.
    for file in sorted(path.iterdir()):
        # Sub-directories cannot be opened as text files, so skip them.
        if not file.is_file():
            continue
        # The context manager closes the handle even if read() raises.
        with open(file,"r") as reader:
            document_arr.append({'type':doc_type,'text':reader.read()})
    return document_arr

real_positive_path = Path('opinion_spam/positive/truthful')
fake_positive_path = Path('opinion_spam/positive/deceptive')
real_negative_path = Path('opinion_spam/negative/truthful')
fake_negative_path = Path('opinion_spam/negative/deceptive')

# Collect the records of all four classes first and build the frame once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; it also
# kept each folder's own 0..N-1 labels, leaving duplicate index values.
# Building from a single list yields a unique 0..len-1 index.
records = []
records.extend(read_documents('real_positive',real_positive_path))
records.extend(read_documents('fake_positive',fake_positive_path))
records.extend(read_documents('real_negative',real_negative_path))
records.extend(read_documents('fake_negative',fake_negative_path))

docs = pd.DataFrame(records)
docs

Unnamed: 0,type,text
0,real_positive,I was completely blown away by this hotel. It ...
1,real_positive,We've just returned from a two night stay at t...
2,real_positive,"Excellent location, feels like a boutique hote..."
3,real_positive,"I travel a lot for business and quite frankly,..."
4,real_positive,We visited for my 40th birthday. We had never ...
...,...,...
395,fake_negative,My husband and I were planning our 1st year we...
396,fake_negative,I recently stayed at The Talbott Hotel for 3 n...
397,fake_negative,"I'd expect a ""luxury"" hotel to pay more attent..."
398,fake_negative,I selected The Talbott for my recent family va...


### PREPROCESSING FUNCTIONS

In [456]:
def remove_stopwords (text, stop_words=None):
    """Drop stop words from a whitespace-separated text.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used,
        which is the behaviour this function always had.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. Matching is exact and
        case-sensitive, so callers should lower-case ``text`` beforehand.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests for every token.
    stop_words = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [457]:
def apply_stemming (text):
    """Return ``text`` with every whitespace-separated token stemmed.

    Tokens are stemmed with NLTK's English Snowball stemmer and re-joined
    with single spaces.
    """
    snowball = SnowballStemmer("english")
    stemmed_tokens = map(snowball.stem, text.split())
    return ' '.join(stemmed_tokens)

In [458]:
def remove_by_filters (text,filters=None):
    """Drop every token of ``text`` that appears in ``filters``.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    filters : iterable of str, optional
        Words to remove (exact, case-sensitive match). ``None`` or an empty
        iterable removes nothing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # ``None`` replaces the former mutable ``[]`` default; a frozenset makes
    # each membership test constant-time.
    blocked = frozenset(filters) if filters is not None else frozenset()
    return ' '.join(word for word in text.split() if word not in blocked)

In [459]:
def preprocess_text (text, filters=('room','hotel','stay')):
    """Normalise one raw review into a cleaned, space-separated token string.

    Steps, in order: lower-case, delete digit runs, delete ASCII punctuation,
    collapse whitespace, remove stop words, stem, and finally drop the
    domain words listed in ``filters``.

    Parameters
    ----------
    text : str
        Raw document text.
    filters : iterable of str, optional
        Domain words removed after stemming. They are compared against
        stemmed tokens, so they must be given in stemmed form. Defaults to
        the words that were previously hard-coded.

    Returns
    -------
    str
        The cleaned text.
    """
    # Lower-case so stop-word and filter matching is case-insensitive
    new_text = text.lower()

    # Remove numbers
    new_text = re.sub(r'\d+', '', new_text)

    # Remove punctuation (characters are deleted, not replaced by a space)
    # NOTE(review): this runs before stop-word removal, so contractions lose
    # their apostrophe first ("we've" -> "weve"); tokens such as "weve" are
    # visible in the cleaned output. Confirm this is intended.
    translator = str.maketrans('', '', string.punctuation)
    new_text = new_text.translate(translator)

    # Collapse runs of whitespace into single spaces
    new_text = " ".join(new_text.split())

    # Remove stop words
    new_text = remove_stopwords(new_text)

    # Stemming
    new_text = apply_stemming(new_text)

    # Drop the (stemmed) domain words
    new_text = remove_by_filters(new_text,list(filters))
    return new_text

In [460]:
# Add the preprocessed version of every document as a new column
docs['cleaned'] = docs['text'].apply(preprocess_text)

In [461]:
# Preview the cleaned documents
docs['cleaned']

0      complet blown away magnific got great deal hap...
1      weve return two night affinia chicago visit de...
2      excel locat feel like boutiqu right next neima...
3      travel lot busi quit frank expect start reach ...
4      visit th birthday never chicago sure found dea...
                             ...                        
395    husband plan st year wed anniversari want go b...
396    recent talbott night could disappoint terribl ...
397    id expect luxuri pay attent detail realli hadn...
398    select talbott recent famili vacat chicago cho...
399    talbott claim chicago premier small luxuri exp...
Name: cleaned, Length: 1600, dtype: object

## Classification for All CLASSES
### Naive Bayes | Unigram

In [487]:
# Fit TF-IDF on the whole cleaned corpus and inspect the weights of one document
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
doc_vectors=tfidf_vectorizer.fit_transform(docs.cleaned)
sample_tfidf_vector = doc_vectors[40]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# toarray() returns a plain ndarray (todense() returns the discouraged np.matrix)
df = pd.DataFrame(sample_tfidf_vector.T.toarray(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
street,0.273838
state,0.213356
corner,0.205221
unmatch,0.181897
revolv,0.181897
...,...
folio,0.000000
foldout,0.000000
fold,0.000000
focus,0.000000


In [488]:
# Fit multinomial Naive Bayes on the TF-IDF matrix of the full corpus
text_classifier = MultinomialNB().fit(X=doc_vectors, y=docs['type'])

### The same steps can be combined into a scikit-learn Pipeline

In [524]:
def build_model(pipeline,k,X=None,y=None):
    """Evaluate ``pipeline`` with shuffled k-fold cross-validation and print the results.

    For every fold the pipeline is re-fitted on the training part and scored
    on the held-out part; the accuracy, classification report and confusion
    matrix of each fold are printed, followed by the mean accuracy.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    k : int
        Number of folds.
    X : array-like of str, optional
        Documents to classify. Defaults to ``docs['cleaned']``.
    y : array-like, optional
        Class labels aligned with ``X``. Defaults to ``docs['type']``.

    Returns
    -------
    None
    """
    # Fall back to the notebook-level corpus when no data is passed in
    if X is None:
        X = docs['cleaned']
    if y is None:
        y = docs['type']

    # Convert once, outside the loop: the fold index arrays are positional,
    # so plain arrays are indexed rather than the pandas objects.
    X = np.array(X)
    y = np.array(y)

    # K-fold cross-validation instead of a single train/test split
    # NOTE(review): the printed per-class supports differ between folds, i.e.
    # the folds are not stratified; consider StratifiedKFold if that matters.
    kfold = KFold(n_splits=k,shuffle=True,random_state=0)
    accuracies = []

    for train_index, test_index in kfold.split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train,y_train)
        predicted = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test,predicted)
        accuracies.append(accuracy)

        print("Accuracy Score:",accuracy)
        print("Classification Report\n",classification_report(y_test,predicted))
        print("Confusion Matrix\n",confusion_matrix(y_test,predicted))
        print("---------------------------------------------------------------------")

    # Report the actual number of folds (the message used to hard-code 5)
    print("Average accuracy of {}-cross validation:".format(k),np.mean(accuracies))

#### Model with Naive Bayes | Unigram

In [518]:
# Unigram TF-IDF features feeding a multinomial Naive Bayes classifier
text_classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),  # cleaned text -> TF-IDF matrix
    ('clf', MultinomialNB()),      # classifier trained on that matrix
])
build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.809375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.77      0.93      0.84        82
fake_positive       0.82      0.87      0.84        83
real_negative       0.88      0.65      0.75        75
real_positive       0.81      0.78      0.79        80

     accuracy                           0.81       320
    macro avg       0.82      0.81      0.80       320
 weighted avg       0.82      0.81      0.81       320

Confusion Matrix
 [[76  2  3  1]
 [ 2 72  0  9]
 [20  1 49  5]
 [ 1 13  4 62]]
---------------------------------------------------------------------
Accuracy Score: 0.803125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.69      0.99      0.81        73
fake_positive       0.93      0.82      0.87        94
real_negative       0.92      0.57      0.70        83
real_positive       0.74      0.87      0.80        70

     accuracy                      

#### Model with Naive Bayes | Unigram-Bigram

In [519]:
# TF-IDF over unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', MultinomialNB()),     # classifier trained on that matrix
])

# Cross-validate the pipeline
build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.821875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.79      0.91      0.85        82
fake_positive       0.82      0.90      0.86        83
real_negative       0.88      0.68      0.77        75
real_positive       0.83      0.78      0.80        80

     accuracy                           0.82       320
    macro avg       0.83      0.82      0.82       320
 weighted avg       0.83      0.82      0.82       320

Confusion Matrix
 [[75  2  4  1]
 [ 2 75  0  6]
 [17  1 51  6]
 [ 1 14  3 62]]
---------------------------------------------------------------------
Accuracy Score: 0.778125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.66      0.99      0.79        73
fake_positive       0.95      0.77      0.85        94
real_negative       0.90      0.52      0.66        83
real_positive       0.71      0.89      0.79        70

     accuracy                      

#### Model with Naive Bayes | Unigram-Bigram-Trigram

In [520]:
# TF-IDF over unigrams, bigrams and trigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', MultinomialNB()),     # classifier trained on that matrix
])

# Cross-validate the pipeline
build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.815625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.80      0.91      0.85        82
fake_positive       0.80      0.88      0.84        83
real_negative       0.87      0.71      0.78        75
real_positive       0.81      0.75      0.78        80

     accuracy                           0.82       320
    macro avg       0.82      0.81      0.81       320
 weighted avg       0.82      0.82      0.81       320

Confusion Matrix
 [[75  2  4  1]
 [ 3 73  0  7]
 [15  1 53  6]
 [ 1 15  4 60]]
---------------------------------------------------------------------
Accuracy Score: 0.7625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.65      0.97      0.78        73
fake_positive       0.93      0.74      0.83        94
real_negative       0.88      0.52      0.65        83
real_positive       0.70      0.86      0.77        70

     accuracy                        

#### Model with Decision Tree (entropy criterion) | Unigram

In [521]:
# Fix random_state: without it the tree is built non-deterministically, so the
# reported scores change from run to run. 0 matches the seed of the
# cross-validation splitter.
decision_tree_classifier = DecisionTreeClassifier(criterion="entropy",random_state=0)
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()), # Tokenize and Transform TF-IDF representation
    ('clf', decision_tree_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.56875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.60      0.63      0.62        82
fake_positive       0.63      0.73      0.68        83
real_negative       0.52      0.40      0.45        75
real_positive       0.50      0.49      0.49        80

     accuracy                           0.57       320
    macro avg       0.56      0.56      0.56       320
 weighted avg       0.56      0.57      0.56       320

Confusion Matrix
 [[52 10 16  4]
 [ 6 61  2 14]
 [21  3 30 21]
 [ 8 23 10 39]]
---------------------------------------------------------------------
Accuracy Score: 0.50625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.55      0.63      0.59        73
fake_positive       0.61      0.48      0.54        94
real_negative       0.51      0.48      0.49        83
real_positive       0.37      0.44      0.40        70

     accuracy                        

#### Model with Decision Tree (Gini criterion) | Unigram

In [528]:
tfidf_vectorizer = TfidfVectorizer()
# Fix random_state: without it the tree is built non-deterministically, so the
# reported scores change from run to run. 0 matches the seed of the
# cross-validation splitter.
decision_tree_classifier = DecisionTreeClassifier(criterion="gini",random_state=0)
text_classifier = Pipeline([
    # Use the vectorizer created above (it was previously assigned but never used)
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', decision_tree_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.4625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.54      0.44      0.48        82
fake_positive       0.51      0.58      0.54        83
real_negative       0.39      0.33      0.36        75
real_positive       0.41      0.49      0.45        80

     accuracy                           0.46       320
    macro avg       0.46      0.46      0.46       320
 weighted avg       0.46      0.46      0.46       320

Confusion Matrix
 [[36 12 24 10]
 [ 5 48  9 21]
 [17  8 25 25]
 [ 9 26  6 39]]
---------------------------------------------------------------------
Accuracy Score: 0.48125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.46      0.52      0.49        73
fake_positive       0.60      0.53      0.56        94
real_negative       0.50      0.48      0.49        83
real_positive       0.35      0.37      0.36        70

     accuracy                         

#### Model with K-NN | Unigram

In [533]:
# Unigram TF-IDF features feeding a 3-nearest-neighbours classifier
tfidf_vectorizer = TfidfVectorizer()
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', neigh_classifier),    # K-NN classifier on that matrix
])

build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.60625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.58      0.77      0.66        82
fake_positive       0.55      0.67      0.61        83
real_negative       0.70      0.37      0.49        75
real_positive       0.67      0.59      0.63        80

     accuracy                           0.61       320
    macro avg       0.63      0.60      0.60       320
 weighted avg       0.62      0.61      0.60       320

Confusion Matrix
 [[63  8 10  1]
 [15 56  0 12]
 [25 12 28 10]
 [ 6 25  2 47]]
---------------------------------------------------------------------
Accuracy Score: 0.64375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.50      0.68      0.57        73
fake_positive       0.68      0.82      0.74        94
real_negative       0.78      0.48      0.60        83
real_positive       0.71      0.56      0.62        70

     accuracy                        

#### Model with K-NN | Unigram-Bigram

In [538]:
# Unigram + bigram TF-IDF features feeding a 3-nearest-neighbours classifier
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', neigh_classifier),    # K-NN classifier on that matrix
])

build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.659375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.68      0.72      0.70        82
fake_positive       0.56      0.75      0.64        83
real_negative       0.75      0.53      0.62        75
real_positive       0.71      0.62      0.67        80

     accuracy                           0.66       320
    macro avg       0.68      0.66      0.66       320
 weighted avg       0.68      0.66      0.66       320

Confusion Matrix
 [[59 10 12  1]
 [11 62  0 10]
 [16 10 40  9]
 [ 1 28  1 50]]
---------------------------------------------------------------------
Accuracy Score: 0.578125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.48      0.63      0.54        73
fake_positive       0.58      0.69      0.63        94
real_negative       0.78      0.48      0.60        83
real_positive       0.56      0.49      0.52        70

     accuracy                      

#### Model with SVM | Unigram

In [542]:
# Unigram TF-IDF features feeding a support vector classifier (default settings)
tfidf_vectorizer = TfidfVectorizer()
svm_classifier = svm.SVC()
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', svm_classifier),      # SVM classifier on that matrix
])

build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.81875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.93      0.78      0.85        82
fake_positive       0.83      0.88      0.85        83
real_negative       0.71      0.87      0.78        75
real_positive       0.83      0.75      0.79        80

     accuracy                           0.82       320
    macro avg       0.83      0.82      0.82       320
 weighted avg       0.83      0.82      0.82       320

Confusion Matrix
 [[64  2 15  1]
 [ 0 73  3  7]
 [ 5  1 65  4]
 [ 0 12  8 60]]
---------------------------------------------------------------------
Accuracy Score: 0.853125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.87      0.90      0.89        73
fake_positive       0.95      0.81      0.87        94
real_negative       0.83      0.86      0.84        83
real_positive       0.77      0.86      0.81        70

     accuracy                       

#### Model with SVM | Unigram-Bigram

In [543]:
# Unigram + bigram TF-IDF features feeding a support vector classifier (default settings)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
svm_classifier = svm.SVC()
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # cleaned text -> TF-IDF matrix
    ('clf', svm_classifier),      # SVM classifier on that matrix
])

build_model(pipeline=text_classifier, k=5)

Accuracy Score: 0.809375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.89      0.85      0.87        82
fake_positive       0.77      0.89      0.83        83
real_negative       0.80      0.76      0.78        75
real_positive       0.78      0.72      0.75        80

     accuracy                           0.81       320
    macro avg       0.81      0.81      0.81       320
 weighted avg       0.81      0.81      0.81       320

Confusion Matrix
 [[70  4  7  1]
 [ 0 74  2  7]
 [ 9  1 57  8]
 [ 0 17  5 58]]
---------------------------------------------------------------------
Accuracy Score: 0.81875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.80      0.92      0.85        73
fake_positive       0.89      0.78      0.83        94
real_negative       0.85      0.76      0.80        83
real_positive       0.74      0.84      0.79        70

     accuracy                       