# Data Mining Project

In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

[nltk_data] Downloading package punkt to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### READING DOCUMENTS

In [2]:
def read_documents(doc_type,path):
    files = path.iterdir()
    document_arr = []

    for index,file in enumerate(files, start=0):
        reader = open(file,"r")
        text = reader.read()
        document_arr.append({'type':doc_type,'text':text})
    return document_arr

real_positive_path = Path('opinion_spam/positive/truthful')
fake_positive_path = Path('opinion_spam/positive/deceptive')
real_negative_path = Path('opinion_spam/negative/truthful')
fake_negative_path = Path('opinion_spam/negative/deceptive')

docs = pd.DataFrame()
docs = docs.append(read_documents('real_negative',real_negative_path))
docs = docs.append(read_documents('fake_negative',fake_negative_path))
len(docs)
docs

Unnamed: 0,type,text
0,real_negative,We stayed 2 nights over spring break in what w...
1,real_negative,Stayed at the Fitzpatrick as a result of all o...
2,real_negative,We booked this hotel as a last minute vacation...
3,real_negative,This hotel is a shambles-furniture literally f...
4,real_negative,The hotel itself was beautiful and wonderful s...
...,...,...
395,fake_negative,My husband and I were planning our 1st year we...
396,fake_negative,I recently stayed at The Talbott Hotel for 3 n...
397,fake_negative,"I'd expect a ""luxury"" hotel to pay more attent..."
398,fake_negative,I selected The Talbott for my recent family va...


### PREPROCESSING FUNCTIONS

In [3]:
def remove_stopwords (text):
    stop_words = set(stopwords.words("english"))
    new_text = ' '.join([x for x in text.split() if x not in stop_words])
    return new_text

In [4]:
def apply_stemming (text):
    stemmer = SnowballStemmer("english")
    new_text = ' '.join([stemmer.stem(x) for x in text.split()])
    return new_text

In [5]:
def remove_by_filters (text,filters=[]):
        new_text = ' '.join([x for x in text.split() if x not in filters])
        return new_text

In [6]:
def preprocess_text (text):
    # Lower text
    new_text = text.lower()
    
    # Remove numbers
    new_text = re.sub(r'\d+', '', new_text) 
    
    # Remove punctuations
    translator = str.maketrans('', '', string.punctuation)
    new_text = new_text.translate(translator)
    
    # Remove white spaces
    new_text = " ".join(new_text.split())
    
    # Remove Stopwords
    new_text = remove_stopwords(new_text)
    
    # Stemming
    new_text = apply_stemming(new_text)
    
    # Apply filter with words room,hotel,stay
    new_text = remove_by_filters(new_text,['room','hotel','stay'])
    return new_text

In [7]:
docs['cleaned'] = docs['text'].apply(lambda text: preprocess_text(text))

In [8]:
docs.cleaned

0      night spring break call jr suit resembl wide h...
1      fitzpatrick result glow review tripadvisor avo...
2      book last minut vacat plan littl shop nightlif...
3      shamblesfurnitur liter fall apart staff rude u...
4      beauti wonder staff bottom line imposs sleep n...
                             ...                        
395    husband plan st year wed anniversari want go b...
396    recent talbott night could disappoint terribl ...
397    id expect luxuri pay attent detail realli hadn...
398    select talbott recent famili vacat chicago cho...
399    talbott claim chicago premier small luxuri exp...
Name: cleaned, Length: 800, dtype: object

## Classification for All CLASSES
### Naivebayes Unigram

In [9]:
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
doc_vectors=tfidf_vectorizer.fit_transform(docs.cleaned)
sample_tfidf_vector = doc_vectors[40]
df = pd.DataFrame(sample_tfidf_vector.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
blanket,0.308833
extra,0.224928
beer,0.211808
fill,0.200712
manag,0.185219
...,...
food,0.000000
fond,0.000000
follow,0.000000
folk,0.000000


In [10]:
text_classifier = MultinomialNB().fit(doc_vectors, docs.type)

### We can do the same operations with pipeline

In [11]:
def build_model(pipeline,k):
    # Divide the data; test&training
    #X_train, X_test, y_train, y_test = train_test_split(docs.cleaned,docs.type,test_size=0.2,random_state=0)
    # Instead of use %20 of data as a test, I will use K-Cross Validation
    kfold = KFold(n_splits=k,shuffle=True,random_state=0)
    k_cross_splits = kfold.split(docs.cleaned,docs.type)
    accuracies = []
    
    for train_index, test_index in k_cross_splits:
        X = docs['cleaned']
        X = np.array(X)
        y = docs['type']
        y = np.array(y)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train,y_train)
        predicted = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test,predicted)
        accuracies.append(accuracy)
    
        print("Accuracy Score:",accuracy)
        print("Classification Report\n",classification_report(y_test,predicted))
        print("Confusion Matrix\n",confusion_matrix(y_test,predicted))
        print("---------------------------------------------------------------------")

    print("Average accuracy of 5-cross validation:",np.mean(accuracies))

#### Model with Unigram NaiveBayes

In [12]:
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()), # Transform TF-IDF representation
    ('clf', MultinomialNB()),  # Give TF-IDF representation to Naive Bayes
])
build_model(text_classifier,5)

Accuracy Score: 0.825
Classification Report
                precision    recall  f1-score   support

fake_negative       0.78      0.95      0.86        88
real_negative       0.92      0.67      0.77        72

     accuracy                           0.82       160
    macro avg       0.85      0.81      0.82       160
 weighted avg       0.84      0.82      0.82       160

Confusion Matrix
 [[84  4]
 [24 48]]
---------------------------------------------------------------------
Accuracy Score: 0.78125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.68      0.96      0.80        72
real_negative       0.95      0.64      0.76        88

     accuracy                           0.78       160
    macro avg       0.82      0.80      0.78       160
 weighted avg       0.83      0.78      0.78       160

Confusion Matrix
 [[69  3]
 [32 56]]
---------------------------------------------------------------------
Accuracy Score: 0.85
Classif

#### Model with Naivebayes | Unigram-Bigram

In [13]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', MultinomialNB()),  # Give TF-IDF representation to Naive Bayes
])

# Build the model
build_model(text_classifier,5)

Accuracy Score: 0.85625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.82      0.94      0.88        88
real_negative       0.92      0.75      0.82        72

     accuracy                           0.86       160
    macro avg       0.87      0.85      0.85       160
 weighted avg       0.86      0.86      0.85       160

Confusion Matrix
 [[83  5]
 [18 54]]
---------------------------------------------------------------------
Accuracy Score: 0.7875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.68      0.99      0.81        72
real_negative       0.98      0.62      0.76        88

     accuracy                           0.79       160
    macro avg       0.83      0.81      0.79       160
 weighted avg       0.85      0.79      0.78       160

Confusion Matrix
 [[71  1]
 [33 55]]
---------------------------------------------------------------------
Accuracy Score: 0.86875
Cla

#### Model with Naivebayes | Unigram-Bigram-Trigram

In [14]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', MultinomialNB()),  # Give TF-IDF representation to Naive Bayes
])

build_model(text_classifier,5)

Accuracy Score: 0.875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.85      0.93      0.89        88
real_negative       0.91      0.81      0.85        72

     accuracy                           0.88       160
    macro avg       0.88      0.87      0.87       160
 weighted avg       0.88      0.88      0.87       160

Confusion Matrix
 [[82  6]
 [14 58]]
---------------------------------------------------------------------
Accuracy Score: 0.8
Classification Report
                precision    recall  f1-score   support

fake_negative       0.70      0.99      0.82        72
real_negative       0.98      0.65      0.78        88

     accuracy                           0.80       160
    macro avg       0.84      0.82      0.80       160
 weighted avg       0.85      0.80      0.80       160

Confusion Matrix
 [[71  1]
 [31 57]]
---------------------------------------------------------------------
Accuracy Score: 0.8625
Classific

#### Model with Decision Tree with criteria Entropy | Unigram

In [15]:
decision_tree_classifier = DecisionTreeClassifier(criterion="entropy")
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()), # Tokenize and Transform TF-IDF representation
    ('clf', decision_tree_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.66875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.72      0.66      0.69        88
real_negative       0.62      0.68      0.65        72

     accuracy                           0.67       160
    macro avg       0.67      0.67      0.67       160
 weighted avg       0.67      0.67      0.67       160

Confusion Matrix
 [[58 30]
 [23 49]]
---------------------------------------------------------------------
Accuracy Score: 0.6875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.67      0.60      0.63        72
real_negative       0.70      0.76      0.73        88

     accuracy                           0.69       160
    macro avg       0.68      0.68      0.68       160
 weighted avg       0.69      0.69      0.69       160

Confusion Matrix
 [[43 29]
 [21 67]]
---------------------------------------------------------------------
Accuracy Score: 0.74375
Cla

#### Model with Decision Tree with criteria Gini | Unigram

In [16]:
tfidf_vectorizer = TfidfVectorizer()
decision_tree_classifier = DecisionTreeClassifier(criterion="gini")
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()), # Tokenize and Transform TF-IDF representation
    ('clf', decision_tree_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.6375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.69      0.62      0.65        88
real_negative       0.59      0.65      0.62        72

     accuracy                           0.64       160
    macro avg       0.64      0.64      0.64       160
 weighted avg       0.64      0.64      0.64       160

Confusion Matrix
 [[55 33]
 [25 47]]
---------------------------------------------------------------------
Accuracy Score: 0.7
Classification Report
                precision    recall  f1-score   support

fake_negative       0.67      0.65      0.66        72
real_negative       0.72      0.74      0.73        88

     accuracy                           0.70       160
    macro avg       0.70      0.70      0.70       160
 weighted avg       0.70      0.70      0.70       160

Confusion Matrix
 [[47 25]
 [23 65]]
---------------------------------------------------------------------
Accuracy Score: 0.68125
Classif

#### Model with K-NN | Unigram

In [17]:
tfidf_vectorizer = TfidfVectorizer()
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', neigh_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.75
Classification Report
                precision    recall  f1-score   support

fake_negative       0.74      0.84      0.79        88
real_negative       0.77      0.64      0.70        72

     accuracy                           0.75       160
    macro avg       0.75      0.74      0.74       160
 weighted avg       0.75      0.75      0.75       160

Confusion Matrix
 [[74 14]
 [26 46]]
---------------------------------------------------------------------
Accuracy Score: 0.73125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.67      0.79      0.73        72
real_negative       0.80      0.68      0.74        88

     accuracy                           0.73       160
    macro avg       0.74      0.74      0.73       160
 weighted avg       0.74      0.73      0.73       160

Confusion Matrix
 [[57 15]
 [28 60]]
---------------------------------------------------------------------
Accuracy Score: 0.70625
Class

#### Model with K-NN | Unigram-Bigram

In [18]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', neigh_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.80625
Classification Report
                precision    recall  f1-score   support

fake_negative       0.79      0.88      0.83        88
real_negative       0.83      0.72      0.77        72

     accuracy                           0.81       160
    macro avg       0.81      0.80      0.80       160
 weighted avg       0.81      0.81      0.80       160

Confusion Matrix
 [[77 11]
 [20 52]]
---------------------------------------------------------------------
Accuracy Score: 0.79375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.73      0.86      0.79        72
real_negative       0.87      0.74      0.80        88

     accuracy                           0.79       160
    macro avg       0.80      0.80      0.79       160
 weighted avg       0.80      0.79      0.79       160

Confusion Matrix
 [[62 10]
 [23 65]]
---------------------------------------------------------------------
Accuracy Score: 0.73125
Cl

#### Model with SVM | Unigram

In [19]:
tfidf_vectorizer = TfidfVectorizer()
svm_classifier = svm.SVC()
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', svm_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.84375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.88      0.83      0.85        88
real_negative       0.81      0.86      0.83        72

     accuracy                           0.84       160
    macro avg       0.84      0.85      0.84       160
 weighted avg       0.85      0.84      0.84       160

Confusion Matrix
 [[73 15]
 [10 62]]
---------------------------------------------------------------------
Accuracy Score: 0.9125
Classification Report
                precision    recall  f1-score   support

fake_negative       0.94      0.86      0.90        72
real_negative       0.89      0.95      0.92        88

     accuracy                           0.91       160
    macro avg       0.92      0.91      0.91       160
 weighted avg       0.91      0.91      0.91       160

Confusion Matrix
 [[62 10]
 [ 4 84]]
---------------------------------------------------------------------
Accuracy Score: 0.8875
Clas

#### Model with SVM | Unigram-Bigram

In [20]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
svm_classifier = svm.SVC()
text_classifier = Pipeline([
    ('tfidf', tfidf_vectorizer), # Tokenize and Transform TF-IDF representation
    ('clf', svm_classifier),  # Give TF-IDF representation to Decision Tree
])

build_model(text_classifier,5)

Accuracy Score: 0.84375
Classification Report
                precision    recall  f1-score   support

fake_negative       0.84      0.89      0.86        88
real_negative       0.85      0.79      0.82        72

     accuracy                           0.84       160
    macro avg       0.84      0.84      0.84       160
 weighted avg       0.84      0.84      0.84       160

Confusion Matrix
 [[78 10]
 [15 57]]
---------------------------------------------------------------------
Accuracy Score: 0.8875
Classification Report
                precision    recall  f1-score   support

fake_negative       0.84      0.93      0.88        72
real_negative       0.94      0.85      0.89        88

     accuracy                           0.89       160
    macro avg       0.89      0.89      0.89       160
 weighted avg       0.89      0.89      0.89       160

Confusion Matrix
 [[67  5]
 [13 75]]
---------------------------------------------------------------------
Accuracy Score: 0.875
Class