# Data Mining Project

In [1]:
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

[nltk_data] Downloading package punkt to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kaan-
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### READING DOCUMENTS

In [2]:
def read_documents(doc_type, path, encoding=None):
    """Read every entry of a directory into a list of labelled documents.

    Parameters
    ----------
    doc_type : str
        Label stored under the ``'type'`` key of every returned record.
    path : pathlib.Path
        Directory whose entries are read (via ``path.iterdir()``).
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default, matching the previous behaviour.

    Returns
    -------
    list of dict
        One ``{'type': doc_type, 'text': <file contents>}`` record per entry,
        in the order produced by ``path.iterdir()``.
    """
    document_arr = []

    for file in path.iterdir():
        # The context manager closes each handle right away; the previous
        # version left every opened file to the garbage collector.
        with open(file, "r", encoding=encoding) as reader:
            text = reader.read()
        document_arr.append({'type':doc_type,'text':text})
    return document_arr

real_positive_path = Path('opinion_spam/positive/truthful')
fake_positive_path = Path('opinion_spam/positive/deceptive')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the two
# labelled corpora are combined with pd.concat instead. ignore_index is left
# off on purpose: each corpus keeps its own 0..n-1 index, exactly as the
# append-based version produced.
docs = pd.concat([
    pd.DataFrame(read_documents('real_positive',real_positive_path)),
    pd.DataFrame(read_documents('fake_positive',fake_positive_path)),
])
docs

Unnamed: 0,type,text
0,real_positive,I was completely blown away by this hotel. It ...
1,real_positive,We've just returned from a two night stay at t...
2,real_positive,"Excellent location, feels like a boutique hote..."
3,real_positive,"I travel a lot for business and quite frankly,..."
4,real_positive,We visited for my 40th birthday. We had never ...
...,...,...
395,fake_positive,The rates at The Talbott Hotel were cheaper th...
396,fake_positive,I enjoyed my stay at the Talbott Hotel. It is ...
397,fake_positive,Pleasant staff and housekeeping. Above average...
398,fake_positive,My stay at this hotel was one of the best I ha...


### PREPROCESSING FUNCTIONS

In [3]:
def remove_stopwords(text):
    """Return `text` with NLTK's English stop words removed.

    The text is split on whitespace; tokens that appear in
    ``stopwords.words("english")`` are dropped and the remaining tokens are
    re-joined with single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = []
    for token in text.split():
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [4]:
def apply_stemming(text):
    """Stem every whitespace-separated token of `text` with the English Snowball stemmer.

    Returns the stemmed tokens joined by single spaces.
    """
    stem = SnowballStemmer("english").stem
    return ' '.join(map(stem, text.split()))

In [5]:
def remove_by_filters(text, filters=None):
    """Drop every whitespace-separated token of `text` that appears in `filters`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.
    filters : container of str, optional
        Tokens to remove. ``None`` (the default) removes nothing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # A None sentinel replaces the former mutable default argument (`[]`).
    if filters is None:
        filters = ()
    return ' '.join([token for token in text.split() if token not in filters])

In [6]:
def preprocess_text(text):
    """Normalise a raw review into a cleaned, stemmed token string.

    Steps, in order: lower-case, strip digit runs, strip every character in
    ``string.punctuation``, collapse whitespace, remove stop words, stem, and
    finally drop the tokens 'room', 'hotel' and 'stay'.
    """
    lowered = text.lower()

    # Digits first, then punctuation, both simply deleted (not replaced by a space).
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)

    # Collapse any run of whitespace into a single space.
    cleaned = " ".join(without_punctuation.split())

    # Stop-word removal runs before stemming.
    for step in (remove_stopwords, apply_stemming):
        cleaned = step(cleaned)

    # The filter is applied to the stemmed tokens.
    return remove_by_filters(cleaned,['room','hotel','stay'])

In [7]:
# Store the preprocessed version of every review next to the raw text.
docs['cleaned'] = docs['text'].apply(preprocess_text)

In [8]:
# Display the cleaned reviews (pandas truncates the middle rows).
docs['cleaned']

0      complet blown away magnific got great deal hap...
1      weve return two night affinia chicago visit de...
2      excel locat feel like boutiqu right next neima...
3      travel lot busi quit frank expect start reach ...
4      visit th birthday never chicago sure found dea...
                             ...                        
395    rate talbott cheaper expect reason book prepar...
396    enjoy talbott expens side worth got talbott tr...
397    pleasant staff housekeep averag breakfast clea...
398    one best ever locat servic accommod outstand l...
399    excel staff custom servic clean spotless eleg ...
Name: cleaned, Length: 800, dtype: object

## Classification for All CLASSES
### Naive Bayes | Unigram

In [9]:
# Fit TF-IDF on the cleaned corpus and inspect the weights of one sample document.
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
doc_vectors=tfidf_vectorizer.fit_transform(docs.cleaned)
sample_tfidf_vector = doc_vectors[40]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# .toarray() yields a plain ndarray column instead of the legacy np.matrix from .todense().
df = pd.DataFrame(sample_tfidf_vector.T.toarray(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
street,0.275676
state,0.235063
corner,0.210892
around,0.172386
revolv,0.169364
...,...
fire,0.000000
fireplac,0.000000
firework,0.000000
firm,0.000000


In [10]:
# Train a multinomial Naive Bayes model on the TF-IDF matrix of the whole corpus.
text_classifier = MultinomialNB()
text_classifier.fit(doc_vectors, docs['type'])

### The same steps can be expressed as a scikit-learn Pipeline

In [11]:
def build_model(pipeline,k):
    """Evaluate `pipeline` with shuffled k-fold cross validation and print the results.

    Reads the module-level ``docs`` DataFrame: ``docs['cleaned']`` supplies the
    inputs and ``docs['type']`` the labels. For every fold the pipeline is
    re-fitted on the training split, and the accuracy, classification report
    and confusion matrix of the held-out split are printed. The mean accuracy
    over all folds is printed at the end.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    k : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    None
    """
    # The arrays do not change between folds, so build them once up front.
    X = np.array(docs['cleaned'])
    y = np.array(docs['type'])

    # K-fold cross validation is used instead of a single 80/20 hold-out split.
    kfold = KFold(n_splits=k,shuffle=True,random_state=0)
    accuracies = []

    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train,y_train)
        predicted = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test,predicted)
        accuracies.append(accuracy)

        print("Accuracy Score:",accuracy)
        print("Classification Report\n",classification_report(y_test,predicted))
        print("Confusion Matrix\n",confusion_matrix(y_test,predicted))
        print("---------------------------------------------------------------------")

    # Report the real fold count; the message used to hard-code "5".
    print("Average accuracy of {}-cross validation:".format(k),np.mean(accuracies))

#### Model with Naive Bayes | Unigram

In [12]:
# TF-IDF features (vectorizer defaults) feeding a multinomial Naive Bayes classifier.
text_classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.88125
Classification Report
                precision    recall  f1-score   support

fake_positive       0.91      0.88      0.89        88
real_positive       0.85      0.89      0.87        72

     accuracy                           0.88       160
    macro avg       0.88      0.88      0.88       160
 weighted avg       0.88      0.88      0.88       160

Confusion Matrix
 [[77 11]
 [ 8 64]]
---------------------------------------------------------------------
Accuracy Score: 0.85
Classification Report
                precision    recall  f1-score   support

fake_positive       0.77      0.96      0.85        72
real_positive       0.96      0.76      0.85        88

     accuracy                           0.85       160
    macro avg       0.86      0.86      0.85       160
 weighted avg       0.87      0.85      0.85       160

Confusion Matrix
 [[69  3]
 [21 67]]
---------------------------------------------------------------------
Accuracy Score: 0.89375
Class

#### Model with Naive Bayes | Unigram-Bigram

In [13]:
# TF-IDF over unigrams and bigrams, classified with multinomial Naive Bayes.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('clf', MultinomialNB()),
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.85625
Classification Report
                precision    recall  f1-score   support

fake_positive       0.89      0.84      0.87        88
real_positive       0.82      0.88      0.85        72

     accuracy                           0.86       160
    macro avg       0.85      0.86      0.86       160
 weighted avg       0.86      0.86      0.86       160

Confusion Matrix
 [[74 14]
 [ 9 63]]
---------------------------------------------------------------------
Accuracy Score: 0.85
Classification Report
                precision    recall  f1-score   support

fake_positive       0.76      0.99      0.86        72
real_positive       0.98      0.74      0.84        88

     accuracy                           0.85       160
    macro avg       0.87      0.86      0.85       160
 weighted avg       0.88      0.85      0.85       160

Confusion Matrix
 [[71  1]
 [23 65]]
---------------------------------------------------------------------
Accuracy Score: 0.8875
Classi

#### Model with Naive Bayes | Unigram-Bigram-Trigram

In [14]:
# TF-IDF over unigrams, bigrams and trigrams, classified with multinomial Naive Bayes.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('clf', MultinomialNB()),
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.85625
Classification Report
                precision    recall  f1-score   support

fake_positive       0.89      0.84      0.87        88
real_positive       0.82      0.88      0.85        72

     accuracy                           0.86       160
    macro avg       0.85      0.86      0.86       160
 weighted avg       0.86      0.86      0.86       160

Confusion Matrix
 [[74 14]
 [ 9 63]]
---------------------------------------------------------------------
Accuracy Score: 0.85
Classification Report
                precision    recall  f1-score   support

fake_positive       0.76      0.99      0.86        72
real_positive       0.98      0.74      0.84        88

     accuracy                           0.85       160
    macro avg       0.87      0.86      0.85       160
 weighted avg       0.88      0.85      0.85       160

Confusion Matrix
 [[71  1]
 [23 65]]
---------------------------------------------------------------------
Accuracy Score: 0.88125
Class

#### Model with Decision Tree (entropy criterion) | Unigram

In [15]:
# random_state is pinned so repeated runs give the same tree; without it the
# reported accuracies can change from run to run even though the KFold split is seeded.
decision_tree_classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
text_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()), # Tokenize and transform to a TF-IDF representation
    ('clf', decision_tree_classifier),  # Feed the TF-IDF representation to the decision tree
])

build_model(text_classifier,5)

Accuracy Score: 0.675
Classification Report
                precision    recall  f1-score   support

fake_positive       0.73      0.65      0.69        88
real_positive       0.62      0.71      0.66        72

     accuracy                           0.68       160
    macro avg       0.68      0.68      0.67       160
 weighted avg       0.68      0.68      0.68       160

Confusion Matrix
 [[57 31]
 [21 51]]
---------------------------------------------------------------------
Accuracy Score: 0.68125
Classification Report
                precision    recall  f1-score   support

fake_positive       0.63      0.69      0.66        72
real_positive       0.73      0.67      0.70        88

     accuracy                           0.68       160
    macro avg       0.68      0.68      0.68       160
 weighted avg       0.69      0.68      0.68       160

Confusion Matrix
 [[50 22]
 [29 59]]
---------------------------------------------------------------------
Accuracy Score: 0.70625
Clas

#### Model with Decision Tree (Gini criterion) | Unigram

In [16]:
tfidf_vectorizer = TfidfVectorizer()
# random_state is pinned so repeated runs give the same tree; without it the
# reported accuracies can change from run to run even though the KFold split is seeded.
decision_tree_classifier = DecisionTreeClassifier(criterion="gini", random_state=0)
text_classifier = Pipeline([
    # Reuse the vectorizer created above; it used to be assigned and then
    # ignored in favour of a second, identical TfidfVectorizer().
    ('tfidf', tfidf_vectorizer), # Tokenize and transform to a TF-IDF representation
    ('clf', decision_tree_classifier),  # Feed the TF-IDF representation to the decision tree
])

build_model(text_classifier,5)

Accuracy Score: 0.6875
Classification Report
                precision    recall  f1-score   support

fake_positive       0.74      0.66      0.70        88
real_positive       0.63      0.72      0.68        72

     accuracy                           0.69       160
    macro avg       0.69      0.69      0.69       160
 weighted avg       0.69      0.69      0.69       160

Confusion Matrix
 [[58 30]
 [20 52]]
---------------------------------------------------------------------
Accuracy Score: 0.6875
Classification Report
                precision    recall  f1-score   support

fake_positive       0.62      0.78      0.69        72
real_positive       0.77      0.61      0.68        88

     accuracy                           0.69       160
    macro avg       0.70      0.70      0.69       160
 weighted avg       0.70      0.69      0.69       160

Confusion Matrix
 [[56 16]
 [34 54]]
---------------------------------------------------------------------
Accuracy Score: 0.7375
Class

#### Model with K-NN | Unigram

In [17]:
# TF-IDF features (vectorizer defaults) classified by a 3-nearest-neighbours model.
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
tfidf_vectorizer = TfidfVectorizer()
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # tokenize and build the TF-IDF representation
    ('clf', neigh_classifier),    # feed the TF-IDF representation to K-NN
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.7625
Classification Report
                precision    recall  f1-score   support

fake_positive       0.77      0.81      0.79        88
real_positive       0.75      0.71      0.73        72

     accuracy                           0.76       160
    macro avg       0.76      0.76      0.76       160
 weighted avg       0.76      0.76      0.76       160

Confusion Matrix
 [[71 17]
 [21 51]]
---------------------------------------------------------------------
Accuracy Score: 0.7
Classification Report
                precision    recall  f1-score   support

fake_positive       0.62      0.86      0.72        72
real_positive       0.83      0.57      0.68        88

     accuracy                           0.70       160
    macro avg       0.73      0.71      0.70       160
 weighted avg       0.74      0.70      0.70       160

Confusion Matrix
 [[62 10]
 [38 50]]
---------------------------------------------------------------------
Accuracy Score: 0.7375
Classifi

#### Model with K-NN | Unigram-Bigram

In [18]:
# TF-IDF over unigrams and bigrams, classified by a 3-nearest-neighbours model.
neigh_classifier = KNeighborsClassifier(n_neighbors=3)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # tokenize and build the TF-IDF representation
    ('clf', neigh_classifier),    # feed the TF-IDF representation to K-NN
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.7375
Classification Report
                precision    recall  f1-score   support

fake_positive       0.74      0.81      0.77        88
real_positive       0.73      0.65      0.69        72

     accuracy                           0.74       160
    macro avg       0.74      0.73      0.73       160
 weighted avg       0.74      0.74      0.74       160

Confusion Matrix
 [[71 17]
 [25 47]]
---------------------------------------------------------------------
Accuracy Score: 0.7125
Classification Report
                precision    recall  f1-score   support

fake_positive       0.63      0.86      0.73        72
real_positive       0.84      0.59      0.69        88

     accuracy                           0.71       160
    macro avg       0.74      0.73      0.71       160
 weighted avg       0.75      0.71      0.71       160

Confusion Matrix
 [[62 10]
 [36 52]]
---------------------------------------------------------------------
Accuracy Score: 0.71875
Clas

#### Model with SVM | Unigram

In [19]:
# TF-IDF features (vectorizer defaults) classified by an SVC with default settings.
svm_classifier = svm.SVC()
tfidf_vectorizer = TfidfVectorizer()
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # tokenize and build the TF-IDF representation
    ('clf', svm_classifier),      # feed the TF-IDF representation to the SVM
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.825
Classification Report
                precision    recall  f1-score   support

fake_positive       0.87      0.81      0.84        88
real_positive       0.78      0.85      0.81        72

     accuracy                           0.82       160
    macro avg       0.82      0.83      0.82       160
 weighted avg       0.83      0.82      0.83       160

Confusion Matrix
 [[71 17]
 [11 61]]
---------------------------------------------------------------------
Accuracy Score: 0.8625
Classification Report
                precision    recall  f1-score   support

fake_positive       0.83      0.88      0.85        72
real_positive       0.89      0.85      0.87        88

     accuracy                           0.86       160
    macro avg       0.86      0.86      0.86       160
 weighted avg       0.86      0.86      0.86       160

Confusion Matrix
 [[63  9]
 [13 75]]
---------------------------------------------------------------------
Accuracy Score: 0.8875
Classi

#### Model with SVM | Unigram-Bigram

In [20]:
# TF-IDF over unigrams and bigrams, classified by an SVC with default settings.
svm_classifier = svm.SVC()
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
text_classifier = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),  # tokenize and build the TF-IDF representation
    ('clf', svm_classifier),      # feed the TF-IDF representation to the SVM
])

# Evaluate with 5-fold cross validation.
build_model(text_classifier, k=5)

Accuracy Score: 0.8375
Classification Report
                precision    recall  f1-score   support

fake_positive       0.86      0.84      0.85        88
real_positive       0.81      0.83      0.82        72

     accuracy                           0.84       160
    macro avg       0.84      0.84      0.84       160
 weighted avg       0.84      0.84      0.84       160

Confusion Matrix
 [[74 14]
 [12 60]]
---------------------------------------------------------------------
Accuracy Score: 0.86875
Classification Report
                precision    recall  f1-score   support

fake_positive       0.82      0.90      0.86        72
real_positive       0.91      0.84      0.88        88

     accuracy                           0.87       160
    macro avg       0.87      0.87      0.87       160
 weighted avg       0.87      0.87      0.87       160

Confusion Matrix
 [[65  7]
 [14 74]]
---------------------------------------------------------------------
Accuracy Score: 0.8875
Clas