In [2]:
# import pandas requirements
import pandas as pd
from pandas import DataFrame
from sklearn.naive_bayes import MultinomialNB  # use multinomial naive bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
import matplotlib.pyplot as plt
import re
from sklearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced


class Preprocess:
  # def __int__(self):
    # pass

    def remove_Tags(self, text):
        """
        Remove@.
        """
        clean_text = re.sub('(@[A-Za-z0-9_]+)', '', text)

        return clean_text

    def remove_HashTags(self, text):
        """
        Remove#.
        """
        clean_text = re.sub('(#[A-Za-z0-9_]+)', '', text)

        return clean_text

    def remove_HtmlTags(self, text):
        """
        take string input and clean string without tags.
        use regex to remove the html tags.
        """
        html_clean = re.compile('<.*?>')
        clean_text = re.sub(html_clean, '', text)

        return clean_text

    def remove_links(self, text):
        """
       Remove links.
       """
        clean_text = re.sub(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|''(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        return clean_text

    def preprocess(self, text):

        word_list = []
        for cleaned_text in text:

            cleaned_text = self.remove_Tags(cleaned_text)
            cleaned_text = self.remove_HashTags(cleaned_text)
            cleaned_text = self.remove_HtmlTags(cleaned_text)
            cleaned_text = self.remove_links(cleaned_text)
            cleaned_text = cleaned_text.strip()
            cleaned_text = cleaned_text.encode('ascii', 'ignore').decode()
            word_list.append(cleaned_text)
        return word_list
# return word_list




In [3]:
# test data
test_df = pd.read_csv("test.csv", header=None, names=["target", "data"])
test_df = test_df[1:]
pr = Preprocess()
test_df.data = pr.preprocess(test_df.data)
print(test_df.head())

train_df = pd.read_csv("train.csv", header=None, names=["target", "data"])##
train_df = train_df[1:]
pr = Preprocess()
print(train_df.head())

eva_df = pd.read_csv("evaluation.csv", header=None, names=["target", "data"])
eva_df = eva_df[1:]
pr = Preprocess()
print(eva_df.head())

classifier = MultinomialNB()

model = make_pipeline(TfidfVectorizer(), MultinomialNB())

model.fit(train_df.data, train_df.target)

y_pred = model.predict(test_df.data)
y2_pred = model.predict(eva_df.data)
y3_pred = model.predict(train_df.data)


  target                                               data
1      1  The vigilante has long held a fascination for ...
2      0  This is a VERY average phone with bad battery ...
3      0  THIS CONTAINS SPOILERS.I have rarely seen a fi...
4      1  knows whats up! That  commercial. Just another...
5      0  Sure did go south after breakfast though! Dela...
  target                                               data
1      0  overgeneralized, not helpful to anyone serious...
2      1                           Great sound and service.
3      1  love this book!!!: this book is a fast read ab...
4      1  A hugely enjoyable screen version of Rona Jaff...
5      0  What an uninteresting hodge-podge. It could ha...
  target                                               data
1      0  Don't eat or drink here. Rooms okay but noisy....
2      1  ***********If you don't appreciate......\"The ...
3      1  So so happy that I have it a try. Being dairy ...
4      1  I just moved to the neighborho

In [4]:
print("Test")
print("--------------------------------------------------------------------------------------")
print(classification_report_imbalanced(test_df.target, y_pred))
print("Evaluation")
print("--------------------------------------------------------------------------------------")

print(classification_report_imbalanced(eva_df.target, y2_pred))
'''
we are using the
# :class:`~imblearn.pipeline.make_pipeline` function implemented in
# imbalanced-learn to properly handle the samplers.

'''
print("Train")
print("--------------------------------------------------------------------------------------")
print(classification_report_imbalanced(train_df.target, y3_pred))


Test
--------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.93      0.74      0.85      0.83      0.70      1252
          1       0.91      0.74      0.93      0.82      0.83      0.67      1248

avg / total       0.85      0.83      0.83      0.83      0.83      0.69      2500

Evaluation
--------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.74      0.95      0.67      0.83      0.79      0.65      2482
          1       0.93      0.67      0.95      0.77      0.79      0.61      2518

avg / total       0.83      0.80      0.81      0.80      0.79      0.63      5000

Train
--------------------------------------------------------------------------------------
                   pre       rec       spe     