In [178]:
## importing all the libraries needed

from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import nltk
from nltk import word_tokenize
import numpy as np
import pandas as pd
import re
from io import StringIO

In [179]:

class classifier:
    
    def __init__(self, normalize=True, clf_type = "logistic", split_ratio=0.3):
        """
        Initializes the class with the right classifier attribute depending on the type of classifier
        """
        if clf_type == "logistic":
            self.clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial')
        elif clf_type == "naive":
            self.clf = naive_bayes.MultinomialNB()
        self.normalize = normalize
        if self.normalize:
            self.vec = TfidfVectorizer(use_idf=True)
        else:
            self.vec = TfidfVectorizer(use_idf=True, lowercase = True, strip_accents=ascii, stop_words = set(nltk.corpus.stopwords.words('english')))

        
    def _read(self, documents):
        """
        Reads and combines all the documents in one big pandas data frame
        """
        data = []
        X,Y = [], []
        for document in documents:
            d_ata = pd.read_csv(document, sep='\t', names=['review','label'])
            data.append(d_ata)
        data = pd.concat(data)
        self.data = data
        Y = data.label
        self.vec.fit(data.review)
        X = self.preprocess(data)
        
        return train_test_split(X,Y)
    
    def preprocess(self, data_f):
        """
        Preprocesses the text data by turning it into frequency tables
        Does a few normalization steps (lowercasing, removing stopwords ...) if self.normalize = true
        """
        
        return self.vec.transform(data_f.review)
    
    def train(self, documents):
        """
        Calls the train function
        Trains the classifier object
        """
        X_train, X_test, Y_train, Y_test =  self._read(documents)       
                
        self.clf.fit(X_train,Y_train)
        print (X_train.shape,Y_train.shape)
        
        acc = roc_auc_score(Y_test,self.clf.predict_proba(X_test)[:,1])
        
        print ("Accuracy: ",acc)
        
    def predict(self, sentence):
        """
        Predicts for a sentence
        """
        data = pd.read_csv(StringIO(sentence), names=['review'])
        X = self.preprocess(data)
        Y = self.clf.predict_proba(X)
        
        return np.argmax(Y)
    
    def test_file(self, file_name):
        """
        Tests with a file and outputs a file of labels
        """
        labels = []
        with open(file_name) as f:
            for line in f.readlines():
                print(line,self.predict(line))
                labels.append(self.predict(line))
        
        with open('test_results.txt', 'w') as f:
            for label in labels:
                f.write(str(label)+"\n")
                
        print ("Results from ",file_name," printed to: output.txt")
                

In [183]:
print ("Unnormalized data, Logistic regression")
my_clf_ul = classifier(normalize=False)
my_clf_ul.train(["../project1/sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "../project1/sentiment_labelled_sentences/imdb_labelled.txt",
                  "../project1/sentiment_labelled_sentences/yelp_labelled.txt"])
print()

print ("Normalized data, Logistic regression")
my_clf_nl = classifier(normalize=True)
my_clf_nl.train(["../project1/sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "../project1/sentiment_labelled_sentences/imdb_labelled.txt",
                  "../project1/sentiment_labelled_sentences/yelp_labelled.txt"])
print()

print ("Unnormalized data, Naive Bayes")
my_clf_un = classifier(normalize=False, clf_type='naive')
my_clf_un.train(["../project1/sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "../project1/sentiment_labelled_sentences/imdb_labelled.txt",
                  "../project1/sentiment_labelled_sentences/yelp_labelled.txt"])
print()

print ("Normalized data, Naive Bayes")
my_clf_nn = classifier(normalize=True, clf_type='naive')
my_clf_nn.train(["../project1/sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "../project1/sentiment_labelled_sentences/imdb_labelled.txt",
                  "../project1/sentiment_labelled_sentences/yelp_labelled.txt"])


Unnormalized data, Logistic regression
(2061, 5116) (2061,)
Accuracy:  0.896744853591

Normalized data, Logistic regression
(2061, 5155) (2061,)
Accuracy:  0.920064186378

Unnormalized data, Naive Bayes
(2061, 5116) (2061,)
Accuracy:  0.868579700272

Normalized data, Naive Bayes
(2061, 5155) (2061,)
Accuracy:  0.91101953602


In [166]:
my_clf_nl.predict("This product is really good as fuck")

1

In [177]:
my_clf_nl.test_file("../project1/test_sentences.txt")

This GPS tracker works like a charm.
 1
When I opened the box the product was not in the cutouts snd the protective cover was not on the unit
 0
Everyone should have one who owns a computer
 0
Buy something else
 0
Pure junk do not buy ever the greatest load of junk I have ever purchased ever
 0
The DataVac was used and full of dust and dirt
 1
Not so great...bought to clean the bobbin case area of my Brother and Baby Lock Quilting and Embroidery machines
 1
It is a great size, I keep it in my desk drawer at work and beause I teach wood shop it's going to get a lot of use
 0
I just bought this Vacuum. It's just good for nothing
 0
This is just perfect for vacuuming out the lint from my sewing machine
 1
I use it mostly to vacuum threads on the sewing machine. It is just the right size for this task.
 1
I have found this mini vac. to be everything it is said to be
 1
I ordered the Pork Prime Rib Chop it was beautiful, scrumptious and totally tender.
 1
A bastion of fine dining in The Ci