In [9]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pyprind
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.grid_search import GridSearchCV

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cross_validation import train_test_split

from sklearn import svm

from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.neighbors import KNeighborsClassifier

In [10]:
# used when validating
train_pos=pd.read_csv("Y://Data//rev_Imdb//train_pos.csv",
                      encoding="cp1252")

train_neg=pd.read_csv("Y://Data//rev_Imdb//train_neg.csv",
                      encoding="cp1252")
train_data=pd.concat([train_pos,train_neg]).loc[:,['Review','Sentiment']]

In [3]:
#tokenizer
def tokenizer_stem_porter(text):
    porter=PorterStemmer()
    stop=stopwords.words('english')
    textlst=[porter.stem(word) for word in text.split() 
             if word not in stop]
    return ' '.join(textlst)


def tokenizer_stem_wordnet(text):
    #porter=PorterStemmer()
    lmtzr=WordNetLemmatizer()
    stop=stopwords.words('english')
    textlst=[lmtzr.lemmatize(word) for word in text.split() 
             if word not in stop]
    return ' '.join(textlst)

In [12]:
def wordtovec(Input_data):
    
    doc_lst=Input_data.Review.apply(tokenizer_stem_wordnet)
    print("#######################stem finished######################")
    X_data=pd.DataFrame(columns=['Review'])
    X_data['Review']=doc_lst
    Y_data=Input_data.Sentiment
    
    tfidf = TfidfVectorizer(min_df=1)
    X_data=tfidf.fit_transform(X_data.Review.tolist())
    print("###################### data ready #########################")
    
    return X_data,Y_data

In [5]:
def read_data():
    train_pos=pd.read_csv("Y://Data//rev_Imdb//train_pos.csv",
                          encoding="cp1252")
    train_neg=pd.read_csv("Y://Data//rev_Imdb//train_neg.csv",
                          encoding="cp1252")
    train_data=pd.concat([train_pos,train_neg]).loc[:,
                               ['Review','Sentiment']]
    test_pos=pd.read_csv("Y://Data//rev_Imdb//test_pos.csv",
                         encoding="cp1252")
    test_neg=pd.read_csv("Y://Data//rev_Imdb//test_neg.csv",
                         encoding="cp1252")
    test_data=pd.concat([test_pos,test_neg]).loc[:,['Review','Sentiment']]
    
    nrows=train_data.shape[0]
    
    all_data=pd.concat([train_data,test_data])
    X_vec, Y_vec=wordtovec(all_data)
    
    X_train=X_vec[:nrows,:]
    Y_train=Y_vec[:nrows]
    X_test =X_vec[nrows:,:]
    Y_test=Y_vec[nrows:]
    
    return X_train, Y_train, X_test, Y_test
    

In [10]:
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=1000.0, gamma=0.0001, kernel='rbf')
    svm.fit(X, y)
    
    return svm

In [11]:
def train_logistic(X, y):
    """
    Create and train the Logistic regression model.
    """
    logis = LogisticRegression(C=10.0, verbose=5,
                               n_jobs=-1, solver='sag')
    logis.fit(X, y)
    
    return logis

In [12]:
def train_rf(X,y):
    rfc= RandomForestClassifier(n_estimators=220, criterion='gini',
                                max_depth=19, n_jobs=-1,verbose=1)
    rfc.fit(X, y)
    
    return rfc

In [13]:
def train_gnb(X,y):
    gnb = GaussianNB()
    gnb.fit(X,y)
    
    return gnb

In [14]:
def train_bnb(X,y):
    bnb = BernoulliNB()
    bnb.fit(X,y)
    
    return bnb

In [4]:
def train_mnb(X,y):
    bnb = MultinomialNB()
    bnb.fit(X,y)
    
    return bnb

In [9]:
def train_knn(X,y):
    knn=KNeighborsClassifier(n_neighbors=5)
    knn.fit(X, y) 
    
    return knn
    

In [15]:
def grid_rf(X, y):
    
    clf = RandomForestClassifier(criterion='gini')
    tuned_parameters = [{'n_estimators': [210,220,240],
                     'max_depth': [17,18,19]}]
    search = GridSearchCV(clf, tuned_parameters, cv=5,scoring='accuracy',
                          verbose=100,n_jobs=1)
    search.fit(X, y)
    print("best param:",search.best_params_)
    

In [6]:
def grid_svm(X, y):
    
    clf = svm.SVC()
    tuned_parameters = [{'kernel': ['rbf','poly'], 'gamma': [0.0001],
                     'C': [1000,2000,500]}]
    search = GridSearchCV(clf, tuned_parameters, cv=5,scoring='accuracy',
                          verbose=10,n_jobs=-1)
    search.fit(X, y)
    print("best param:",search.best_params_)
    

In [6]:
def grid_knn(X,y):
    clf=KNeighborsClassifier()
    
    tuned_parameters = [{'n_neighbors': [3,5,7], 
                         'algorithm': ['ball_tree','auto'],
                         'p': [1,2]}]
    search = GridSearchCV(clf, tuned_parameters, cv=5, scoring='accuracy',
                          n_jobs=-1,verbose=3)
    search.fit(X, y) 
    
    print("best param:",search.best_params_)

In [13]:
def main_cv(train_data):

    X_train, Y_train=wordtovec(train_data)

    grid_svm(X_train, Y_train)


In [17]:
#Validating results

#---------------------- SVM Reg Train ------------------
#C=1000, gamma=0.0001
0.8854
[2144 , 232]
[ 341 ,2283]

#----------------------Logistic Regression--------------------
#'sag' C=10.0  convergence after 31 epochs took 1 seconds
0.8972
[2202 , 231]
[ 283, 2284]



[283, 2284]

In [14]:
main_cv(train_data)

#######################stem finished######################
###################### data ready #########################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 55.0min finished


best param: {'C': 1000, 'kernel': 'rbf', 'gamma': 0.0001}


In [5]:
def main_val():
    
#def main_svm()

    X_train, Y_train=wordtovec(train_data)

    X_tr, X_te, y_tr, y_te = train_test_split(X_train, Y_train,
                                test_size=0.2, random_state=42)

    #svm = train_svm(X_tr, y_tr)
    bnb= train_mnb(X_tr.toarray(),y_tr)

    # Make an array of predictions on the test set
    #pred = svm.predict(X_te)
    pred=bnb.predict(X_te.toarray())

    # Output the hit-rate and the confusion matrix for each model
    print(bnb.score(X_te.toarray(), y_te))
    print(confusion_matrix(pred, y_te))

In [15]:
def main_val():

    X_train, Y_train=wordtovec(train_data)

    X_tr, X_te, y_tr, y_te = train_test_split(X_train, Y_train, 
                              test_size=0.2, random_state=1024)

    logis = train_logistic(X_tr, y_tr)

    # Make an array of predictions on the test set
    pred = logis.predict(X_te)

    # Output the hit-rate and the confusion matrix for each model
    print(logis.score(X_te, y_te))
    print(confusion_matrix(pred, y_te))

In [7]:
def main_test():

    X_train, Y_train,X_test, Y_test=read_data()
   
    model=train_knn(X_train, Y_train)

    pred = model.predict(X_test)

    # Output the hit-rate and the confusion matrix for each model
    print(model.score(X_test, Y_test))
    print(confusion_matrix(pred, Y_test))

In [18]:
#logistic
main_test()

##########################stem finished########################
######################### data ready ############################
convergence after 33 epochs took 1 seconds
0.87248
[[11065  1753]
 [ 1435 10747]]


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [22]:
#bernoulli naive basyes
main_test()

##########################stem finished########################
######################### data ready ############################
0.81696
[[11102  3178]
 [ 1398  9322]]


In [11]:
#multi NB
main_test()

##########################stem finished########################
######################### data ready ############################
0.831
[[10988  2713]
 [ 1512  9787]]


In [10]:
#knn
main_test()

##########################stem finished########################
######################### data ready ############################
0.65128
[[8299 4517]
 [4201 7983]]


In [24]:
#svm
main_test()

##########################stem finished########################
######################### data ready ############################
0.87904
[[10895  1419]
 [ 1605 11081]]


In [26]:
#rf
main_test()

##########################stem finished########################
######################### data ready ############################


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 220 out of 220 | elapsed:   10.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 220 out of 220 | elapsed:    0.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


0.8474
[[10344  1659]
 [ 2156 10841]]


[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 220 out of 220 | elapsed:    0.8s finished
