In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import textblob

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re
import pickle
from scipy.sparse import hstack

from sklearn import preprocessing, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer,roc_curve, roc_auc_score

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", sep='\t')
test = pd.read_csv("data/testData.tsv", sep='\t')

In [3]:
sample = pd.read_csv("data/sampleSubmission.csv", sep='\t')

In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [6]:
sample.head()

Unnamed: 0,"id,""sentiment"""
0,"12311_10,0"
1,"8348_2,0"
2,"5828_4,0"
3,"7186_2,0"
4,"12128_7,0"


In [10]:
stops = set(stopwords.words("english"))

In [9]:
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [7]:
y = train["sentiment"]

In [8]:
y.head()

0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

In [11]:
ctv_word = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df = 200, max_features=5000,
            ngram_range=(1,2), stop_words = 'english')

In [12]:
# Fitting CountVectorizer to both training and test sets
ctv_word.fit(list(train['review']) + list(test['review']))
train_ctv_word =  ctv_word.transform(train['review']) 
test_ctv_word = ctv_word.transform(test['review'])

In [13]:
# Bag of words (charater based)
ctv_char = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode',analyzer='char',
    stop_words='english', ngram_range=(2, 6), max_features=10000)


In [14]:
# Fitting CountVectorizer to both training and test sets
ctv_char.fit(list(train['review']) + list(test['review']))
train_ctv_char =  ctv_char.transform(train['review']) 
test_ctv_char = ctv_char.transform(test['review'])

In [19]:
tfv_word = TfidfVectorizer(min_df=150,  max_features= 5000, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1,2),
            stop_words = 'english')

In [20]:
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv_word.fit(list(train['review']) + list(test['review']))
train_tfv_word =  tfv_word.transform(train['review'])
test_tfv_word = tfv_word.transform(test['review'])

In [21]:
tfv_char = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='char',
    stop_words='english',ngram_range=(2, 6),max_features=10000)
tfv_char.fit(list(train['review']) + list(test['review']))
train_tfv_char = tfv_char.transform(train['review'])
test_tfv_char = tfv_char.transform(test['review'])

In [22]:
# bag of words for training set (words + char)
train_bow = hstack([train_ctv_word, train_ctv_char])
test_bow = hstack([test_ctv_word, test_ctv_char])

In [23]:
# TF-IDF for test set (words + char)
train_tfidf = hstack([train_tfv_word, train_tfv_char])
test_tfidf = hstack([test_tfv_word, test_tfv_char])

In [25]:
cross_val_score(LogisticRegression(), train_tfidf, y, cv=5, scoring=make_scorer(f1_score))



array([0.89493545, 0.8990717 , 0.8935831 , 0.89946247, 0.8913304 ])

In [26]:
score = np.array([0.89493545, 0.8990717 , 0.8935831 , 0.89946247, 0.8913304 ])

In [27]:
score.mean()

0.895676624

In [29]:
cross_val_score(LogisticRegression(), train_bow, y, cv=5, scoring=make_scorer(f1_score)).mean()



0.8660974232180519