## Bag of Words Meets Bags of Popcorn

In [1]:
#import things: pandas and numpy
import pandas as pd
import numpy as np
import re

In [2]:
#read labeledtraindataset
sentiment_train = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
sentiment_train.head()
#read testData
test = pd.read_csv('testData.tsv', delimiter='\t')
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [3]:
#download stopword
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HieuTrungLe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#import and use stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [5]:
#Create needed function
def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

# Create some random texts for testing the function preprocessor()
print(preprocessor('friends.....'))

friends  


In [6]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    words = text.split()
    return [porter.stem(word) for word in words]
print(tokenizer_porter('Hi there, I am loving this, like with a lot of love'))

['Hi', 'there,', 'I', 'am', 'love', 'this,', 'like', 'with', 'a', 'lot', 'of', 'love']


In [7]:
#train data
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=stop, tokenizer=tokenizer_porter, preprocessor=preprocessor)
clf=Pipeline([('vect', tfidf), ('clf', LogisticRegression())])
clf.fit(sentiment_train['review'], sentiment_train['sentiment'])

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [8]:
#check the score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#Evaluate our model
X_test = test['review']
y_predict = clf.predict(X_test)

In [9]:
#output to file
output = pd.DataFrame(data={"id":test['id'], "sentiment":y_predict})

In [10]:
#check output dataset
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [11]:
#save ouput to file
output.to_csv("output.csv", index=False)

## Try unsupervisor learning with k-means model

In [12]:
#try unsupervisor learning with K-mean for unlableTrainData
from sklearn import cluster

k_means = Pipeline([('vect', tfidf), ('k_means', cluster.KMeans(n_clusters=2))])

In [13]:
#read data from unlabelTrainData
utrain = pd.read_csv('unlabeledTrainData.tsv', delimiter='\t', error_bad_lines=False)
utrain.head()

b'Skipping line 43043: expected 2 fields, saw 3\n'


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [14]:
k_means.fit(utrain['review'])

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

In [15]:
u_predictions = k_means.predict(X_test)

In [18]:
#create output for unsupervisor file
u_output = pd.DataFrame(data={"id":test['id'], "sentiment":u_predictions})

In [20]:
u_output.to_csv('u_output.csv', index=False)

### Great job !