In [1]:
import pandas as pd
import numpy as np

# Read in observations

In [2]:
import os

pos_path = 'review_polarity/txt_sentoken/pos/'
neg_path = 'review_polarity/txt_sentoken/neg/'
pos_rev = []
neg_rev = []

# assign positive reviews to list
for filename in os.listdir(pos_path):
    with open(pos_path + filename, 'r') as f:
        pos_rev.append(f.read())

# assign negative reviews to list
for filename in os.listdir(neg_path):
    with open(neg_path + filename, 'r') as f:
        neg_rev.append(f.read())

In [3]:
# create dictionary with pos/neg labels
data_pos = {'review': pos_rev, 'rating': 'positive'}
data_neg = {'review': neg_rev, 'rating': 'negative'}

In [4]:
df = pd.DataFrame(data_pos)
df_rev = pd.concat([df, pd.DataFrame(data_neg)])

In [5]:
df_rev.head()

Unnamed: 0,review,rating
0,""" love is the devil "" is a challenging film ,...",positive
1,"in some respects , rush hour is the ultimate e...",positive
2,"martin scorsese's kundun , which chronicles ro...",positive
3,expectation rating : a bit worse than expected...,positive
4,note : some may consider portions of the follo...,positive


In [6]:
X = df_rev.iloc[:, 0]

In [7]:
X.head()

0     " love is the devil " is a challenging film ,...
1    in some respects , rush hour is the ultimate e...
2    martin scorsese's kundun , which chronicles ro...
3    expectation rating : a bit worse than expected...
4    note : some may consider portions of the follo...
Name: review, dtype: object

In [8]:
y = df_rev.iloc[:, 1]

## Test/train split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sklearn Solution

## Create Sparse Matrix

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
vectorizer = CountVectorizer()

In [13]:
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
feature_names = vectorizer.get_feature_names()

In [15]:
len(feature_names)

36291

In [16]:
X_train_vec = vectorizer.transform(X_train)

In [17]:
vect_view=pd.DataFrame(X_train_vec.toarray(),columns=feature_names)
vect_view.head()

Unnamed: 0,00,000,0009f,007,03,04,05,05425,10,100,...,zuehlke,zuko,zukovsky,zundel,zurg,zus,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
vect_view.shape

(1600, 36291)

## Modeling

In [19]:
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, classification_report

In [20]:
X_test_vec = vectorizer.transform(X_test)

In [21]:
model = naive_bayes.MultinomialNB()
model.fit(X_train_vec, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
print ("Accuracy: %.3f"% accuracy_score(y_test, model.predict(X_test_vec)))
print (classification_report(y_test, model.predict(X_test_vec)))

Accuracy: 0.840
             precision    recall  f1-score   support

   negative       0.83      0.85      0.84       201
   positive       0.85      0.83      0.84       199

avg / total       0.84      0.84      0.84       400



# Manual Solution

In [23]:
import nltk

# Functions

In [24]:
# create sparse matrix of words
# input is dataframe of text

def create_word_mat(df_docs_text, df_docs_labels):
    # initialize empty list to store counts for each observation
    word_dicts = []
    
    for doc in df_docs_text:
        word_counts = {}
        # split sentences into individual words
        words = nltk.word_tokenize(doc)
        # calc word counts for each word in observation
        for word in words:
            if word.isalnum():
                if word in word_counts.values():
                    word_counts[word] += 1
                else:
                    word_counts[word] = 1
            else:
                pass
        # add word counts for observation to list
        word_dicts.append(word_counts)
    
    # create dataframe of all observation word counts (0's fill empty word count cells)
    df_words = pd.DataFrame.from_records(word_dicts).fillna(0)
    # add class labels to dataframe
    df_words['class_labels'] = df_docs_labels
    return(df_words)

In [25]:
# calculate probabilities of words for each class
# input is word sparse matrix including last column of class labels

def calc_probas(df_words):
    # word counts by class
    word_freq = df_words.groupby(df_words.columns[-1]).sum()
    # total number of words in sample (to add for smoothing of zero values)
    word_num = len(word_freq.columns)
    # calc word totals per class
    word_freq['class_total'] = word_freq.sum(axis=1) + word_num
    # calc probabilities of each word appearing in each class
    word_probas = (word_freq.iloc[:, 0:-1] + 1).div(word_freq['class_total'], axis=0)
    return(word_probas)

In [26]:
# predict class for given statement
# input is statement to predict and word probability dataframe

def pred_class(statement, df_probas):
    # break statement into individual words
    words = nltk.word_tokenize(statement)
    # calculate probabilities of statement per class
    probas = df_probas[words].prod(axis=1)
    # return predicted class name
    return(max(probas.index))

In [27]:
def metrics(prediction, actual):
    
    classes = prediction.append(actual).unique()
    
    true_pos = sum((y_predict == classes[0]) & (y_predict == y_test))
    true_neg = sum((y_predict == classes[1]) & (y_predict == y_test))
    false_neg = sum((y_predict == classes[0]) & (y_predict != y_test))
    false_pos = sum((y_predict == classes[1]) & (y_predict != y_test))
    
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = 2 * (recall * precision) / (recall + precision)
    
    print("Scores")
    print("-"*15)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)

## Count word frequency

In [34]:
df_words = create_word_mat(X_train.reset_index(drop=True), y_train.reset_index(drop=True))
df_words.head()

Unnamed: 0,0,00,000,0009f,007,03,04,05,05425,1,...,zucker,zuehlke,zuko,zukovsky,zundel,zurg,zwick,zwigoff,zycie,class_labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,positive


In [35]:
df_probas = calc_probas(df_words)
df_probas

Unnamed: 0_level_0,0,00,000,0009f,007,03,04,05,05425,1,...,zsigmond,zucker,zuehlke,zuko,zukovsky,zundel,zurg,zwick,zwigoff,zycie
class_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
negative,5.6e-05,1.1e-05,0.00012,4e-06,1.4e-05,7e-06,7e-06,1.1e-05,7e-06,0.000261,...,7e-06,1.8e-05,4e-06,4e-06,4e-06,4e-06,4e-06,7e-06,7e-06,4e-06
positive,3.9e-05,1e-05,0.000132,7e-06,2e-05,3e-06,3e-06,7e-06,3e-06,0.000211,...,3e-06,1e-05,7e-06,7e-06,7e-06,7e-06,7e-06,2e-05,3e-06,7e-06


In [38]:
X_test.head()

860    i can't recall a previous film experience wher...
353    matthew broderick and high school comedy . \nt...
333    hollywood never fails to astound me . \nevery ...
905    ok , i admit i had a bad attitude about this f...
289    i have never seen a man so in love with himsel...
Name: review, dtype: object

In [37]:
prediction = X_test.reset_index(drop=True).apply(lambda x: pred_class(x, df_probas))

KeyError: '["n\'t" \'.\' \',\' \'over-wrought\' \',\' \'.\' \',\' \',\' \'clubbed\' \';\' \',\' \'.\' \',\'\n \'writer-director\' \'baigelman\' \'.\' \'(\' \')\' \'(\' \')\' \'.\' "\'s" \'bookkeeper\'\n \',\' \'(\' "d\'onofrio" \')\' \',\' "\'s" \'.\' \'jjaks\' \'(\' \')\' \',\' "\'s" \',\' "\'s"\n \'.\' \',\' \'jjaks\' \',\' "n\'t" \'.\' \',\' \',\' "n\'t" "\'s" \',\' \'.\' \',\' \'baigelman\'\n \'jjaks\' "\'s" \',\' \'(\' \')\' \'jjaks\' \'.\' \'jjaks\' "\'" \',\' \';\' "\'" \'jjaks\' \'.\'\n \',\' \'.\' \',\' \'baigelman\' \'jjaks\' \',\' "\'" \'``\' \'``\' \'.\' \'--\' \'--\' \'.\' \',\'\n \'.\' \'valentines\' \',\' "n\'t" \'.\' \',\' \'.\' \',\' \',\' \'.\' \',\' \'.\' "d\'onofrio"\n \',\' \',\' \'competitiveness\' \'jjaks\' \',\' \',\' \'.\' \'(\' "\'s" "\'s" \')\' \',\' "n\'t"\n \'.\' "n\'t" \'baigelman\' "\'s" \'--\' \',\' \'well-crafted\' \'--\' \'light-hearted\'\n \'.\' \'_is_\' \'violates\' \',\' "n\'t" \'.\' \',\' "\'s" \'.\'] not in index'

# Example

## Input

In [24]:
word_dict = {'text': 
             ["a great game", "the election was over", "very clean match", 
              "a clean but forgettable game", "it was a close election"],
            'tag': ["sports", "not_sports", "sports", "sports", "not_sports"]
            }

In [27]:
word_df = pd.DataFrame(word_dict)
word_df

Unnamed: 0,text,tag
0,a great game,sports
1,the election was over,not_sports
2,very clean match,sports
3,a clean but forgettable game,sports
4,it was a close election,not_sports


In [96]:
X = word_df.iloc[:, 0]
X

0                    a great game
1           the election was over
2                very clean match
3    a clean but forgettable game
4         it was a close election
Name: text, dtype: object

In [97]:
y = word_df.iloc[:, 1]
y

0        sports
1    not_sports
2        sports
3        sports
4    not_sports
Name: tag, dtype: object

In [98]:
X_test = pd.Series(['a very close game', 'great clean election game', 'game over match election'])

In [99]:
y_test = pd.Series(['sports', 'not_sports', 'not_sports'])

## Word Dataframe
Create sparse matrix of words in training dataset

In [100]:
df_words = create_word_mat(X, y)
df_words

Unnamed: 0,a,but,clean,close,election,forgettable,game,great,it,match,over,the,very,was,class_labels
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,sports
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,not_sports
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,sports
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sports
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,not_sports


## Word Probabilities
Calculate probabilities of each word per class label

In [101]:
df_probas = calc_probas(df_words)
df_probas

Unnamed: 0_level_0,a,but,clean,close,election,forgettable,game,great,it,match,over,the,very,was
class_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
not_sports,0.086957,0.043478,0.043478,0.086957,0.130435,0.043478,0.043478,0.043478,0.086957,0.043478,0.086957,0.086957,0.043478,0.130435
sports,0.12,0.08,0.12,0.04,0.04,0.08,0.12,0.08,0.04,0.08,0.04,0.04,0.08,0.04


## Predictions
Predict class labels for test dataset

In [105]:
y_predict = X_test.apply(lambda x: pred_class(x, df_probas))
y_predict

0    sports
1    sports
2    sports
dtype: object

## Performance Metrics
Evaluate model performance based on:
- Accuracy
- Precision
- Recall

In [187]:
metrics(y_predict, y_test)

Scores
---------------
Accuracy: 0.3333333333333333
Precision: 1.0
Recall: 0.3333333333333333
F1: 0.5
