In [1]:
#package imports

import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
tf.get_logger().setLevel('INFO')

import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')

In [2]:
# Load data
training_data = pd.read_json("train.json")
test_data = pd.read_json("test.json")

In [3]:
# stop words
stop_words = set(stopwords.words('english'))
stop_words = list(stop_words)
for word in ['i\'ve', 'i\'m', 'i\'ll', "i'm", "i've", "i'll", 'im',
            'would']:
    stop_words.append(word)

In [4]:
# Look at fields used in the test data
test_data.columns

Index(['giver_username_if_known', 'request_id', 'request_text_edit_aware',
       'request_title', 'requester_account_age_in_days_at_request',
       'requester_days_since_first_post_on_raop_at_request',
       'requester_number_of_comments_at_request',
       'requester_number_of_comments_in_raop_at_request',
       'requester_number_of_posts_at_request',
       'requester_number_of_posts_on_raop_at_request',
       'requester_number_of_subreddits_at_request',
       'requester_subreddits_at_request',
       'requester_upvotes_minus_downvotes_at_request',
       'requester_upvotes_plus_downvotes_at_request', 'requester_username',
       'unix_timestamp_of_request', 'unix_timestamp_of_request_utc'],
      dtype='object')

In [5]:
# Split
pizza = training_data[training_data['requester_received_pizza'] == True]
no_pizza = training_data[training_data['requester_received_pizza'] == False]
no_pizza_short = no_pizza[:len(pizza)]

# Create new dataframe with equal outcomes
new_train = pd.concat([pizza, no_pizza_short], axis = 0)
new_train = new_train.sample(frac=1) # shuffle
new_train.reset_index()
new_train['requester_received_pizza'] = new_train['requester_received_pizza'].astype(int) # boolean to 0/1

train_labels = new_train['requester_received_pizza']

## Examine Request Text

In [6]:
# Pull the request text
train_requests = new_train['request_text_edit_aware'].str.lower().tolist()

# Put labels into a list
train_labels = list(train_labels)

In [7]:
# Use count vectorizer to pull the request vocabulary
req_vectorizer = CountVectorizer(stop_words = stop_words,
                            token_pattern = '\\b(\\w+[\\\']?\\w+)\\b')
vtrain = req_vectorizer.fit_transform(train_requests)
req_vocab = req_vectorizer.vocabulary_

In [8]:
# Pull the words only
word_list = list(req_vocab.keys())

# Strip the likely non-words (less than 3 characters)
for word in word_list:
    if len(word) < 3:
        word_list.remove(word)

In [9]:
def count_examples(token, text_series):
    '''Counts the number of positive and negative examples containing token'''
    pos_count = 0
    neg_count = 0
    
    # Loop through each request
    
    for i in range(len(text_series)):
        
        if token in text_series[i]:
            
            # Add to positive example count
            if train_labels[i] == True:
                pos_count += 1
                
            # Add to negative example count
            if train_labels[i] == False:
                neg_count += 1
    
    return (pos_count, neg_count)

In [10]:
# Calculate words by success/fail requests and create dataframe
pos_count = []
neg_count = []

for word in word_list:
    pos,neg = count_examples(word, train_requests)
    pos_count.append(pos)
    neg_count.append(neg)

d = {
    'word': word_list,
    'n_pos': pos_count,
    'n_neg': neg_count
}

words_df = pd.DataFrame(d)

# Add a ratio column
words_df['ratio'] = words_df.n_pos / words_df.n_neg

In [11]:
# Check dataframe
words_df

Unnamed: 0,word,n_pos,n_neg,ratio
0,got,190,146,1.301370
1,hit,53,42,1.261905
2,unexpected,15,7,2.142857
3,bills,76,53,1.433962
4,paycheck,69,48,1.437500
...,...,...,...,...
8577,excitement,1,0,inf
8578,dislike,0,1,0.000000
8579,optics,0,1,0.000000
8580,irrelevant,0,1,0.000000


In [12]:
words_df = words_df.sort_values('n_pos', ascending = False)

In [13]:
top_words = words_df[:2000]

In [14]:
# Look at the most common words (top 2000)
top_words[:50]

Unnamed: 0,word,n_pos,n_neg,ratio
3064,ing,850,791,1.074589
5640,en,821,765,1.073203
898,zza,654,606,1.079208
8,pizza,653,605,1.079339
22,one,616,537,1.147114
5153,ave,595,504,1.180556
68,day,528,398,1.326633
1745,ate,487,395,1.232911
7634,n't,468,401,1.167082
1579,ill,464,381,1.217848


In [15]:
# Check which words have high usage and high ratio
top_words[top_words['n_pos'] > 100].sort_values(
    'ratio', ascending=False)[:50]

Unnamed: 0,word,n_pos,n_neg,ratio
671,check,152,88,1.727273
4323,heck,153,89,1.719101
981,http,101,60,1.683333
504,next,144,87,1.655172
18,weeks,114,70,1.628571
656,great,108,68,1.588235
5663,hoo,101,64,1.578125
12,days,169,108,1.564815
285,anyone,186,119,1.563025
9,last,206,133,1.548872


## Examine request titles

In [16]:
# Pull the request text
train_titles = new_train['request_title'].str.lower().tolist()

In [23]:
train_titles[4]

"[request] lookin' for za in all the wrong places."

In [24]:
# Use count vectorizer to pull the request vocabulary
title_vectorizer = CountVectorizer(stop_words = stop_words,
                            token_pattern = '\\b(\\w+[\\\']?\\w+)\\b')
title_train = title_vectorizer.fit_transform(train_titles)
title_vocab = title_vectorizer.vocabulary_

In [60]:
# Pull the words only
title_word_list = list(title_vocab.keys())

# Strip the likely non-words (less than 3 characters)
for word in title_word_list:
    if len(word) < 3:
        title_word_list.remove(word)

In [61]:
# Check size of title vocab
len(title_vocab)

2927

In [62]:
len(title_word_list)

2800

In [63]:
# Calculate words by success/fail requests and create dataframe
t_pos_count = []
t_neg_count = []

for word in title_word_list:
    t_pos,t_neg = count_examples(word, train_titles)
    t_pos_count.append(t_pos)
    t_neg_count.append(t_neg)

d_title = {
    'word': title_word_list,
    'n_pos': t_pos_count,
    'n_neg': t_neg_count
}

title_words_df = pd.DataFrame(d_title)

# Add a ratio column
title_words_df['ratio'] = title_words_df.n_pos / title_words_df.n_neg

In [64]:
title_words_df.head()

Unnamed: 0,word,n_pos,n_neg,ratio
0,request,993,992,1.001008
1,flat,4,0,inf
2,broke,132,121,1.090909
3,student,91,81,1.123457
4,love,80,92,0.869565


In [65]:
title_words_df = title_words_df.sort_values('n_pos', ascending = False)

In [66]:
title_words_df

Unnamed: 0,word,n_pos,n_neg,ratio
1679,req,994,994,1.000000
0,request,993,992,1.001008
5,pizza,352,400,0.880000
601,mo,186,198,0.939394
2352,ent,183,179,1.022346
...,...,...,...,...
1661,sleepover,0,1,0.000000
1660,we'd,0,2,0.000000
1659,assignments,0,1,0.000000
1658,wide,0,1,0.000000


In [67]:
top_title_words = title_words_df[:500]

In [68]:
top_title_words[:50]

Unnamed: 0,word,n_pos,n_neg,ratio
1679,req,994,994,1.0
0,request,993,992,1.001008
5,pizza,352,400,0.88
601,mo,186,198,0.939394
2352,ent,183,179,1.022346
219,day,179,175,1.022857
503,bro,151,139,1.086331
2,broke,132,121,1.090909
17,hungry,128,98,1.306122
138,one,127,134,0.947761


In [75]:
# Check which words have high usage and high ratio
top_title_words[top_title_words['n_pos'] > 50].sort_values(
    'ratio', ascending=False)[:25]

Unnamed: 0,word,n_pos,n_neg,ratio
451,pay,69,39,1.769231
17,hungry,128,98,1.306122
992,ex,70,54,1.296296
100,college,66,52,1.269231
25,tonight,52,41,1.268293
945,leg,70,57,1.22807
35,help,91,75,1.213333
295,work,54,45,1.2
791,ate,99,83,1.192771
1156,real,68,59,1.152542
