In [None]:
import pandas as pd
import seaborn as sn
import numpy as np
%matplotlib inline

# Load data

In [None]:
# Load the request data into a DataFrame.
# NOTE(review): later cells index `df` by column name, so the file presumably holds
# named records rather than a bare array of values; orient='records' would describe
# that layout more accurately than 'values' — confirm against the file.
df = pd.read_json('pizza_data.json', orient='values')

In [None]:
# Number of rows first, then a preview of the first five rows.
print(df.shape[0])
df.head(5)

# Class balance: users who received a pizza vs. those who did not

In [None]:
# Number of rows for each value of the target column.
df['requester_received_pizza'].value_counts()

In [None]:
# Same per-value counts of the target column, drawn as a bar chart.
df['requester_received_pizza'].value_counts().plot.bar()

# Metadata

### Reputation of the requester
Hypothesis: the more active a member is, the better their chances of getting a pizza.

In [None]:
# Frequency of each value of the `requester_number_of_posts_on_raop_at_request` column.
df['requester_number_of_posts_on_raop_at_request'].value_counts()

In [None]:
# Keep only the usernames that occur more than once.
df['requester_username'].value_counts().pipe(lambda counts: counts[counts > 1])

This shows that no username appears more than once: each requester has made only one request in this data set.

In [None]:
# Flair frequencies; dropna=False keeps the rows with a missing flair in the count.
df['requester_user_flair'].value_counts(dropna=False)

  `requester_user_flair`: Users on RAOP receive badges (Reddit calls them flairs), which are small pictures displayed next to their username. In our data set the user flair is either None (neither given nor received pizza, N=4282), "shroom" (received pizza, but not given, N=1306), or "PIF" (given after received, N=83).
  
  -> These numbers confirm that the badges are attributed after the user has received the pizza.

## Relevant information

Which columns can actually help with the prediction?

**Things which do not help**
* Unique identifiers: requester_username, request_id
* Information only available after the gift: giver_username_if_known, requester_user_flair
* Things probably not relevant or not related to RAOP: post_was_edited, requester_account_age_in_days, requester_days, requester_number_of_comments, requester_number_of_posts, requester_upvotes_plus_downvotes, list of subreddit names
* Things which are obviously correlated to another column: request_text_edit_aware, unix_timestamp_of_request_utc

## Find correlated variables
Using the Pearson correlation coefficient and its p-value, on the remaining numeric variables.

In [None]:
# Numeric request/requester columns that are checked for pairwise correlation.
data = df.loc[:, [
    "number_of_downvotes_of_request_at_retrieval",
    "number_of_upvotes_of_request_at_retrieval",
    "requester_number_of_comments_in_raop_at_request",
    "requester_number_of_comments_in_raop_at_retrieval",
    "requester_number_of_posts_on_raop_at_request",
    "requester_number_of_posts_on_raop_at_retrieval",
    "requester_number_of_subreddits_at_request",
    "requester_upvotes_minus_downvotes_at_request",
    "requester_upvotes_minus_downvotes_at_retrieval",
]]

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the selected columns.
sn.heatmap(data.corr(), annot=True, cmap="YlGnBu")

In [None]:
from scipy.stats import pearsonr

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values between numeric columns.

    Rows containing any missing value are dropped first, then only the numeric
    columns are kept. Every pair of remaining columns is tested with
    ``scipy.stats.pearsonr`` and the p-value is rounded to 4 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are the numeric column
        names; the cell at row ``c``, column ``r`` holds the p-value for that pair.
    """
    # `_get_numeric_data` is a private pandas helper; it is kept so that the set
    # of columns treated as numeric stays exactly what it was.
    numeric = df.dropna()._get_numeric_data()
    columns = numeric.columns
    pvalues = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for r in columns:
        for c in columns:
            # Single .loc write: the chained form `pvalues[r][c] = ...` assigns
            # into a temporary under pandas copy-on-write and leaves the matrix empty.
            pvalues.loc[c, r] = round(pearsonr(numeric[r], numeric[c])[1], 4)
    return pvalues

# p-value matrix for the columns selected in `data`.
calculate_pvalues(data)

### Correlation results

We use the threshold recommended by Evans (1996) : **correlation coeff >= 0.6** to decide that 2 variables are strongly correlated. We consider that the correlation test of 2 variables is valid if the **p-value is < 0.05**.
According to these criteria, the following variables are correlated:
* number_of_downvotes_of_request_at_retrieval and number_of_upvotes_of_request_at_retrieval
* requester_number_of_comments_in_raop_at_request and requester_number_of_comments_in_raop_at_retrieval
* requester_number_of_posts_on_raop_at_request and requester_number_of_posts_on_raop_at_retrieval
* requester_upvotes_minus_downvotes_at_request and requester_upvotes_minus_downvotes_at_retrieval (strong)

However, the numbers of downvotes and upvotes obviously do not carry the same information despite their measured correlation, so we keep both of them and merge them into one variable: number_of_upvotes_minus_downvotes_of_request

We observe in the Reddit channel that donors react quite fast to the comments (usually the same day), while there can be several weeks between request and retrieval. Therefore, to stay closer to the situation the donor actually saw, **we chose to keep the variables "at_request"**.

# Text data

Inspiration from https://towardsdatascience.com/text-classification-in-python-dd95d264c802

### Difference between columns text and text_edit_aware

According to https://cs.stanford.edu/~althoff/raop-dataset/,
`We use a set of rules to strip edited comments indicating the success of the request such as "EDIT: Thanks /u/foo, the pizza was delicous".` 

In [None]:
# Working copy of the two text columns, plus a flag telling whether they are identical.
dftxt = df[['request_text', 'request_text_edit_aware']].copy()

dftxt['same_text'] = dftxt['request_text'].eq(dftxt['request_text_edit_aware'])
dftxt['same_text'].value_counts()

In [None]:
# Rows where the two text columns differ.
dftxt.loc[~dftxt['same_text']]

In [None]:
# Rows whose edit-aware text is exactly the single character '*'.
dftxt[dftxt['request_text_edit_aware']=='*']

### Conclusion:
I manually checked a few of the texts where request_text and request_text_edit_aware differ. Often the EDIT in request_text is not about thanking a donor; it adds new contextual information which might have convinced people to give them a pizza.
Therefore **we prefer to use request_text** and discard the column request_text_edit_aware.

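## Clean text

Note: the NLTK resource downloads in the cell below are wrapped in a string, so they are disabled. On a fresh environment, run those lines once before executing the cleaning cells.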

In [None]:
%%capture
!pip install nltk

In [None]:
"""
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')"""

In [None]:
# Sample sentence passed to clean_text in a later cell.
txt = "[Request] College student, pay check delayed for a week, all out of food pantry food, and haven't eaten today"

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Tokens discarded as punctuation: every single character of the string, plus
# whitespace escapes and multi-dot ellipses.
punctuation = [*",.?!(){}[]-_\"'\\;:+*<>@#§^$%&|/", '\n', '\r', '\t', '...', '..']

# English stop words extended with two extra words.
stop_words = set(stopwords.words('english')) | {"request", "edit"}

lemmatizer = WordNetLemmatizer()

# First letter of a POS tag -> WordNet part-of-speech constant.
tag_dict = dict(J=wn.ADJ, N=wn.NOUN, V=wn.VERB, R=wn.ADV)

def extract_wnpostag_from_postag(tag):
    """Look up the upper-cased first letter of `tag` in `tag_dict`; return None if absent."""
    initial = tag[0].upper()
    if initial in tag_dict:
        return tag_dict[initial]
    return None

def lemmatize_tupla_word_postag(tupla):
    """
    Lemmatize one (word, POS tag) pair such as ('guitar', 'NN').

    The word is returned untouched when its tag has no entry in `tag_dict`.
    """
    word, pos = tupla[0], tupla[1]
    wn_tag = extract_wnpostag_from_postag(pos)
    if wn_tag is None:
        return word
    return lemmatizer.lemmatize(word, wn_tag)

def correspondance_miswrite(word):
    """Restore the apostrophe in known apostrophe-less contractions.

    Parameters
    ----------
    word : str
        A single lower-case token.

    Returns
    -------
    str
        "i'm" for "im", "i've" for "ive"; any other word is returned unchanged
        (previously every other word fell through and produced None).
    """
    corrections = {"im": "i'm", "ive": "i've"}
    return corrections.get(word, word)

def clean_text(sentence):
    """Lower-case, tokenize and lemmatize a sentence, then drop punctuation and stop words.

    Returns the surviving lemmas joined by single spaces.
    """
    tokens = word_tokenize(sentence.lower())
    kept = []
    # pos_tag yields (word, tagString) pairs such as ('and', 'CC')
    for pair in nltk.pos_tag(tokens):
        lemma = lemmatize_tupla_word_postag(pair)
        if lemma in punctuation or lemma in stop_words:
            continue
        kept.append(lemma)
    return ' '.join(kept)

In [None]:
# Try the cleaner on the sample sentence.
clean_text(txt)

## Text length

In [None]:
# String length of each title and each request text, stored as new columns on df.
df['len_title'] = df['request_title'].str.len()
df['len_text'] = df['request_text'].str.len()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the title lengths; the trailing ';' hides the repr of the returned object.
plt.figure(figsize=(12.8,6))
sn.histplot(df['len_title']).set_title('Requests title length distribution');

In [None]:
# Requests whose title is longer than 250 characters; the first one is printed as an example.
title_250 = df.loc[df['len_title'] > 250]
print('Num requests with very long title: ', len(title_250))
print('\n Example of a request:')
for banner, column in (('------ Title ----------', 'request_title'),
                       ('------ Text ----------', 'request_text')):
    print(banner)
    print(title_250[column].iloc[0])

In [None]:
# Title length per value of the target column, as box plots.
plt.figure(figsize=(12.8,6))
sn.boxplot(data=df, x='requester_received_pizza', y='len_title');

In [None]:
# Histogram of the request text lengths over all rows.
plt.figure(figsize=(12.8,6))
sn.histplot(df['len_text']).set_title('Requests text length distribution');

In [None]:
# Summary statistics of the request text lengths.
df['len_text'].describe()

In [None]:
plt.figure(figsize=(12.8,6))
# Keep only the rows whose text length is below the 95th percentile.
df_95 = df[df['len_text'] < df['len_text'].quantile(0.95)]
# Trailing ';' added for consistency with the other plotting cells: it hides the
# repr of the object returned by set_title.
sn.histplot(df_95['len_text']).set_title('Requests text length distribution');

In [None]:
# Text length per value of the target column, using the percentile-filtered frame df_95.
plt.figure(figsize=(12.8,6))
sn.boxplot(data=df_95, x='requester_received_pizza', y='len_text');

## Text data reduction

In [None]:
def clean_col(row, col):
    """Return the cleaned version of the text stored under `col` in `row`."""
    return clean_text(row[col])

# Map the cleaner over each text column directly instead of a row-wise
# DataFrame.apply(axis=1): no per-row Series is built, and the result is always a
# Series, even when the frame is empty.
df['cleaned_title'] = df['request_title'].map(clean_text)
df['cleaned_text'] = df['request_text'].map(clean_text)

In [None]:
# Show the cleaned text columns next to the target column.
df[['cleaned_title', 'cleaned_text', 'requester_received_pizza']]

# Prepare data

## Text only

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
# Vocabulary limits for the TF-IDF vectorizer that is built in a later cell.
# No vectorizer is created here: an instance with max_features=50 would be
# overwritten before any use and would contradict max_features below.
min_df = 10         # integer: minimum number of documents a term must appear in
max_df = 1.         # float: maximum share of documents (1.0 = no upper cut)
max_features = 200  # upper bound on the vocabulary size

In [None]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle.
# The inputs are the cleaned titles only; the target column is cast to integers.
# NOTE(review): the split is not stratified and 'cleaned_text' is not used as an
# input — confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_title'], 
                                                    np.array(df['requester_received_pizza']).astype(int), 
                                                    test_size=0.15, 
                                                    random_state=8)



In [None]:
# TF-IDF vectorizer configured with the limits defined in the earlier config cell.
# lowercase=False and stop_words=None: no further lower-casing or stop-word
# filtering is requested from the vectorizer itself.
vectorizer = TfidfVectorizer(stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)

# The vocabulary is fitted on the training split only; the test split is merely transformed.
features_train = vectorizer.fit_transform(X_train).toarray()
labels_train = y_train
print('Train features:', features_train.shape)
features_test = vectorizer.transform(X_test).toarray()
labels_test = y_test
print('Test features:', features_test.shape)
print('Identified words:')
features_names = vectorizer.get_feature_names_out()
# Print at most the first 100 vocabulary entries.
print(features_names[:100])

In [None]:
# chi2 binarises the target internally, so on a two-class problem the statistic is
# identical whichever class is treated as positive: scoring `labels_train == i`
# for i = 0 and i = 1 would print the same ten words twice. Score once, then use
# the sign of the mean tf-idf gap between the classes to decide which class each
# word leans towards.
features_chi2 = chi2(features_train, labels_train)
vocabulary = np.array(features_names)
mean_gap = (features_train[labels_train == 1].mean(axis=0)
            - features_train[labels_train == 0].mean(axis=0))
# Ascending order: the strongest associations come last.
ranked = np.argsort(features_chi2[0])

for i in range(2):
    # Words with a higher mean weight in class i than in the other class.
    leans_to_class = mean_gap > 0 if i == 1 else mean_gap < 0
    indices = ranked[leans_to_class[ranked]]
    feature_names = vocabulary[indices]
    print("# Requester received Pizza {}:".format(bool(i)))
    print("Most correlated words:\n. {}".format('\n. '.join(feature_names[-10:])))
    print("")

In [69]:
import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('pickles', exist_ok=True)

# One entry per object to persist; the key is the file stem under pickles/.
artefacts = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'df': df,
    'features_train': features_train,
    'labels_train': labels_train,
    'features_test': features_test,
    'labels_test': labels_test,
    'vectorizer': vectorizer,
}

for name, artefact in artefacts.items():
    with open('pickles/{}.pickle'.format(name), 'wb') as output:
        pickle.dump(artefact, output)
