In [3]:
import re
from nltk.tokenize import RegexpTokenizer
import gensim
import nltk
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import words, stopwords
from multiprocessing import Pool
import scipy.stats as stats
import numpy as np
import statsmodels.stats.multitest as smm
import pandas as pd


def remove_special_characters(input_string):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Parameters
    ----------
    input_string : str
        Text to clean.

    Returns
    -------
    str
        ``input_string`` with all non-matching characters deleted (nothing is
        substituted in their place, so neighbouring characters are joined).
    """
    # Negated character class: anything outside a-z, A-Z, 0-9 and \s is removed.
    return re.sub(r'[^a-zA-Z0-9\s]', '', input_string)

def is_real_word(word):
    """Return True when the lower-cased ``word`` is an entry of the NLTK ``words`` corpus.

    Parameters
    ----------
    word : str
        Candidate word; it is lower-cased before the lookup.

    Returns
    -------
    bool
        Whether ``word.lower()`` appears in ``words.words()``.

    Notes
    -----
    The corpus is read once, on the first call, and cached on the function as
    a frozenset. Previously every call re-read the corpus and ran a membership
    test against the fresh result. The lookup itself is unchanged: an exact
    match of the lower-cased input against the corpus entries as stored.
    """
    vocabulary = getattr(is_real_word, "_vocabulary", None)
    if vocabulary is None:
        vocabulary = frozenset(words.words())
        is_real_word._vocabulary = vocabulary
    return word.lower() in vocabulary

def prepare_df_for_text_analysis(df, column_list, columns_to_keep):
    """Tokenise the text columns of ``df`` and return the kept columns plus cleaned tokens.

    Pipeline: the columns in ``column_list`` are stringified and joined per row
    with spaces; brackets, commas, newlines and special characters are removed;
    the text is lower-cased; ``http...`` runs are deleted; the text is tokenised
    with ``gensim.utils.simple_preprocess``; rows with missing values in the
    kept columns are dropped; finally English stop words and tokens absent from
    the NLTK ``words`` corpus are filtered out.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. NOTE: as before, this frame gains (or has overwritten) a
        ``'text'`` column and a ``'tokens'`` column holding the *unfiltered*
        tokens. That side effect is preserved because existing callers may
        read ``df["tokens"]`` afterwards.
    column_list : list of str
        Names of the text columns to analyse.
    columns_to_keep : list of str
        Non-text columns (y-variable, covariates, labels) to carry over.
        This list is no longer modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``columns_to_keep`` followed by ``'tokens'`` (the
        filtered token lists), without rows containing missing values.
    """
    df['text'] = df[column_list].apply(lambda row: ' '.join(map(str, row)), axis=1)

    # clean values
    df['text'] = [
        remove_special_characters(
            x.replace("[", "").replace("]", "").replace(",", " ")
             .replace("\n", " ").replace("  ", " ").replace("nnn", " ")
        )
        for x in df['text']
    ]

    # standardize, make them all lowercase, remove urls
    df['text'] = df['text'].str.lower()
    df['text'] = df['text'].apply(lambda elem: re.sub(r"http\S+", "", elem))  # get rid of URLs

    # convert to tokens
    df["tokens"] = [gensim.utils.simple_preprocess(x) for x in df["text"]]

    # Build the selection on a private list: appending to the caller's list
    # made a second call with the same list select 'tokens' twice.
    kept_columns = [col for col in columns_to_keep if col != "tokens"] + ["tokens"]

    # Explicit copy so the assignment below targets an independent frame
    # rather than something pandas may treat as a slice of ``df``.
    result = df[kept_columns].dropna().copy()

    # remove stop words and words that aren't words
    stop_words = set(stopwords.words('english'))
    english_words = set(words.words())

    def clean_tokens(tokens):
        """Keep tokens that are dictionary words and not stop words."""
        return [word for word in tokens if word not in stop_words and word in english_words]

    result['tokens'] = result['tokens'].apply(clean_tokens)

    return result

# Multiprocessing version
# Define a function to calculate statistics for a given word
def calculate_statistics(payload):
    """Compare the y-variable between rows that contain a word and rows that do not.

    Parameters
    ----------
    payload : dict
        ``'word'``: the token to look for in each row's ``'tokens'`` list.
        ``'y_variable'``: name of the numeric column to compare.
        ``'df'`` (optional): the frame to analyse. When absent or ``None`` the
        module-level ``processed_df`` is used, exactly as before.

    Returns
    -------
    dict
        ``word``, ``negative_counts`` / ``positive_counts`` (rows without /
        with the word), ``negative_mean`` / ``positive_mean`` (mean of the
        y-variable in each group), ``p_value`` and ``coefficient`` (the p-value
        and statistic returned by ``scipy.stats.mannwhitneyu``).
        If anything fails while computing them, every field except ``word`` is
        ``None`` so that one bad word does not abort a whole batch.
    """
    y_variable = payload['y_variable']
    word = payload['word']
    try:
        frame = payload.get('df')
        if frame is None:
            frame = processed_df

        # Local mask instead of writing a scratch 'is_in_df' column into the
        # shared frame.
        contains_word = frame['tokens'].apply(lambda tokens: word in tokens).astype(bool)

        positive_array = frame.loc[contains_word, y_variable]
        negative_array = frame.loc[~contains_word, y_variable]

        negative_mean = negative_array.mean()
        positive_mean = positive_array.mean()

        statistic, p_value = stats.mannwhitneyu(positive_array, negative_array)

        return {
            'word': word,
            'negative_counts': len(negative_array),
            'positive_counts': len(positive_array),
            'negative_mean': negative_mean,
            'positive_mean': positive_mean,
            'p_value': p_value,
            'coefficient': statistic
        }
    except Exception:
        # Best effort by design, but narrower than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return {
            'word': word,
            'negative_counts': None,
            'positive_counts': None,
            'negative_mean': None,
            'positive_mean': None,
            'p_value': None,
            'coefficient': None
        }

[nltk_data] Downloading package words to /Users/hansenhan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hansenhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the randomly generated example dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer="example.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,product_name,product_reviews
0,0,Premium Green Cotton Shoulder Bag Bag,7357.714326
1,1,Luxurious Blue Jute Tote Bag,6165.224362
2,2,Trendy Red Velvet Shoulder Bag Bag,6170.669836
3,3,Stylish Gray Velvet Clutch Bag,4413.049808
4,4,Classic Green Denim Duffel Bag,623.333625


In [7]:
include_bigrams = True

# column_list: all of the columns with text that we want to analyze.
# columns_to_keep: the y-variable plus any covariates/labels needed later.
processed_df = prepare_df_for_text_analysis(
    df=df,
    column_list=['product_name'],
    columns_to_keep=['product_reviews'],
)
processed_df.head(n=5)

Unnamed: 0,product_reviews,tokens
0,7357.714326,"[premium, green, cotton, shoulder, bag, bag]"
1,6165.224362,"[luxurious, blue, jute, tote, bag]"
2,6170.669836,"[red, velvet, shoulder, bag, bag]"
3,4413.049808,"[stylish, gray, velvet, clutch, bag]"
4,623.333625,"[classic, green, denim, duffel, bag]"


In [8]:
# Extract the unique words from the *cleaned* tokens. The statistics are
# computed against processed_df, so the vocabulary must come from it too
# (the raw tokens contain stop words / non-dictionary words that can never
# match a row of processed_df).
all_words = [word for tokens in processed_df["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in processed_df["tokens"]]
VOCAB = sorted(set(all_words))

# List of words to process
word_list = VOCAB  # Replace with your actual word list
y_variable = "product_reviews"
payloads = [{"word": word, "y_variable": y_variable} for word in word_list]

# Calculate statistics for each word in parallel. The context manager tears
# the workers down even if map() raises.
# You can adjust the number of processes as needed.
with Pool(processes=6) as pool:
    results = pool.map(calculate_statistics, payloads)

# Create a DataFrame with the results
stats_results = pd.DataFrame(results)

# Remove invalid values: failed words come back with p_value=None, which
# to_numeric turns into NaN; isfinite then rejects NaN and +/-Inf in one mask.
# .copy() makes the filtered frame independent so the column assignments
# below do not target a slice.
p_values = pd.to_numeric(stats_results['p_value'], errors='coerce')
is_valid = np.isfinite(p_values)
stats_results = stats_results[is_valid].copy()

# Run BH correction on the 'p_value' column. The p-values are passed in row
# order (NOT sorted): multipletests returns its results in the order of its
# input, so sorting first would attach each corrected value to the wrong word.
rejected, corrected_p_values, _, _ = smm.multipletests(
    p_values[is_valid].to_numpy(dtype=float), method='fdr_bh'
)

# Add the BH-corrected p-values and 'rejected' column to the DataFrame
stats_results['fdr'] = corrected_p_values
stats_results['rejected'] = rejected
stats_results['effect_size'] = stats_results['positive_mean'] / stats_results['negative_mean']
stats_results




Unnamed: 0,word,negative_counts,positive_counts,negative_mean,positive_mean,p_value,coefficient,fdr,rejected,effect_size
0,backpack,55.0,5.0,4698.788167,5674.049466,0.549518,161.0,0.985937,False,1.207556
2,beige,53.0,7.0,4777.650318,4798.30424,0.856993,194.0,0.985937,False,1.004323
3,black,56.0,4.0,4699.287464,5910.87463,0.380477,143.0,0.985937,False,1.257824
4,blue,50.0,10.0,4685.593046,5252.394423,0.558451,280.0,0.985937,False,1.120967
5,brown,52.0,8.0,4916.786866,3891.334933,0.32895,162.0,0.985937,False,0.791439
6,bucket,51.0,9.0,4661.948597,5449.357566,0.431464,268.0,0.985937,False,1.168901
7,canvas,57.0,3.0,4742.812465,5487.761998,0.700409,98.0,0.985937,False,1.157069
8,checkered,58.0,2.0,4796.006136,4317.620301,0.950282,56.0,0.985937,False,0.900253
9,chic,53.0,7.0,4685.021294,5499.638279,0.603775,209.0,0.985937,False,1.173877
10,classic,53.0,7.0,4858.760204,4184.186532,0.603775,162.0,0.985937,False,0.861163
