In [None]:
import pandas as pd
import spacy
import joblib

# Resources shared by the cells below. The three loads are independent of each other.
# spaCy pipeline: supplies the part-of-speech tags used to pick adjectives out of reviews.
nlp = spacy.load('en_core_web_sm')
# Review table; later cells read its ProductId, Summary, Score and ProfileName columns.
df = pd.read_csv('Reviews.csv')
# Name -> gender classifier. NOTE: joblib.load unpickles the file, which can execute
# arbitrary code, so only load model files that come from a trusted source.
loaded_model = joblib.load('gender_predictor.pkl')

In [40]:
# Product to analyse -- replace with user input or any other desired productId.
productId = 'B007JFMH8M'

# Adjective -> occurrence count, filled in by analyze_review():
# `pros` from positive reviews, `cons` from negative ones.
# Defined in this cell so that re-running the cell starts from empty tallies.
pros, cons = {}, {}

# Generic sentiment/quantity adjectives that say nothing specific about a product.
_EXCLUDED_ADJECTIVES = frozenset(
    ['good', 'bad', 'great', 'amazing', 'wonderful', 'best', 'many']
)


def analyze_review(review, rating):
    """Extract adjective "aspects" from a review and tally them as pros or cons.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    token tagged ``ADJ`` is lower-cased and kept unless it is one of the
    generic words in ``_EXCLUDED_ADJECTIVES``.

    Side effects:
        * ``rating >= 4``: each aspect increments its count in the global ``pros``.
        * ``rating <= 2``: each aspect increments its count in the global ``cons``.
        * any other rating (e.g. 3): neither tally is touched.

    Args:
        review: Review text. A non-string value (e.g. the NaN pandas uses for
            a missing cell) is treated as a review with no aspects.
        rating: Numeric score of the review.

    Returns:
        list[str]: The extracted aspects in order of appearance (duplicates
        kept), so callers can store them per review. Empty for non-string input.
    """
    # spaCy only accepts str; skip missing/non-text reviews instead of crashing.
    if not isinstance(review, str):
        return []

    # Extract adjectives (possible pros/cons)
    aspects = []
    for token in nlp(review):
        word = token.text.lower()
        if token.pos_ == 'ADJ' and word not in _EXCLUDED_ADJECTIVES:
            aspects.append(word)

    if rating >= 4:
        # Positive review
        tally = pros
    elif rating <= 2:
        # Negative review
        tally = cons
    else:
        # Neutral review: aspects are returned but not counted.
        tally = None

    if tally is not None:
        for aspect in aspects:
            tally[aspect] = tally.get(aspect, 0) + 1

    return aspects

def predict_gender(name):
    """Classify a reviewer name as 'Male' or 'Female' with the loaded model.

    The name is lower-cased and handed to ``loaded_model.predict`` as a
    one-element list. A predicted label equal to 0 is reported as 'Male';
    any other label is reported as 'Female'.
    """
    label = loaded_model.predict([name.lower()])[0]
    return 'Male' if label == 0 else 'Female'

def _top_aspects(counts, n=5):
    """Return the n most frequent aspects as a dict ordered by descending count.

    The sort is stable, so aspects with equal counts keep their insertion order.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:n])


# Select the reviews for the requested product once; every step below reuses this mask.
product_mask = df['ProductId'] == productId
product_reviews = df.loc[product_mask]

# Extract Pros/Cons and Gender from each matching productId.
# analyze_review() tallies adjectives into the module-level pros/cons dicts as a side
# effect; whatever it returns is stored per review in 'Extracted Aspects'.
extracted_aspects = [
    analyze_review(summary, score)
    for summary, score in zip(product_reviews['Summary'], product_reviews['Score'])
]
# dtype=object keeps one entry per review regardless of what analyze_review returned.
df.loc[product_mask, 'Extracted Aspects'] = pd.Series(
    extracted_aspects, index=product_reviews.index, dtype=object
)
df.loc[product_mask, 'Gender'] = product_reviews['ProfileName'].map(predict_gender)

# Calculate gender proportion
num_reviews = len(product_reviews)
num_males = int((df.loc[product_mask, 'Gender'] == 'Male').sum())
# No matching reviews means there is no proportion to report (avoids dividing by zero).
male_proportion = num_males / num_reviews if num_reviews else None

# Create DataFrames for Pros and Cons for visualization
top_pros = _top_aspects(pros)  # Top 5 Pros
top_cons = _top_aspects(cons)  # Top 5 Cons
pros_df = pd.DataFrame(list(top_pros.items()), columns=['Aspect', 'Frequency'])
cons_df = pd.DataFrame(list(top_cons.items()), columns=['Aspect', 'Frequency'])

# Output results: under 40% male -> female-leaning, 40-60% -> both, otherwise male-leaning.
if male_proportion is None:
    print('No reviews found for product {}.'.format(productId))
elif male_proportion < 0.4:
    print('This product is most popular among females.')
elif male_proportion < 0.6:
    print('This product is popular among both males and females.')
else:
    print('This product is most popular among males')

print('\nTop Pros:')
print(pros_df)

print('\nTop Cons:')
print(cons_df)

This product is popular among both males and females.

Top Pros:
      Aspect  Frequency
0      yummy        118
1       soft         73
2  delicious         22
3    oatmeal         14
4      tasty         11

Top Cons:
    Aspect  Frequency
0      dry          2
1    handy          1
2  oatmeal          1
3     much          1
4  healthy          1
