In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# Load data from Google Sheets
url = "https://docs.google.com/spreadsheets/d/1iDsbDwcxj5nygYZASAIIlY7Jo58tzrxK4qc4Xemj6vU/gviz/tq?tqx=out:csv&gid=465338724"
data = pd.read_csv(url)

Name_input = 'Kevin Bradley'

In [3]:
# Preprocess Data
label_encoder = LabelEncoder()
data['Reccomended_Type_Encoded'] = label_encoder.fit_transform(data['Reccomended_Type'])

features = data[['Reccomended_Type_Encoded', 'Travel_steve_mountains', 'Cancun_rating', 'Egypt_ranking',
                 'Atlanta_ranking', 'Tahiti_ranking', 'Grand_Canyon_ranking', 'Tools_tableau',
                 'Tools_PowerBI', 'Tools_Python', 'Tools_R', 'Tools_Excel',
                 'Tools_GSheets', 'Rate the following movie genres [Science Fiction]', 'Rate the following movie genres [Drama]',
                 'Rate the following movie genres [Comedy]', 'Rate the following movie genres [Horror]',
                 'Rate the following movie genres [Action]', 'Rate the following movie genres [Epic]', 'Rate the following movie genres [Fantasy]']]
target = data['Reccomended_Title']

In [4]:
# Impute missing values
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Standardize features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_imputed)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(features_standardized, target)

RandomForestClassifier(random_state=42)

In [5]:
# Prepare Person's data
person_data = data[data['Name'] == Name_input]
X_person_standardized = scaler.transform(imputer.transform(person_data[['Reccomended_Type_Encoded', 'Travel_steve_mountains', 
                                                                        'Cancun_rating', 'Egypt_ranking',
                                       'Atlanta_ranking', 'Tahiti_ranking', 'Grand_Canyon_ranking',
                                       'Tools_tableau', 'Tools_PowerBI', 'Tools_Python', 'Tools_R', 'Tools_Excel',
                                       'Tools_GSheets', 'Rate the following movie genres [Science Fiction]', 
                                       'Rate the following movie genres [Drama]', 'Rate the following movie genres [Comedy]',
                                       'Rate the following movie genres [Horror]', 'Rate the following movie genres [Action]',
                                       'Rate the following movie genres [Epic]', 'Rate the following movie genres [Fantasy]']]))

# Predict probabilities
proba_person = rf_model.predict_proba(X_person_standardized)

# Output top 5 recommendations for Person
recommended_titles = target.unique()
top_5_indices = np.argsort(proba_person[0])[-5:][::-1]
top_5_titles = recommended_titles[top_5_indices]

# Feature Importance
importance_vals = rf_model.feature_importances_
importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': importance_vals}).sort_values(by='Importance', ascending=False)

In [6]:
# Predicting on the whole dataset to calculate accuracy and F1 score
predictions = rf_model.predict(features_standardized)

person_recommended_type = label_encoder.inverse_transform([person_data['Reccomended_Type_Encoded'].values[0]])[0]

# Calculate accuracy and F1 score
accuracy = accuracy_score(target, predictions)
f1 = f1_score(target, predictions, average='weighted')  # Use weighted average for multi-class classification

# Create a list to hold recommendations and explanations
recommendations_list = []

# Prepare the detailed explanation for each recommended title
for idx, title in enumerate(top_5_titles):
    explanation_list = []
    
    # Loop through each feature to generate explanations
    for i, feature in enumerate(features.columns):
        person_value = person_data[feature].values[0]
        importance = importance_vals[i]
        
        # Check if it's the 'Reccomended_Type_Encoded' column to decode its original value
        if feature == 'Reccomended_Type_Encoded':
            explanation_list.append(f"{Name_input}'s recommended type was '{person_recommended_type}', "
                                    f"and this feature had an importance score of {importance:.4f}.")
        elif importance > 0.01:  # Only include features with notable importance
            explanation_list.append(f"{Name_input}'s value on the survey for '{feature}' was {person_value}, "
                                    f"and this feature was important with an importance score of {importance:.4f}.")
    
    # Add the recommendation and explanation to the list
    recommendations_list.append({'Recommendation': title, 'Explanation': '\n'.join(explanation_list)})

# Convert the list to a DataFrame using concat
recommendations_df = pd.concat([pd.DataFrame([rec]) for rec in recommendations_list], ignore_index=True)

In [11]:
# First, split the 'Explanation' column by '\n' to create lists
recommendations_df['Explanation_Split'] = recommendations_df['Explanation'].str.split('\n')

# Use explode to split each explanation into a separate row
recommendations_df_exploded = recommendations_df.explode('Explanation_Split')

# Rename the column for better clarity
recommendations_df_exploded.rename(columns={'Explanation_Split': 'Explanation_Part'}, inplace=True)

# Print model accuracy and F1 score
print(f"Model Accuracy: {accuracy:.4f}")
print(f"Model F1 Score: {f1:.4f}")

recommendations_df_exploded = recommendations_df_exploded.drop(columns=['Explanation'])
recommendations_df_exploded

KeyError: 'Explanation'

In [8]:
# User inputs the index of the recommendation they'd like to reject
rejected_index = int(input("Enter the index of the recommendation you'd like to reject (0-4): "))
rejected_recommendation = top_5_titles[rejected_index]
print(f"You rejected: {rejected_recommendation}")

# Create a DataFrame with recommendations and include a column indicating dislikes
recommendations_df = pd.DataFrame({
    'Recommendation': top_5_titles,
    'Strength': proba_person[0][top_5_indices],
    'Disliked': [title == rejected_recommendation for title in top_5_titles]  # Add 'Disliked' column
})

# Display the updated DataFrame with the Disliked column before model selection
print("Updated Recommendations with Dislike feedback:")
print(recommendations_df)

# Define a list of models to compare
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42)
}

# Store model accuracies
model_accuracies = {}

# Train each model and calculate accuracy
for model_name, model in models.items():
    # Fit the model on the entire dataset
    model.fit(features_standardized, target)
    
    # Get predictions and calculate accuracy
    predictions = model.predict(features_standardized)
    accuracy = accuracy_score(target, predictions)
    model_accuracies[model_name] = accuracy

# Select the best model based on accuracy
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model = models[best_model_name]

print(f"Best model selected: {best_model_name} with accuracy: {model_accuracies[best_model_name]:.4f}")

# Re-run predictions with the best model
proba_person = best_model.predict_proba(X_person_standardized)

# Get the top recommendations
all_titles = target.unique()

# Sort the remaining valid recommendations by their probabilities
top_5_indices = np.argsort(proba_person[0])[-5:][::-1]
new_top_5_titles = all_titles[top_5_indices]

# Update the DataFrame with new top recommendations, retaining the 'Disliked' column
new_recommendations_df = pd.DataFrame({
    'Recommendation': new_top_5_titles,
    'Strength': proba_person[0][top_5_indices],
    'Disliked': [title == rejected_recommendation for title in new_top_5_titles]  # Carry over 'Disliked' feedback
})

# Display the updated DataFrame with new recommendations and feedback
print("New Top 5 Recommendations (after rejection):")
new_recommendations_df

Enter the index of the recommendation you'd like to reject (0-4):  0


You rejected: Culture of Honor
Updated Recommendations with Dislike feedback:
                                      Recommendation  Strength  Disliked
0                                   Culture of Honor  0.650000      True
1                                   Girls Gone Bible  0.120833     False
2                                 Rich Dad, Poor Dad  0.108500     False
3      Blink: The Power of Thinking Without Thinking  0.100667     False
4  How I climbed a 3,000- foot vertical cliff -- ...  0.010000     False


NameError: name 'LogisticRegression' is not defined