In [1]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('recommendation_data.csv')

print(data.head())

   Unnamed: 0                                  _id        asin  overall  \
0           0  Row(oid='644ead17f66a6378caeef939')  B017O9P72A        1   
1           1  Row(oid='644ead17f66a6378caeef944')  B017O9P72A        2   
2           2  Row(oid='644ead17f66a6378caeef959')  B017O9P72A        3   
3           3  Row(oid='644ead17f66a6378caeef986')  B017O9P72A        1   
4           4  Row(oid='644ead17f66a6378caeef9a1')  B017O9P72A        2   

                                          reviewText   reviewTime  \
0  The service works with google home, but doesn'...  12 29, 2017   
1  I have to tell alexa to tell lifx to do things...  07 24, 2017   
2  Null message fixed by logging out!!! Log out a...   01 5, 2017   
3  When I try to link this app to the alexa app I...  09 15, 2016   
4  Horrible,  absolutely amateur coding without t...  03 10, 2016   

       reviewerID                                            summary  \
0   AA4DHYT5YSSIT                                      Does no

In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Print the shape of the training and testing data to verify the split
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (900, 10)
Testing data shape: (225, 10)


# 1. Colabarative Filtering

In [3]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate

# Create a reader object to parse the data
reader = Reader(rating_scale=(1, 5))

# Load the data into the Surprise Dataset format
train_dataset = Dataset.load_from_df(train_data[['reviewerID', 'asin', 'overall']], reader)

# Define the collaborative filtering algorithm (e.g., KNNBasic)
cf_model = KNNBasic()

# Perform cross-validation on the training data
cross_validate(cf_model, train_dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the collaborative filtering model on the entire training data
trainset = train_dataset.build_full_trainset()
cf_model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5821  1.5817  1.5625  1.5803  1.5486  1.5710  0.0134  
MAE (testset)     1.4162  1.4228  1.3767  1.4003  1.3781  1.3988  0.0189  
Fit time          0.02    0.02    0.00    0.02    0.00    0.01    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x21a0d0ab700>

# 2. Content Based Filtering

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create a TF-IDF vectorizer to convert text features into numerical vectors
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF vectors for the reviewText column
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['reviewText'].values.astype('U'))

# Compute cosine similarities between all items using the TF-IDF vectors
item_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get top N similar items based on cosine similarity
def get_similar_items(item_id, top_n=5):
    item_index = train_data[train_data['asin'] == item_id].index[0]
    similar_indices = item_similarities[item_index].argsort()[::-1][1:top_n+1]
    similar_items = train_data.iloc[similar_indices]['asin'].values
    return similar_items

# Example usage: Get top 5 similar items for a given item ID
item_id=input('Enter Item id i.e. asin: ')
#item_id = 'B017O9P72A'
similar_items = get_similar_items(item_id, top_n=10)
print("Top 10 recommendations:")
print(similar_items)

Enter Item id i.e. asin: B017O9P72A
Top 10 recommendations:
['B015TJD0Y4' 'B015S1SWLO' 'B015TJD0Y4' 'B00XNQECFM' 'B00U33Q940'
 'B00VXS8E8S' 'B015S1SWLO' 'B00VXS8E8S' 'B00VXS8E8S' 'B00VXS8E8S']


# 3. Matrix Factoerization

In [5]:
from surprise import SVD

# Define the matrix factorization algorithm (e.g., SVD)
mf_model = SVD()

# Perform cross-validation on the training data
cross_validate(mf_model, train_dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the matrix factorization model on the entire training data
mf_model.fit(trainset)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5909  1.6429  1.4797  1.4000  1.4938  1.5215  0.0858  
MAE (testset)     1.3867  1.4094  1.3081  1.2259  1.2960  1.3252  0.0661  
Fit time          0.02    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21a080c0ca0>

# Content-Based Filtering Model Metrics Precision, f1 score, Recall

In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Define a threshold to convert ratings into classes (e.g., positive and negative)
threshold = 3.5

# Convert the continuous ratings into binary classes
true_classes = [1 if rating >= threshold else 0 for rating in train_data['overall']]
predicted_classes = [1 if item in similar_items else 0 for item in train_data['asin']]

# Compute precision, recall, and F1 score for the content-based filtering model
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)
f1 = f1_score(true_classes, predicted_classes)

print("Content-Based Filtering Model Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Content-Based Filtering Model Metrics:
Precision: 0.7032967032967034
Recall: 0.23063063063063063
F1 Score: 0.34735413839891455


# Matrix Factorization Precision, f1 score, Recall

In [7]:
# Matrix Factorization Precision, f1 score, Recall

# Generate predictions on the test set
test_dataset = Dataset.load_from_df(test_data[['reviewerID', 'asin', 'overall']], reader)
testset = test_dataset.build_full_trainset().build_testset()
mf_predictions = mf_model.test(testset)

# Convert ratings into classes based on a threshold
threshold = 3.6
true_classes = [1 if rating >= threshold else 0 for rating in test_data['overall']]
predicted_classes = [1 if pred.est >= threshold else 0 for pred in mf_predictions]

# Compute precision, recall, and F1 score for the matrix factorization model
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)
f1 = f1_score(true_classes, predicted_classes)

# Print the evaluation results
print("Matrix Factorization Model Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Matrix Factorization Model Metrics:
Precision: 0.7012987012987013
Recall: 0.8244274809160306
F1 Score: 0.7578947368421054


In [8]:
from flask import Flask, render_template, request

In [9]:
app = Flask(__name__)

In [10]:
from flask import Flask, render_template, request
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic

data = pd.read_csv(r'C:\Users\ihiba\OneDrive\Desktop\recommendation_data.csv')

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create a reader object to parse the data
reader = Reader(rating_scale=(1, 5))

# Load the data into the Surprise Dataset format
train_dataset = Dataset.load_from_df(train_data[['reviewerID', 'asin', 'overall']], reader)

# Create a TF-IDF vectorizer to convert text features into numerical vectors
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['reviewText'].values.astype('U'))

# Compute cosine similarities between all items using the TF-IDF vectors
item_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('recommendation.html')

@app.route('/recommendations', methods=['GET'])
def recommend_items():
    item_id = request.args.get('asin')
    top_n = int(request.args.get('top_n', 5))
    similar_items = get_similar_items(item_id, top_n)
    return render_template('recommendation.html', item_id=item_id, items=similar_items)

def get_similar_items(item_id, top_n=5):
    item_index = train_data[train_data['asin'] == item_id].index[0]
    similar_indices = item_similarities[item_index].argsort()[::-1][1:top_n+1]
    similar_items = train_data.iloc[similar_indices][['asin', 'overall', 'reviewText']]
    return similar_items

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
