In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import the necessary libraries and read the provided CSVs as a data frame and perform the below steps. 
# • Merge the provided CSVs into one data-frame.
# • Check a few observations and shape of the data-frame.
# • Round off scores to the nearest integers.
# • Check for missing values. Impute the missing values if there is any.
# • Check for duplicate values and remove them if there is any.
# • Keep only 1000000 data samples. Use random state=612.
# • Drop irrelevant features. Keep features like Author, Product, and Score

In [1]:
!pip install scikit-surprise
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from collections import defaultdict
from surprise import SVD
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split



In [2]:
data1 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_1.csv', encoding='iso-8859-1')
data2 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_2.csv', encoding='iso-8859-1')
data3 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_3.csv', encoding='iso-8859-1')
data4 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_4.csv', encoding='iso-8859-1')
data5 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_5.csv', encoding='iso-8859-1')
data6 = pd.read_csv('/content/drive/MyDrive/Jaanu_notebook/phone_user_review_file_6.csv', encoding='iso-8859-1') 

In [3]:
data = pd.concat([data1,data2,data3,data4,data5,data6],axis=0)

In [4]:
data.shape

(1415133, 11)

In [5]:
data.info

<bound method DataFrame.info of                                 phone_url  ...                                            product
0          /cellphones/samsung-galaxy-s8/  ...                                  Samsung Galaxy S8
1          /cellphones/samsung-galaxy-s8/  ...                                  Samsung Galaxy S8
2          /cellphones/samsung-galaxy-s8/  ...  Samsung Galaxy S8 (64GB) G950U 5.8" 4G LTE Unl...
3          /cellphones/samsung-galaxy-s8/  ...                      Samsung Galaxy S8 64GB (AT&T)
4          /cellphones/samsung-galaxy-s8/  ...                                  Samsung Galaxy S8
...                                   ...  ...                                                ...
163832  /cellphones/alcatel-ot-club_1187/  ...                            Alcatel Club Plus Handy
163833  /cellphones/alcatel-ot-club_1187/  ...                            Alcatel Club Plus Handy
163834  /cellphones/alcatel-ot-club_1187/  ...                            Alcatel Club

In [6]:
data = data.drop_duplicates()

In [7]:
data = data.fillna(data.median())

In [8]:
data['score'] = data['score'].astype('int64') 
data['score_max'] = data['score_max'].astype('int64') 

In [9]:
data1 = data.copy()

In [10]:
data = data.sample(n=1000000, random_state=612)

In [11]:
data1= data1.sample(n=5000, random_state=612)

In [None]:
# Answer the following questions
# • Identify the most rated features.
# • Identify the users with most number of reviews.
# • Select the data with products having more than 50 ratings and users who have given more than 50 ratings. Report the shape of the final 
# dataset.

In [12]:
data.groupby('product')['score'].mean().sort_values(ascending=False).head()  

product
'Sony Xperia X (F5122) â White â Dual Sim (Google Android 6.0.1, 5 Display, 2 x CORTEX A72 1.8 GHz + 4 x cortex-a53...    10.0
LG Leon 4G 8GB 4G Oro - Smartphone (Android, MicroSIM, EDGE, GPRS, GSM, HSPA+, LTE, Barra)                                    10.0
Smartphone LG Nexus 4                                                                                                         10.0
LG LG G Flex2                                                                                                                 10.0
LG LG Stylus 2, 16Gb - Brown                                                                                                  10.0
Name: score, dtype: float64

In [13]:
data['author'].value_counts().head()

Amazon Customer    54600
Cliente Amazon     13634
e-bit               5948
Client d'Amazon     5500
Amazon Kunde        3307
Name: author, dtype: int64

In [14]:
data['product'].value_counts().head()

Lenovo Vibe K4 Note (White,16GB)     3700
Lenovo Vibe K4 Note (Black, 16GB)    3093
OnePlus 3 (Graphite, 64 GB)          2889
OnePlus 3 (Soft Gold, 64 GB)         2522
Huawei P8lite zwart / 16 GB          1895
Name: product, dtype: int64

In [None]:
# Build a popularity based model and recommend top 5 mobile phone

In [19]:
ratings_mean_count = pd.DataFrame(data.groupby('product')['score'].mean()) 
ratings_mean_count['rating_counts'] = pd.DataFrame(data.groupby('product')['score'].count())
ratings_mean_count.sort_values(by=['score','rating_counts'], ascending=[False,False]).head()

Unnamed: 0_level_0,score,rating_counts
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Motorola Smartphone Motorola Moto X Desbloqueado Preto Android 4.2.2 CÃ¢mera 10MP e Frontal 2MP MemÃ³ria Interna de 16GB GSM,10.0,141
Motorola Smartphone Motorola Moto G Dual Chip Desbloqueado TIM Android 4.3 Tela 4.5 8GB 3G Wi-Fi CÃ¢mera 5MP - Preto,10.0,128
Samsung Galaxy Note5,10.0,127
Samsung Smartphone Dual Chip Samsung Galaxy SIII Duos Desbloqueado Claro Azul Android 4.1 3G/Wi-Fi CÃ¢mera 5MP,10.0,121
Nokia Smartphone Nokia Lumia 520 Desbloqueado Oi Preto Windows Phone 8 CÃ¢mera 5MP 3G Wi-Fi MemÃ³ria Interna 8G GPS,10.0,119


In [None]:
# Build a collaborative filtering model using SVD. You can use SVD from surprise or build it from scratch(Note: Incase you’re building it from scratch you 
# can limit your data points to 5000 samples if you face memory issues). Build a collaborative filtering model using kNNWithMeans from surprise. You 
# can try both user-based and item-based model

In [20]:
data = Dataset.load_from_df(data[['author','product','score']], Reader(rating_scale=(1, 10)))
trainset, testset = train_test_split(data, test_size=.25,random_state=50)

In [21]:
svd = SVD(random_state=20)
svd.fit(trainset)
svd_pred = svd.test(testset)

In [22]:
data1 = Dataset.load_from_df(data1[['author','product','score']], Reader(rating_scale=(1, 10)))
trainset, testset = train_test_split(data1, test_size=.25,random_state=50)

In [23]:
knn_i = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
knn_i.fit(trainset)
knn_i_pred = knn_i.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [24]:
knn_u = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
knn_u.fit(trainset)
knn_u_pred = knn_i.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [25]:
svd_pred_df=pd.DataFrame(svd_pred, columns=['uid', 'iid', 'rui', 'est', 'details'])
print('average rating  by test users: ',svd_pred_df['rui'].mean())

average rating  by test users:  8.037344


In [26]:
def get_top_product(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n
top_5_prod = get_top_product(knn_i_pred,5)

In [27]:
svd_cv = cross_validate(svd,data, measures=['RMSE'], cv=5, verbose=False)

In [29]:
print(svd_cv)

{'test_rmse': array([2.46337915, 2.46282894, 2.46420874, 2.47651037, 2.46729506]), 'fit_time': (57.1516375541687, 56.688700914382935, 57.16180396080017, 56.956111669540405, 57.14771127700806), 'test_time': (2.1836013793945312, 2.725874662399292, 2.3535122871398926, 3.012338161468506, 3.056548595428467)}


In [30]:
svd_cv

{'fit_time': (57.1516375541687,
  56.688700914382935,
  57.16180396080017,
  56.956111669540405,
  57.14771127700806),
 'test_rmse': array([2.46337915, 2.46282894, 2.46420874, 2.47651037, 2.46729506]),
 'test_time': (2.1836013793945312,
  2.725874662399292,
  2.3535122871398926,
  3.012338161468506,
  3.056548595428467)}

In [31]:
knn_u_cv = cross_validate(knn_u,data1, measures=['RMSE'], cv=5, verbose=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [32]:
print(knn_u_cv)

{'test_rmse': array([2.56756708, 2.61381849, 2.45850875, 2.59582128, 2.61353114]), 'fit_time': (0.4715151786804199, 0.4637758731842041, 0.5044052600860596, 0.4774971008300781, 0.455275297164917), 'test_time': (0.007775306701660156, 0.008193731307983398, 0.008762121200561523, 0.008186578750610352, 0.008045673370361328)}


In [33]:
knn_u_cv

{'fit_time': (0.4715151786804199,
  0.4637758731842041,
  0.5044052600860596,
  0.4774971008300781,
  0.455275297164917),
 'test_rmse': array([2.56756708, 2.61381849, 2.45850875, 2.59582128, 2.61353114]),
 'test_time': (0.007775306701660156,
  0.008193731307983398,
  0.008762121200561523,
  0.008186578750610352,
  0.008045673370361328)}

In [None]:
# Popularity based recommendation system works according to the trend. For example, if any product which is usually bought by every new user
# then there are chances that it may suggest that item to the user who just signed up.

In [None]:
# Collaborative filtering is useful in scenarios like:
# Giving personalised recommendation to the user.
# Example: Movie Recommender System, Book Recommender system etc.

In [None]:
# Other possible methods can you think of which can further improve the recommendation for different users  
# Recommendation methods like Content plus Collaborative method, Demographic, Utility based, and Knowledge based recommendation system 