In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [2]:
phones = pd.read_csv("phones.csv")
rankings = pd.read_csv("rankings.csv")

### Exploring Data

In [3]:
ranked_phones = set(np.unique(np.concatenate([rankings["top"], rankings["second"], rankings["third"]])))
known_phones = set(phones["Phone"])

There are 28 phones that are recommended, but 37 phones in our phone database.

In [4]:
phones.head(2)

Unnamed: 0,Phone,OS,Price,Area,Camera,Battery,Speed
0,OnePlus 8,Android,,18.1,7.0,7.07,7.3
1,OnePlus 8T,Android,399.0,18.5,7.0,6.95,7.5


In [5]:
rankings.head(2)

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third,confidence,difficulty
0,Must be Android,400.0,Large,A little,A little,A lot,Motorola Moto G100,Motorola Edge (2021),Samsung Galaxy A42,OK,Medium
1,Must be Android,400.0,Large,A ton,A little,I need speed,Google Pixel 6,OnePlus 9,Samsung Galaxy S20 FE,OK,Hard


In [6]:
rankings.describe()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third,confidence,difficulty
count,177,177,177,177,177,177,177,177,177,177,177
unique,5,4,4,5,4,4,21,25,27,3,3
top,No preference,Not sure,No preference,A little,Not sure,A little,Apple iPhone 13 Pro Max,Apple iPhone 13 Pro,Apple iPhone 13,OK,Medium
freq,62,74,66,38,79,68,21,22,24,90,99


### Cleaning & Preparating Phones DF

In [7]:
#Drop rows with NA prices since they aren't in ranking table either
phones.dropna(axis=0,inplace=True)

In [8]:
#Standardized phones DF to calculate rating scores for each phone.
phones2=phones.copy()

col_names=['Price', 'Area', 'Camera', 'Battery', 'Speed']

for column in col_names:
    phones2['{}'.format(column)] = stats.zscore(phones2['{}'.format(column)])

In [9]:
#Calculating rating for each phone & adding it to the original phone table.
phones['Rating']=phones2.Price+phones2.Area+phones2.Camera+phones2.Battery+phones2.Speed

In [10]:
phones.head()

Unnamed: 0,Phone,OS,Price,Area,Camera,Battery,Speed,Rating
1,OnePlus 8T,Android,399.0,18.5,7.0,6.95,7.5,-1.070986
3,OnePlus 9,Android,599.0,18.4,8.5,6.22,8.5,0.391285
4,OnePlus 9 Pro,Android,799.0,18.6,8.7,5.98,8.5,1.171591
5,Motorola Edge (2021),Android,449.0,19.8,6.5,8.17,6.5,-0.011016
6,Motorola Edge 30 Pro,Android,899.0,19.2,8.3,5.98,8.5,1.626206


### Cleaning & Preparating Ranking DF

In [11]:
#Drop confidence & difficulty columns
rankings.drop(columns=['confidence','difficulty'], inplace=True); 

In [12]:
#Exploring column scales
print(set(rankings['Camera Importance']))
print(set(rankings['Usage Hours (Battery)']))
print(set(rankings['Speed Importance']))
print(set(rankings['Size']))
print(set(rankings['OS']))
print(set(rankings['Size']))
print(set(rankings['Budget']))

{'Not sure', 'A little', 'A ton', 'A lot', 'Somewhat'}
{'Not sure', 'A ton', 'A lot', 'A little'}
{'A lot', 'I need speed', 'Not sure', 'A little'}
{'Large', 'Small', 'Medium', 'No preference'}
{'Must be iPhone', 'Prefer iPhone', 'Must be Android', 'Prefer Android', 'No preference'}
{'Large', 'Small', 'Medium', 'No preference'}
{'Not sure', '800.00', '400.00', '600.00'}


In [13]:
#Changing string scales to numerical ordinal scales

camera_scale = {'A ton':4, 'A lot':3, 'Somewhat':2, 'A little':1, 'Not sure':0}
battery_scale = {'A ton':3, 'A lot':2, 'A little':1, 'Not sure':0}
speed_scale = {'A lot':3, 'I need speed':2, 'A little':1, 'Not sure':0}
size_scale={'Large':3,'Medium':2,  'Small':1, 'No preference':0}
os_scale={'Must be iPhone':4, 'Prefer iPhone':3, 'No preference':2, 'Prefer Android':1, 'Must be Android':0}
budget_scale={'400.00':400,'600.00':600,'800.00':800,'Not sure':700}

rankings['Camera Importance'] = rankings['Camera Importance'].apply(lambda x: camera_scale[x])
rankings['Usage Hours (Battery)'] = rankings['Usage Hours (Battery)'].apply(lambda x: battery_scale[x])
rankings['Speed Importance'] = rankings['Speed Importance'].apply(lambda x: speed_scale[x])
rankings['OS'] = rankings['OS'].apply(lambda x: os_scale[x])
rankings['Size'] = rankings['Size'].apply(lambda x: size_scale[x])
rankings['Budget'] = rankings['Budget'].apply(lambda x: budget_scale[x])

In [14]:
rankings.head(3)

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third
0,0,400,3,1,1,3,Motorola Moto G100,Motorola Edge (2021),Samsung Galaxy A42
1,0,400,3,4,1,2,Google Pixel 6,OnePlus 9,Samsung Galaxy S20 FE
2,0,400,2,0,0,2,OnePlus 8T,Motorola Moto G100,OnePlus 9


### Train-Test Split for Training & Evaluation

In [15]:
#Preparing data for split
X=rankings[['OS','Budget','Size','Camera Importance','Usage Hours (Battery)','Speed Importance']]
y1=rankings[['top']]
y2=rankings[['second']]
y3=rankings[['third']]

In [16]:
#Split with top recommendation
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.33, random_state=1, shuffle=True)
#Split with second recommendation
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.33, random_state=1, shuffle=True)
#Split with third recommendation
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=0.33, random_state=1, shuffle=True)

## Recommender Algorithm

### Hyperparameter Tuning for KNN

In [17]:
#Parameters to be tested
leaf_sizes=[15,30,45,60,80]
p_options=[1,2]

#Grid Searching Parameters
for p in p_options:
    for leaf_size in leaf_sizes:
        
        nn_model=KNeighborsClassifier(n_neighbors=1,p=p,leaf_size=leaf_size) #KNN Model
        cv = ShuffleSplit(n_splits=5, test_size=0.33,random_state=2) #Creating fold splits for parameter testing.
        score=cross_val_score(nn_model,X,y1.top,cv=cv) 
        
        print('When leaf size= {} & p={}, score is {:.3f}'.format(leaf_size,p,np.mean(score)))

When leaf size= 15 & p=1, score is 0.342
When leaf size= 30 & p=1, score is 0.346
When leaf size= 45 & p=1, score is 0.346
When leaf size= 60 & p=1, score is 0.349
When leaf size= 80 & p=1, score is 0.349
When leaf size= 15 & p=2, score is 0.292
When leaf size= 30 & p=2, score is 0.295
When leaf size= 45 & p=2, score is 0.295
When leaf size= 60 & p=2, score is 0.302
When leaf size= 80 & p=2, score is 0.302


In [18]:
X_train.head(1)

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance
66,2,800,0,1,0,0


In [19]:
X_train.iloc[0]

OS                         2
Budget                   800
Size                       0
Camera Importance          1
Usage Hours (Battery)      0
Speed Importance           0
Name: 66, dtype: int64

In [32]:
def predict_top_3_ranking(query):
    '''
    This function takes a query expressed as numerical values. 
    See the dictionary in the code above for how to encode the query.
    
    Input: list of query numbers (list, int)
    Output: top phone recommendation (str)
    
    '''
    
    query=[query]
    
    #List of existing queries
    existing_queries=[]

    for i in range(X_train.shape[0]):
        query_sample=[]
        query_sample.append(X_train.OS.iloc[i])
        query_sample.append(X_train.Budget.iloc[i])
        query_sample.append(X_train.Size.iloc[i])
        query_sample.append(X_train['Camera Importance'].iloc[i])
        query_sample.append(X_train['Usage Hours (Battery)'].iloc[i])
        query_sample.append(X_train['Speed Importance'].iloc[i])
        existing_queries.append(query_sample)

    
    #KNN model
    nn_model=KNeighborsClassifier(n_neighbors=1,p=1,leaf_size=60)
    
    #KNN Top Rec
    nn_model.fit(X_train, y1_train.top)
    knn1_recommendation=nn_model.predict(query)

    #KNN 2nd Rec
    nn_model.fit(X_train, y2_train.second)
    knn2_recommendation=nn_model.predict(query)
    
    #KNN 3rd Rec
    nn_model.fit(X_train, y3_train.third)
    knn3_recommendation=nn_model.predict(query)
    
    #print('KNN Recs: {},{},{}'.format(knn1_recommendation, knn2_recommendation, knn3_recommendation))
    
    #If already rated by expert, return expert recommendations
    if query[0] in existing_queries:    
        return knn1_recommendation[0]
    
    #If not already rated by expert, cross-check recommendation with manual recommendation.
    
    #Manual Model
    #Query for phone OS
    if query[0][0]<2:
        narrow_df=phones[phones.OS=='Android']
    elif  query[0][0]>2:
        narrow_df=phones[phones.OS=='iOS']
    else: 
        narrow_df=phones
        
    #print('------OS filtered')
    #print(narrow_df)
    
    #Query for Price
    if query[0][1]==400:
        narrow_df=narrow_df[narrow_df.Price<=600]
    
    elif query[0][1]==600:
        narrow_df=narrow_df[narrow_df.Price<=800]
        
    elif query[0][1]==800:
        narrow_df=narrow_df
        
    elif query[0][1]==700:
        narrow_df=narrow_df
    
    #print('------Price Filtered')
    #print(narrow_df)
    
    #Query for Area
    
    #Phones filtered by size, if no preference, no filter applied.
    
    if query[0][2]==1:
        narrow_df=narrow_df[narrow_df['Area']<=17] #Small phones only
    elif query[0][2]==2:
        narrow_df=narrow_df[(narrow_df.Area>17) & (narrow_df.Area<=19.05)] #Medium Phones Only
    elif query[0][2]==3:
        narrow_df=narrow_df[narrow_df['Area']>19.05] #Large Phones only
    
    #print('------Area Filtered')
    #print(narrow_df)
    
    #Query for Camera
    
    #Phones filtered by camera rating according to percentiles. 
    # For instance, 'A lot' = 60+ percentile, 'A little'=20+ percentile
    
    camera_percentile={4:80,3:60,2:40,1:20,0:0}
    percentile=camera_percentile.get(query[0][3])
    cutoff=np.percentile(phones.Camera,percentile)
    
    narrow_df=narrow_df[narrow_df['Camera']>=cutoff]
    
    #print('------Camera Filtered')
    #print(narrow_df)
    
    #Query for Battery
    battery_percentile={3:75,2:50,1:25,0:0.0}
    percentile=battery_percentile.get(query[0][4])
    cutoff=np.percentile(phones.Battery,percentile)
    
    narrow_df=narrow_df[narrow_df['Battery']>=cutoff]
    
    #print('------Battery Filtered')
    #print(narrow_df)
    
    #Query for Speed Importance
    speed_percentile={3:75,2:50,1:25,0:0.0}
    percentile=speed_percentile.get(query[0][5])
    cutoff=np.percentile(phones.Speed,percentile)
    
    narrow_df=narrow_df[narrow_df['Speed']>=cutoff]

    #print('------Speed Filtered')
    #print(narrow_df)
    
    #Checking if KNN Top1 Rec is in narrow_df. If it is, all conditions user requirements have been met.
    # Recommend KNN product if true. Otherwise, try manual model.
    check=knn1_recommendation[0] in narrow_df.Phone
    
    if check:
        top_recommendation=knn1_recommendation
        
    else: #Trying manual model recommendation.
        if len(narrow_df.Phone)==1: 
            #If KNN recommendation does not meet requirements and manual rec has one that does, recommend that instead.
            top_recommendation=narrow_df.Phone.iloc[0]

        elif len(narrow_df.Phone)>1: 
            #If manual model has multiple recommendations, pick one with highest rating
            narrow_df.sort_values(by=['Rating'],ascending=False,inplace=True)
            top_recommendation=narrow_df.iloc[0].Phone

        else: 
            # If manual model has no recommendations, use KNN recommendation.
            top_recommendation=knn1_recommendation[0]
    
    return top_recommendation

In [21]:
#Testing recommender output

query='iPhone, $800+, Small, A lot, A little, A lot'
query=[0,400,3,1,1,3]
predict_top_3_ranking(query)

'Motorola Moto G100'

### Evaluation for top recommendation

In [33]:
#Predict recommendations for test set.
predicted_recommendations=[]
for i in range(X_test.shape[0]):
    rec=predict_top_3_ranking(list(X_test.iloc[i]))
    predicted_recommendations.append(rec)

In [23]:
#Check some of the recommendations predicted.
print(predicted_recommendations[:5])

['Google Pixel 5a', 'Samsung Galaxy A42', 'Samsung Galaxy S22 Ultra', 'Apple iPhone 13 Pro Max', 'Google Pixel 6']


In [31]:
#Calculate classification accuracy (for KNN only)
count=0
correct_count=0
for i in range(y1_test.shape[0]):
    count+=1
    if y1_test.iloc[i][0]==predicted_recommendations[i]:
        correct_count+=1
        
percent=correct_count/count  
print('The top recommendation accuracy (corrected guessed) is {:.2f}.'.format(percent))

The top recommendation accuracy (corrected guessed) is 0.37.


In [25]:
#Calculate classification accuracy (for Hybrid Model)
count=0
correct_count=0
for i in range(y1_test.shape[0]):
    count+=1
    if y1_test.iloc[i][0]==predicted_recommendations[i]:
        correct_count+=1
        
percent=correct_count/count  
print('The top recommendation accuracy (corrected guessed) is {:.2f}.'.format(percent))

The top recommendation accuracy (corrected guessed) is 0.46.


In [26]:
#Evaluate count of how many predicted top recommendations match top 3 recommendations (ground truth) (Hybrid)

count=0
correct_count=0
for i in range(y1_test.shape[0]):
    count+=1
    if predicted_recommendations[i] in [y1_test.iloc[i][0],y2_test.iloc[i][0],y3_test.iloc[i][0]]:
        correct_count+=1
percent=correct_count/count
print('The predicted top recommendation exists in {:.2f} of all top 3 recommendation lists.'.format(percent))

The predicted top recommendation exists in 0.59 of all top 3 recommendation lists.
