# Clustering
I'm trying out clustering algo because I'm stuck on the recommender system.  We dont really want to see what other foods are similarly rated. Im just confused on that right now. I want to try and see if I can identify the target based on the user ratings in this notebook

In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
user_data = pd.read_csv('user_data.csv')
user_data.rename(columns={'Unnamed: 0':'user_id'}, inplace=True)
user_data.set_index('user_id', inplace = True)
user_data.head()

Unnamed: 0_level_0,wheat,corn,dairy,beef,chicken,rice,apple,potatoes,broccoli,carrots,target
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,5.0,4.0,4.0,4.0,2.0,4.0,2.0,4.0,4.0,5.0,none
1,4.0,4.0,2.0,5.0,4.0,3.0,4.0,5.0,4.0,2.0,none
2,5.0,4.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,none
3,1.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,none
4,4.0,5.0,1.0,4.0,2.0,4.0,4.0,4.0,4.0,2.0,none


The target column is the information we want to predict. It shows user's food sensitivity. Most are none. 5% of the users should have a sensitivity to one of the 10 food items.  They were chosen at random.

In [8]:
user_data['target'].unique()

array(['none', 'wheat', 'potatoes', 'rice', 'corn', 'apple', 'chicken',
       'beef', 'carrots', 'dairy', 'broccoli'], dtype=object)

Sensitivity to all food items are represented in the dataset

In [9]:
from sklearn.model_selection import train_test_split 

In [11]:
X = user_data.drop('target',axis=1)   # selecting all the food ratings
y = user_data['target']            #  setting the target as the output

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [37]:
from sklearn.svm import SVC

In [38]:
sensitivity_model = SVC()

In [39]:
sensitivity_model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
predictions = sensitivity_model.predict(X_test)

In [42]:
from sklearn.metrics import classification_report,confusion_matrix

In [43]:
print(classification_report(y_test,predictions))
print('\n')
print(confusion_matrix(y_test,predictions))

             precision    recall  f1-score   support

      apple       0.00      0.00      0.00        14
       beef       0.00      0.00      0.00        18
   broccoli       0.00      0.00      0.00        15
    carrots       0.00      0.00      0.00        14
    chicken       0.00      0.00      0.00        19
       corn       0.00      0.00      0.00        24
      dairy       0.00      0.00      0.00        21
       none       0.95      1.00      0.97      3133
   potatoes       0.00      0.00      0.00        17
       rice       0.00      0.00      0.00        11
      wheat       0.00      0.00      0.00        14

avg / total       0.90      0.95      0.92      3300



[[   0    0    0    0    0    0    0   14    0    0    0]
 [   0    0    0    0    0    0    0   18    0    0    0]
 [   0    0    0    0    0    0    0   15    0    0    0]
 [   0    0    0    0    0    0    0   14    0    0    0]
 [   0    0    0    0    0    0    0   19    0    0    0]
 [   0    0    0

  'precision', 'predicted', average, warn_for)


In [44]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [47]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.951230, total=  12.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.7s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.951656, total=  13.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.7s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.952936, total=  13.3s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.951230, total=   1.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.951656, total=   1.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.952936, total=   1.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.951230, total=   0.8s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.951656, total=   0.9s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.952936, total=   0.9s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[CV] ...... C=1000, gamma=1, kernel=rbf, score=0.950783, total=  21.1s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ...... C=1000, gamma=1, kernel=rbf, score=0.951656, total=  21.3s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV] ...... C=1000, gamma=1, kernel=rbf, score=0.952936, total=  21.5s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] .... C=1000, gamma=0.1, kernel=rbf, score=0.919911, total=   1.5s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] .... C=1000, gamma=0.1, kernel=rbf, score=0.932408, total=   1.5s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] .... C=1000, gamma=0.1, kernel=rbf, score=0.928732, total=   1.5s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] ... C=1000, gamma=0.01, kernel=rbf, score=0.941834, total=   3.3s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [48]:
grid.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

In [49]:
grid.best_estimator_

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
grid_predictions = grid.predict(X_test)

In [51]:
print(classification_report(y_test,grid_predictions))
print('\n')
print(confusion_matrix(y_test,grid_predictions))

             precision    recall  f1-score   support

      apple       0.00      0.00      0.00        14
       beef       0.00      0.00      0.00        18
   broccoli       0.00      0.00      0.00        15
    carrots       0.00      0.00      0.00        14
    chicken       0.00      0.00      0.00        19
       corn       0.00      0.00      0.00        24
      dairy       0.00      0.00      0.00        21
       none       0.95      1.00      0.97      3133
   potatoes       0.00      0.00      0.00        17
       rice       0.00      0.00      0.00        11
      wheat       0.00      0.00      0.00        14

avg / total       0.90      0.95      0.92      3300



[[   0    0    0    0    0    0    0   14    0    0    0]
 [   0    0    0    0    0    0    0   18    0    0    0]
 [   0    0    0    0    0    0    0   15    0    0    0]
 [   0    0    0    0    0    0    0   14    0    0    0]
 [   0    0    0    0    0    0    0   19    0    0    0]
 [   0    0    0

  'precision', 'predicted', average, warn_for)
