In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [45]:
df = pd.read_json('final_result_after_logistic.json')

In [46]:
df_knn = df[['asin','overall','predicted_rating','processedReview', 'summary']]

In [47]:
df_product_review_data_from_sentiment = df_knn.groupby("asin", as_index=False).mean()

In [48]:
df_product_review_data_from_sentiment.head()

Unnamed: 0,asin,overall,predicted_rating
0,5019281,4.458716,4.886859
1,5119367,4.793478,4.952117
2,307514161,4.732283,4.917819
3,310263662,3.962461,4.639496
4,310274281,4.869159,4.926245


In [49]:
df_merge_review_series = df_knn.groupby("asin")["summary"].apply(list).reset_index()
df_merge_review_data = pd.DataFrame(df_merge_review_series)

In [50]:
df_merge_review_data.head()

Unnamed: 0,asin,summary
0,5019281,"[good version of a classic, Good but not as mo..."
1,5119367,"[Wonderful!!!, Joseph, As real as it gets!, a ..."
2,307514161,[Who needs it to be christmas to watch this fl...
3,310263662,[watching anyone being tortured would be sad -...
4,310274281,"[great life lessons, Awesome, Inspiring, GREAT..."


In [51]:
final_knn_data = pd.merge(df_product_review_data_from_sentiment, df_merge_review_series, on="asin", how='inner')

In [52]:
final_knn_data.head()

Unnamed: 0,asin,overall,predicted_rating,summary
0,5019281,4.458716,4.886859,"[good version of a classic, Good but not as mo..."
1,5119367,4.793478,4.952117,"[Wonderful!!!, Joseph, As real as it gets!, a ..."
2,307514161,4.732283,4.917819,[Who needs it to be christmas to watch this fl...
3,310263662,3.962461,4.639496,[watching anyone being tortured would be sad -...
4,310274281,4.869159,4.926245,"[great life lessons, Awesome, Inspiring, GREAT..."


In [53]:
regEx = re.compile('[^a-z]+')
def clean_data(text_array):
    text = " ".join(text_array)
    text = text.lower()
    text = regEx.sub(' ', text).strip()
    return text

In [54]:
final_knn_data["clean_summary_data"] = final_knn_data["summary"].apply(clean_data)



In [55]:
final_knn_data.head()

Unnamed: 0,asin,overall,predicted_rating,summary,clean_summary_data
0,5019281,4.458716,4.886859,"[good version of a classic, Good but not as mo...",good version of a classic good but not as movi...
1,5119367,4.793478,4.952117,"[Wonderful!!!, Joseph, As real as it gets!, a ...",wonderful joseph as real as it gets a man of c...
2,307514161,4.732283,4.917819,[Who needs it to be christmas to watch this fl...,who needs it to be christmas to watch this fli...
3,310263662,3.962461,4.639496,[watching anyone being tortured would be sad -...,watching anyone being tortured would be sad je...
4,310274281,4.869159,4.926245,"[great life lessons, Awesome, Inspiring, GREAT...",great life lessons awesome inspiring great pic...


In [56]:
final_knn_data_clean = final_knn_data[['asin','overall','predicted_rating','clean_summary_data']]

In [57]:
final_knn_data_clean.head()

Unnamed: 0,asin,overall,predicted_rating,clean_summary_data
0,5019281,4.458716,4.886859,good version of a classic good but not as movi...
1,5119367,4.793478,4.952117,wonderful joseph as real as it gets a man of c...
2,307514161,4.732283,4.917819,who needs it to be christmas to watch this fli...
3,310263662,3.962461,4.639496,watching anyone being tortured would be sad je...
4,310274281,4.869159,4.926245,great life lessons awesome inspiring great pic...


In [58]:
final_knn_data_clean.to_json('final_knn_data_clean.json')


In [59]:
countVector = CountVectorizer(max_features = 350, stop_words='english') 
transformedReviews = countVector.fit_transform(final_knn_data_clean['clean_summary_data']) 

In [60]:
df_knn_vectorized_data = pd.DataFrame(transformedReviews.A, columns=countVector.get_feature_names())
df_knn_vectorized_data = df_knn_vectorized_data.astype(int)

In [61]:
countVector.get_feature_names()

[u'absolutely',
 u'acting',
 u'action',
 u'actors',
 u'actually',
 u'adaptation',
 u'adventure',
 u'age',
 u'amazing',
 u'amazon',
 u'american',
 u'animated',
 u'animation',
 u'art',
 u'average',
 u'away',
 u'awesome',
 u'awful',
 u'bad',
 u'batman',
 u'beautiful',
 u'beginning',
 u'believe',
 u'best',
 u'better',
 u'big',
 u'bit',
 u'black',
 u'blood',
 u'blu',
 u'blue',
 u'bond',
 u'book',
 u'boring',
 u'brilliant',
 u'british',
 u'buy',
 u'cast',
 u'character',
 u'characters',
 u'charming',
 u'children',
 u'christmas',
 u'classic',
 u'collection',
 u'come',
 u'comedy',
 u'comes',
 u'comic',
 u'coming',
 u'complete',
 u'cool',
 u'creepy',
 u'cut',
 u'cute',
 u'dark',
 u'day',
 u'dead',
 u'death',
 u'decent',
 u'definitely',
 u'delightful',
 u'did',
 u'didn',
 u'die',
 u'different',
 u'director',
 u'disappointed',
 u'disappointing',
 u'disappointment',
 u'disc',
 u'disney',
 u'disturbing',
 u'does',
 u'doesn',
 u'don',
 u'drama',
 u'dvd',
 u'edition',
 u'effects',
 u'end',
 u'ending',

In [62]:
df_knn_vectorized_data.head()

Unnamed: 0,absolutely,acting,action,actors,actually,adaptation,adventure,age,amazing,amazon,...,worst,worth,worthy,wow,wrong,year,years,yes,young,zombie
0,0,0,0,0,0,4,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,2,1,0,...,0,1,0,0,0,2,0,0,0,0
3,2,1,1,0,1,0,0,0,10,6,...,4,8,1,10,1,5,2,2,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [63]:
final_knn_data_clean_without_summary = final_knn_data_clean[['asin','overall','predicted_rating']]


In [64]:
final_knn_data_clean_without_summary.head()

Unnamed: 0,asin,overall,predicted_rating
0,5019281,4.458716,4.886859
1,5119367,4.793478,4.952117
2,307514161,4.732283,4.917819
3,310263662,3.962461,4.639496
4,310274281,4.869159,4.926245


In [65]:
data_with_asin = final_knn_data_clean_without_summary.join(df_knn_vectorized_data, how='outer')

In [66]:
data_with_asin.reset_index()
data = data_with_asin.drop('asin', 1)
data_with_asin.tail(180)

Unnamed: 0,asin,overall,predicted_rating,absolutely,acting,action,actors,actually,adaptation,adventure,...,worst,worth,worthy,wow,wrong,year,years,yes,young,zombie
4379,B00ARA4SLU,3.951128,4.665250,0,3,0,1,0,0,0,...,1,1,0,1,0,0,0,0,0,0
4380,B00ARX2VZW,4.369369,4.885772,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
4381,B00AW0KHTM,4.390426,4.584431,1,1,7,1,1,1,0,...,2,3,1,6,1,3,0,0,1,0
4382,B00AY2DL78,4.625000,4.905905,1,0,0,0,0,0,0,...,0,1,0,2,0,0,0,0,0,0
4383,B00AZMFINM,3.069930,3.406936,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,2,0,0
4384,B00AZMFJ90,3.570175,4.147407,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4385,B00AZMFO6I,4.333333,4.656220,3,10,0,7,0,1,0,...,0,7,1,0,0,0,3,0,3,0
4386,B00AZMFONG,3.647564,4.017977,0,6,3,23,0,0,0,...,0,7,0,2,1,0,0,0,0,0
4387,B00AZNEW5G,3.929699,4.305310,1,6,3,1,2,0,0,...,2,10,0,6,0,5,0,1,0,0
4388,B00B2YH7BS,4.604396,4.881869,2,0,0,0,0,0,0,...,0,3,1,1,0,0,0,4,1,0


In [24]:
data.head()

Unnamed: 0,overall,predicted_rating,absolutely,acting,action,actors,actually,adaptation,adventure,age,...,worst,worth,worthy,wow,wrong,year,years,yes,young,zombie
0,4.458716,4.886859,0,0,0,0,0,4,0,0,...,0,0,1,0,0,0,0,0,0,0
1,4.793478,4.952117,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,4.732283,4.917819,1,0,0,0,0,0,0,2,...,0,1,0,0,0,2,0,0,0,0
3,3.962461,4.639496,2,1,1,0,1,0,0,0,...,4,8,1,10,1,5,2,2,0,0
4,4.869159,4.926245,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [67]:
data.to_json("data.json")

In [68]:
np_data = np.array(data)


In [69]:
total_data = len(data)
size_of_cut = int(np.floor(0.95 * total_data))
training_data = np_data[:size_of_cut]
test_data = np_data[size_of_cut:]


In [70]:
len(training_data)
data_with_asin_test = data_with_asin.tail(total_data - size_of_cut)
data_with_asin_test_less_cols = data_with_asin_test[['asin','overall','predicted_rating']]

0

In [71]:
len(test_data)

228

In [72]:
neighbor = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(training_data)

In [73]:
distances, indices = neighbor.kneighbors(training_data)

In [74]:
def run_knn(row):
    val = neighbor.kneighbors([test_data[row['serial_no']]])[1]
    arr = val[0]
    temp_arr =[]
    for element in arr:
        temp_arr.append(data_with_asin["asin"][element])
    return ",".join(map(str, temp_arr))

In [75]:
data_with_asin_test_less_cols.reset_index()
data_with_asin_test_less_cols.insert(0, 'serial_no', range(0, 0 + len(data_with_asin_test_less_cols)))

In [76]:
data_with_asin_test_less_cols['recommended']= data_with_asin_test_less_cols.apply(run_knn,axis=1)
data_with_asin_test_less_cols.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,serial_no,asin,overall,predicted_rating,recommended
4331,0,B009B8YZIK,3.75,3.956708,"B009AMAMT8,B0034G4P58,B0019EXZXU"
4332,1,B009D4SFEC,4.802691,4.952209,"B005A1GS00,B00527QY88,B000N6UF0G"
4333,2,B009DS6YGO,4.700599,4.935863,"B00007KLE8,0792110803,B002KMIW6K"
4334,3,B009EU6F1U,3.586207,3.924027,"B004XCM89M,B0012KJ0DK,B0049TC88U"
4335,4,B009H3LN8Y,3.835681,4.172553,"B00062IZ0Y,B008220BW0,B0077ATSSQ"


In [77]:
data_with_asin_test_less_cols.to_csv("data_with_asin_test_less_cols.csv")

In [79]:
data_with_asin_test_less_cols.head()

Unnamed: 0,serial_no,asin,overall,predicted_rating,recommended
4331,0,B009B8YZIK,3.75,3.956708,"B009AMAMT8,B0034G4P58,B0019EXZXU"
4332,1,B009D4SFEC,4.802691,4.952209,"B005A1GS00,B00527QY88,B000N6UF0G"
4333,2,B009DS6YGO,4.700599,4.935863,"B00007KLE8,0792110803,B002KMIW6K"
4334,3,B009EU6F1U,3.586207,3.924027,"B004XCM89M,B0012KJ0DK,B0049TC88U"
4335,4,B009H3LN8Y,3.835681,4.172553,"B00062IZ0Y,B008220BW0,B0077ATSSQ"


In [80]:
neighbor_with_cosine_similarity = NearestNeighbors(n_neighbors=3, algorithm='brute',metric='cosine').fit(training_data)

In [81]:
def run_knn_cosine(row):
    val = neighbor_with_cosine_similarity.kneighbors([test_data[row['serial_no']]])[1]
    arr = val[0]
    temp_arr =[]
    for element in arr:
        temp_arr.append(data_with_asin["asin"][element])
    return ",".join(map(str, temp_arr))

In [84]:
data_with_asin_test_less_cols.reset_index()
data_with_asin_test_less_cols.insert(0, 'serial_no', range(0, 0 + len(data_with_asin_test_less_cols)))


ValueError: cannot insert serial_no, already exists

In [85]:
data_with_asin_test_less_cols['recommended_cosine']= data_with_asin_test_less_cols.apply(run_knn_cosine,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [86]:
data_with_asin_test_less_cols

Unnamed: 0,serial_no,asin,overall,predicted_rating,recommended,recommended_cosine
4331,0,B009B8YZIK,3.750000,3.956708,"B009AMAMT8,B0034G4P58,B0019EXZXU","B005LAIGLE,B005LAIIQC,B0012RLX7Y"
4332,1,B009D4SFEC,4.802691,4.952209,"B005A1GS00,B00527QY88,B000N6UF0G","B00331RHPO,B009AF5OY8,1608838137"
4333,2,B009DS6YGO,4.700599,4.935863,"B00007KLE8,0792110803,B002KMIW6K","B001CDLATY,B002KMIW6K,0792110803"
4334,3,B009EU6F1U,3.586207,3.924027,"B004XCM89M,B0012KJ0DK,B0049TC88U","B008JFUPPI,B005LAIGXW,B004A8ZWWO"
4335,4,B009H3LN8Y,3.835681,4.172553,"B00062IZ0Y,B008220BW0,B0077ATSSQ","B00020BVYW,B00062IZ0Y,B00005JHAA"
4336,5,B009HIK3V2,3.711538,4.059056,"B00003CXQF,B002UADLXG,B0062P332Y","B00682LS4G,B002NXSRVG,B005LAII44"
4337,6,B009JBZH54,4.653571,4.809105,"B000W07EKW,B00005QCYC,B0002Y69NG","B000W07EKW,B00005QCYC,B00006AL1D"
4338,7,B009L79YFU,3.548387,3.958463,"B0010YVCB6,B005OK721G,B00005AFT5","B005LAIGLE,B0010YVCB6,B00682LS4G"
4339,8,B009LDCWWG,4.683544,4.840575,"B003L77GH4,B004YM6JLO,B002JVWR9U","B003L77GH4,B004YM6JLO,B002JVWR9U"
4340,9,B009LDCXNY,4.678392,4.884059,"B003L77FYS,B004YM6JI2,B001BMGXTI","B004YM6JI2,B0058YPGSY,B0053O8A46"


In [87]:
data_with_asin_test_less_cols.to_csv("data_with_asin_test_less_cols.csv")

In [89]:
neighbor_with_kd_tree = NearestNeighbors(n_neighbors=3, algorithm='kd_tree').fit(training_data)

In [90]:
def run_knn_kd(row):
    val = neighbor_with_kd_tree.kneighbors([test_data[row['serial_no']]])[1]
    arr = val[0]
    temp_arr =[]
    for element in arr:
        temp_arr.append(data_with_asin["asin"][element])
    return ",".join(map(str, temp_arr))

In [91]:
data_with_asin_test_less_cols['recommended_kd_tree']= data_with_asin_test_less_cols.apply(run_knn_kd,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [93]:
data_with_asin_test_less_cols.to_csv("data_with_asin_test_less_cols.csv")