### Imports

In [None]:
import gzip
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn.neighbors import NearestNeighbors
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pickle
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from collections import OrderedDict
import numpy as np
from scipy.sparse import csr_matrix

# Item Profiling

In [None]:
# Open the gzip-compressed products' metadata file.
# The handle is kept open at module level and is iterated line by line by explore_data().
item_data = gzip.open("/home/data/amazon-reviews/metadata.json.gz","r")

def explore_data(lines=None):
    """Yield every product-metadata record as a strict JSON string.

    Each input line is expected to hold one Python-literal dict (single-quoted
    keys/strings), which is why ``json.loads`` cannot be applied to it directly.

    Args:
        lines: Optional iterable of ``str`` or ``bytes`` lines. When omitted,
            the module-level ``item_data`` file handle is read.

    Yields:
        str: the record re-serialised with ``json.dumps``.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    # Local import so the block does not depend on the file's import cell.
    import ast

    source = item_data if lines is None else lines
    for line in source:
        # A gzip handle opened in binary mode yields bytes; literal_eval needs str.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        # SECURITY: the original used eval(), which executes arbitrary code found
        # in the data file. literal_eval only accepts literals (dict, list, str,
        # numbers, ...), which is all a metadata record should contain.
        yield json.dumps(ast.literal_eval(line))

In [None]:
# Importing reviews in order to preserve only the items with at least 5 reviews.
# NOTE: readlines(hint) stops reading once the total size of the lines read so far
# exceeds `hint`, so this loads a prefix of the file rather than all of it.
# The `with` block guarantees the file handle is closed (it was previously left open).
with open("/home/wsaadallah/amazon-reviews/complete.json", "r") as review_file:
    review_lines_text = review_file.readlines(900000000)

# One JSON document per line
reviews = [json.loads(rlt) for rlt in review_lines_text]

reviews_df = pd.DataFrame(reviews, columns=['asin','overall','reviewText'])

# Number of reviews per product id
count_reviews = reviews_df['asin'].value_counts()

# Preserving only items with at least 5 reviews.
# The qualifying ids are collected once into a set so that each metadata record
# needs a single O(1) membership test instead of two pandas Series lookups.
popular_asins = set(count_reviews.index[count_reviews >= 5])
items = []
for line in explore_data():
    json_item = json.loads(line)
    if json_item["asin"] in popular_asins:
        items.append(json_item)

In [None]:
# Build the items' dataframe, keeping only the four metadata fields used below
items_df = pd.DataFrame(
    data=items,
    columns=["asin", "description", "title", "categories"],
)

#### Data Preprocessing

In [None]:
# Drop every item that has a NaN in any kept field, then renumber the rows
# from 0 so that the index is contiguous again
filtred_items_df = items_df.dropna().reset_index(drop=True)

In [None]:
# Release the unfiltered dataframe, which is no longer needed, in order to free RAM.
# Rebinding the name to None first means the `del` below succeeds even when
# `items_df` was not bound before this cell ran.
items_df = None
del items_df

In [None]:
# Preview the first five rows of the filtered items' dataframe
filtred_items_df.head(n=5)

In [None]:
# The 'categories' field is a list of lists of strings. To keep things simple only
# the first string of the first list is retained; it is stored, wrapped in a
# one-element list, in a new 'category1' column.
filtred_items_df = filtred_items_df.assign(
    category1=filtred_items_df['categories'].map(lambda paths: [paths[0][0]])
)

In [None]:
# Applying one-hot encoding to the 'category1' field: each one-element list is
# joined into a plain string and expanded into indicator columns, which then
# replace the 'category1' column.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (the former `drop('category1', 1)`) is no longer accepted by pandas 2.x.
filtred_items_df = filtred_items_df.drop(columns='category1').join(
    filtred_items_df['category1'].str.join('|').str.get_dummies()
)

In [None]:
# Index the dataframe by the product id ('asin'): since it is meant to be a unique
# value, it makes looking up a given item easier.
filtred_items_df = filtred_items_df.set_index(keys='asin')

#### Information Retrieval : TF-IDF vectorisation

In [None]:
# Building the description corpus:
# a corpus is a dataset of textual data; here it is the 'description' field
descriptions_corpus = np.array(filtred_items_df['description'])

# Pattern matching any single punctuation character
p = re.compile("[" + re.escape(string.punctuation) + "]")

# Text clean-up needed before TF-IDF, applied to each description in this order:
#   1. lowercase
#   2. remove digit runs
#   3. replace punctuation with a space
#   4. strip leading/trailing white space
descriptions_corpus = [
    p.sub(" ", re.sub(r'\d+', '', text.lower())).strip()
    for text in descriptions_corpus
]

# Tokenization
tokenized_description_corpus = [
    nltk.word_tokenize(text) for text in descriptions_corpus
]

# Stemming
stemmer = PorterStemmer()
stemmed_description_corpus = [
    [stemmer.stem(token) for token in tokens]
    for tokens in tokenized_description_corpus
]

# 'TfidfVectorizer' is given a tokenizer callable. Since the corpus is already
# tokenized, this tokenizer simply hands each document back untouched.
def identity_tokenizer(text):
    """Return ``text`` unchanged (no-op tokenizer for pre-tokenized documents)."""
    return text

# Build the TF-IDF matrix from the pre-tokenized, stemmed corpus.
# NOTE(review): the built-in English stop-word list is presumably matched against
# the stems produced above, so stop words altered by stemming may survive - verify.
tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer,  # documents are already lists of tokens
    stop_words='english',
    lowercase=False,               # lowercasing was done during preprocessing
    max_features=100000,
    min_df=5,
)
tfidf_matrix = tfidf.fit_transform(stemmed_description_corpus)

In [None]:
# Creating a dataframe from this matrix: each row holds the TF-IDF feature vector
# of one item's description, with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one the installed version provides.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# NOTE: toarray() densifies the sparse matrix, so memory grows with rows x vocabulary size.
tf_idf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [None]:
# In order to keep track of each description, the rows of the tf-idf dataframe are
# labelled with the items' 'asin' values, under the index name 'myAsin'
tf_idf_df.index = filtred_items_df.index.rename('myAsin')

#### Concatenating the two matrices (one-hot encoding and TF-IDF)

In [None]:
# Start by removing the columns that we chose not to use as features
items_df = filtred_items_df.drop(
    ['description', 'title', 'categories'],
    axis='columns',
)

In [None]:
# Put the remaining item columns and the TF-IDF columns side by side, aligned on the index
items_df = pd.concat(objs=[items_df, tf_idf_df], axis='columns')

# User Profiling 

In [None]:
# For memory's sake only the beginning of the reviews file is loaded.
# NOTE: the argument of readlines() is a size hint, not a number of rows: reading
# stops once the total size of the lines read so far exceeds 90000000.
# The `with` block guarantees the file handle is closed (it was previously left open).
with open("/home/wsaadallah/amazon-reviews/complete.json", "r") as review_file:
    review_lines_text = review_file.readlines(90000000)

# Getting the reviews, keeping only those written about an item present in items_df
reviews = []
for rlt in review_lines_text:
    json_rlt = json.loads(rlt)
    if json_rlt['asin'] in items_df.index:
        reviews.append(json_rlt)

In [None]:
# Build the (item, reviewer, rating) table.
# The compact uint8 dtype is applied to the 'overall' rating column only: passing
# dtype=np.uint8 to the DataFrame constructor targets the string columns
# ('asin', 'reviewerID') as well, which cannot be cast to an integer.
reviews_df = pd.DataFrame(reviews, columns=['asin', 'reviewerID', 'overall'])
reviews_df['overall'] = reviews_df['overall'].astype(np.uint8)

# Removing duplicates: keep the first review for each (item, reviewer) pair,
# which the later pivot needs in order to have unique cells
reviews_df = reviews_df.drop_duplicates(['asin','reviewerID'], keep='first')

In [None]:
# Drop the intermediate review objects that are not needed any more
del review_file
del review_lines_text
del reviews

In [None]:
# Build the product x reviewer matrix (as a dataframe): one row per 'asin', one
# column per 'reviewerID', with the 'overall' rating as the cell value
reviews_df = reviews_df.pivot(
    index='asin',
    columns='reviewerID',
    values='overall',
)

In [None]:
# Replace the NaNs of the pivoted dataframe (item/reviewer pairs without a review) with 0
reviews_df = reviews_df.fillna(value=0)

In [None]:
# Restrict items_df to the items that have a row in the item x reviewer dataframe
indexes_to_drop = [
    label for label in items_df.index if label not in reviews_df.index
]
items_df = items_df.drop(indexes_to_drop)

In [None]:
# Reorder the rows of reviews_df so that they follow the order of items_df's index
reviews_df = reviews_df.reindex(index=items_df.index)

In [None]:
# Build the reviewers' profiles: (reviewer x item ratings) times (item x features)
# yields, for every reviewer, the rating-weighted sum of the item feature vectors
reviewers_profiles = reviews_df.T @ items_df

In [None]:
# Normalizing reviewers' profiles

# Number of reviews written by each reviewer: non-zero cells per reviewer column.
# Wrapping the Series in a DataFrame gives it the default column label 0.
reviewers_ratings_count_df = pd.DataFrame(reviews_df.astype(bool).sum(axis=0))
review_counts = reviewers_ratings_count_df[0]

# Profiles with the review count appended as an extra column (labelled 0)
reviewers_ratings_count_df = pd.concat([reviewers_profiles, reviewers_ratings_count_df], axis=1)

# Divide every reviewer's profile by that reviewer's review count.
# A single vectorised division aligned on the reviewer index replaces the former
# row-by-row `apply(lambda row: row/row[0])` followed by dropping column 0: the
# values are the same, without a Python-level call per reviewer and without
# relying on an integer label lookup inside a row of string labels.
reviewer_normalized_profile = reviewers_profiles.div(review_counts, axis=0)

# Recommendation 

In [None]:
# We use k-nearest neighbours in order to determine the products to recommend:

def neighbors(data, n_neighbors=10, metric='cosine'):
    """Fit and return a ``NearestNeighbors`` model on ``data``.

    Args:
        data: the feature matrix to index (here, one row per item).
        n_neighbors: how many neighbours ``kneighbors`` returns by default.
        metric: the distance metric handed to ``NearestNeighbors``.

    Returns:
        The fitted ``NearestNeighbors`` instance.
    """
    return NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(data)

# Model fitted on the item feature vectors; queried with reviewer profiles
nbrs = neighbors(items_df)

def predit_neigbors_for_user(user_index):
    """Return the ids of the items closest to a reviewer's normalized profile.

    Args:
        user_index: label of the reviewer's row in ``reviewer_normalized_profile``.

    Returns:
        list: index labels of ``items_df`` for the neighbours returned by
        ``nbrs.kneighbors``, in the order the model returns them.

    Raises:
        KeyError: if ``user_index`` is not in ``reviewer_normalized_profile``.
    """
    # kneighbors expects a 2-D array: a single query row holding the profile
    profile_vector = reviewer_normalized_profile.loc[user_index].values.reshape(1, -1)
    _, indices = nbrs.kneighbors(profile_vector)
    # There is exactly one query row, so indices[0] holds its neighbours' positions.
    # The labels are read straight from the index instead of first materialising
    # the full item rows with iloc, which copied every feature column for nothing.
    return items_df.index[indices[0]].tolist()

In [None]:
# Predict for a given user id ("user" here means the reviewer)
user_id = "A00000262KYZUE4J55XGL"
user_predictions = predit_neigbors_for_user(user_index=user_id)