In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df=pd.read_csv('../dataset/train.csv')

In [3]:
# Scale rating feature to make its range between 1-5
df['rating'] = df['rating']/2

In [4]:
# Add new feature which has binary rating i.e either positive or negative 
# Categorise as 1 if rating is either 5 or 4 and rented item is not small or large in size
rating_to_binary = (df["rating"] == 5.0) | (df["rating"] == 4.0) | (df["rating"] == 3.0) & (df["fit"] == "fit")
rating_to_binary 
df["class_rating"] = np.where(rating_to_binary == True, 1, 0)
df["class_rating"]

0         1
1         1
2         1
3         1
4         1
         ..
154030    1
154031    1
154032    1
154033    1
154034    1
Name: class_rating, Length: 154035, dtype: int32

In [5]:
# Compressing records by considering the items which has more than 4 ratings and users who purchased more than 5 items
# Or give rating to more than 5 items


# including items with more than 4 ratings
itemRat_count = pd.DataFrame(df.groupby("item_id")["rating"].count())
itemRat_count = itemRat_count.loc[(itemRat_count["rating"] > 4)]
itemRat_count.reset_index(inplace=True)
#print('itemRat_count')
clothingRed = df[df["item_id"].isin(itemRat_count["item_id"])]
#print(clothingRed)

In [6]:
# including users with more than 4 purchases
userRat_count = pd.DataFrame(clothingRed.groupby("user_id")["rating"].count())
userRat_count = userRat_count.loc[(userRat_count.rating > 5)]
userRat_count.reset_index(inplace=True)
userRat_count.drop("rating", axis=1, inplace=True)
clothingRed = clothingRed[clothingRed["user_id"].isin(userRat_count["user_id"])]
clothingRed.reset_index(inplace=True, drop=True)

# Content based  using TF-IDF

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove useless words 

stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Checkout\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Add new column which has  the text data such as category, review summary, review text
# Using this since in the dataset there is no meta data given for items
# Hence creating meta data extracting most of the text data from features 

df["item_review_text"] = (df["category"] + " " + df["review_summary"] + " " + df["review_text"]).str.lower()
df["item_review_text"] = df["item_review_text"].astype(str)
df["item_review_text"]= df["item_review_text"].apply(lambda x: " ".join([word for word in x.split() if word not in (stop_words)]))


In [9]:
df[["item_id", "item_review_text"]].tail()

Unnamed: 0,item_id,item_review_text
154030,2358935,top fun top. cute top. really liked lace ruffl...
154031,625911,"dress classic design, edge. dress great! style..."
154032,1515649,dress fit like glove complaints
154033,127081,gown cancelled wedding. gorgeous dress fit dre...
154034,126335,dress sparkly! perfect holiday nye party. wore...


In [10]:
agg_item_text = pd.DataFrame(df.groupby("item_id")["item_review_text"].sum())
agg_item_text.head()

Unnamed: 0_level_0,item_review_text
item_id,Unnamed: 1_level_1
123373,"gown wore wedding nyc! dress stretchy, meant c..."
123793,gown gorgeous!! many compliments actually wore...
124204,"dress many sparkles so, mesmerizingly sparkly ..."
124553,"dress meh. shows imperfections; bit boxy. 5'6""..."
125424,dress nye party runs large. great flattering d...


In [11]:
tf_idf = TfidfVectorizer()
tf_idf_matrix = tf_idf.fit_transform(agg_item_text["item_review_text"])
item_tfidf_similarity = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
item_tfidf_similarity_df = pd.DataFrame(item_tfidf_similarity, columns=agg_item_text.index, index=agg_item_text.index)

# Create item-item similarity matrix usig td-idf
item_tfidf_similarity_df.head()

item_id,123373,123793,124204,124553,125424,125465,125564,126335,127081,127495,...,2960969,2961855,2962646,2963344,2963601,2963850,2964470,2965009,2965924,2966087
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
123373,1.0,0.914838,0.644867,0.680008,0.674219,0.913108,0.932101,0.681126,0.872518,0.685355,...,0.2019,0.039075,0.122114,0.119077,0.041857,0.135678,0.146464,0.053211,0.106687,0.179288
123793,0.914838,1.0,0.711434,0.692911,0.683379,0.931258,0.931197,0.750467,0.88585,0.674954,...,0.184861,0.03847,0.119408,0.120977,0.033069,0.127584,0.13296,0.048308,0.097633,0.184856
124204,0.644867,0.711434,1.0,0.841316,0.834549,0.649814,0.660238,0.924594,0.633101,0.860119,...,0.197987,0.035061,0.120656,0.148743,0.040208,0.131365,0.155288,0.051148,0.092773,0.132675
124553,0.680008,0.692911,0.841316,1.0,0.915943,0.710702,0.692059,0.876482,0.665468,0.876139,...,0.239085,0.044465,0.144731,0.156248,0.059697,0.153152,0.182614,0.05426,0.117275,0.148433
125424,0.674219,0.683379,0.834549,0.915943,1.0,0.67346,0.691833,0.869635,0.658052,0.86807,...,0.240335,0.049629,0.157829,0.152972,0.056813,0.148583,0.174565,0.050577,0.121423,0.140394


In [12]:
import evaluation as evalResult

def tfidf_contentRecSys(user_input):
    id = user_input['user_id']
    # Get record of user from train data which has same user id as passed in the argument
    top_train_user_item = clothingRed[clothingRed.user_id == id][:1]
    if len(top_train_user_item) == 0:
        return pd.DataFrame(columns = ['item_id'])
    # To get item id corresponding to above record
    top_train_user_item = int(top_train_user_item.item_id)
    # print(top_train_user_item)
    # To compute all the similar items corresponding to above record's item
    item_similarities = item_tfidf_similarity_df[[top_train_user_item]].sort_values([top_train_user_item], ascending=False)
    item_similarities = item_similarities.iloc[1:]
    item_similarities.reset_index(inplace=True)
    return item_similarities
    # Take top 50 simialr items
    top_50 = item_similarities.iloc[1:51]
    # print(top_50)
    top_50.reset_index(inplace=True)
    # print(top_50)               
    return evaluation(recommendations = top_50, user = id)


evalResult.evaluate_recommendation(tfidf_contentRecSys, -1)

{'average_of_recommendations': 831.0921006641034,
 'number_of_recommendations': 5722,
 'total_test_cases': 38509,
 '% of recommendations': 14.858864161624554}