In [1]:
import  numpy as np
import  matplotlib.pyplot as plt
import cv2
import pandas as pd
import requests
import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from sklearn.preprocessing import normalize

In [2]:
# Define the pre-trained ResNet model
#resnet = models.resnet50(pretrained=True)

# Remove the last layer
#feature_extractor = torch.nn.Sequential(*(list(resnet.children())[:-1]))


In [3]:
data=pd.read_csv('/kaggle/input/a2-data-csv/A2_Data.csv')
def preprocess_image_urls(image_column):
    image_column = image_column.apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(", "))
    return image_column

# Preprocess the 'Image' column
data['Image'] = preprocess_image_urls(data['Image'])
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Review Text,Image_Vectors
0,3452,[https://images-na.ssl-images-amazon.com/image...,Loving these vintage springs on my vintage str...,"[array([[0.3948523 , 0.5592724 , 0.4393471 , ...."
1,1205,[https://images-na.ssl-images-amazon.com/image...,Works great as a guitar bench mat. Not rugged ...,"[array([[0.38690877, 0.5909363 , 0.47527072, ...."
2,1708,[https://images-na.ssl-images-amazon.com/image...,We use these for everything from our acoustic ...,"[array([[0.3948523 , 0.5592724 , 0.4393471 , ...."
3,2078,[https://images-na.ssl-images-amazon.com/image...,Great price and good quality. It didn't quite...,"[array([[0.38690877, 0.5909363 , 0.47527072, ...."
4,801,[https://images-na.ssl-images-amazon.com/image...,I bought this bass to split time as my primary...,"[array([[0.3948523 , 0.5592724 , 0.4393471 , ...."


In [4]:


def adjust_brightness_and_contrast(image, brightness=0, contrast=0):
        # Alpha controls contrast; Beta controls brightness.
        alpha = 1 + contrast / 127
        beta = brightness
        adjusted_image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
        return adjusted_image

def preprocess(image_url):
    response = requests.get(image_url)
    image = cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
    if image is None:
        return None
    image_height, image_width = image.shape[:2]


    #border removal
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _,thresh = cv2.threshold(gray,1,255,cv2.THRESH_BINARY)
    contours,hierarchy = cv2.findContours(thresh,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    x,y,w,h = cv2.boundingRect(contours[0])
    image = image[y:y+h,x:x+w]
    image = cv2.resize(image, (80, 80))


    # Random flip
    flip_direction = np.random.choice(["horizontal", "vertical"])
    if flip_direction == "horizontal":
        image = cv2.flip(image, 1)  # 1: Flip horizontally
    elif flip_direction == "vertical":
        image = cv2.flip(image, 0)  # 0: Flip vertically

    

    #pixel normalisation
    def normalize_image(image):
        normalized_image = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        return normalized_image

    image = normalize_image(image)
    

    #brightness and contrast adjustment
    def calculate_brightness_and_contrast(image):
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        mean, std_dev = cv2.meanStdDev(gray_image)
        return mean[0][0], std_dev[0][0]

    

    mean_brightness, std_dev_contrast = calculate_brightness_and_contrast(image)

    # Set a threshold for deciding whether to adjust brightness and contrast
    brightness_threshold = 20  
    contrast_threshold = 10 

    # Adjust brightness and contrast only if needed
    if mean_brightness < brightness_threshold or std_dev_contrast < contrast_threshold:
        brightness = 20  
        contrast = 20  
        adjusted_image = adjust_brightness_and_contrast(image, brightness=brightness, contrast=contrast)
    else:
        adjusted_image = image

    return image


image_url = data['Image'][200][0]
image = preprocess(image_url)


# plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# plt.axis('off')
# plt.show()

In [5]:
import cv2
import torch
import torchvision.transforms as transforms
from torchvision.models import vgg16
from sklearn.preprocessing import normalize

# Load pre-trained VGG16 model
vgg16_model = vgg16(pretrained=True)
vgg16_model.eval()

# Define a function to extract features using VGG16
def extract_features_vgg16(image):
    img = cv2.resize(image, (224, 224))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_tensor = transforms.ToTensor()(img)
    img_tensor = torch.unsqueeze(img_tensor, 0)
    with torch.no_grad():
        features = vgg16_model.features(img_tensor)
    features = torch.nn.functional.normalize(features, p=2, dim=1)
    features = features.squeeze().numpy()
    return features.flatten()

# Define a function to extract normalized features using VGG16
def extract_features_normalized_vgg16(image):
    extracted_features = extract_features_vgg16(image)
    normalized_features = normalize([extracted_features], norm='l2')
    return normalized_features[0]





In [6]:
def imageTOvec(image_url):
    image=preprocess(image_url)
    if image is None:
        return None
    features = extract_features_normalized_vgg16(image)
    if features is None:
        return None
    return features


def cosine_similarity(vector_a, vector_b):
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [7]:
data['Image_Vectors'] = None

In [8]:

for i, row in data.iterrows():
    print(i)
    vectors = []
    for url in row['Image']:
        vector = imageTOvec(url)
        if vector is not None:
            vectors.append(vector)
    if len(vectors) ==0:
        # drop the whole row
        data = data.drop(i)
    else:
        data.at[i, 'Image_Vectors'] = vectors
    if (i>300):
        break
    

# Print the updated DataFrame
print(data)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [9]:

for i, row in data.iterrows():
    print(i)
    if i<300:
        continue
    vectors = []
    for url in row['Image']:
        vector = imageTOvec(url)
        if vector is not None:
            vectors.append(vector)
    if len(vectors) ==0:
        # drop the whole row
        data = data.drop(i)
    else:
        data.at[i, 'Image_Vectors'] = vectors
    if (i>600):
        break
    

# Print the updated DataFrame
print(data)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
2

In [10]:

for i, row in data.iterrows():
    print(i)
    if i<600:
        continue
    vectors = []
    for url in row['Image']:
        vector = imageTOvec(url)
        if vector is not None:
            vectors.append(vector)
    if len(vectors) ==0:
        # drop the whole row
        data = data.drop(i)
    else:
        data.at[i, 'Image_Vectors'] = vectors

    

# Print the updated DataFrame
print(data)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
2

In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Review Text,Image_Vectors
0,3452,[https://images-na.ssl-images-amazon.com/image...,Loving these vintage springs on my vintage str...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,1205,[https://images-na.ssl-images-amazon.com/image...,Works great as a guitar bench mat. Not rugged ...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.563436856848..."
2,1708,[https://images-na.ssl-images-amazon.com/image...,We use these for everything from our acoustic ...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,2078,[https://images-na.ssl-images-amazon.com/image...,Great price and good quality. It didn't quite...,"[[0.008398145451137908, 0.0022459865717929435,..."
4,801,[https://images-na.ssl-images-amazon.com/image...,I bought this bass to split time as my primary...,"[[0.015007795081277633, 0.009100475550438039, ..."


In [33]:
import pickle
with open('data3.pkl', 'wb') as f:
    pickle.dump(data, f)

# Load the dataset back
with open('data.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Print loaded data to verify
print(loaded_data)

     Unnamed: 0                                              Image  \
0          3452  [https://images-na.ssl-images-amazon.com/image...   
1          1205  [https://images-na.ssl-images-amazon.com/image...   
2          1708  [https://images-na.ssl-images-amazon.com/image...   
3          2078  [https://images-na.ssl-images-amazon.com/image...   
4           801  [https://images-na.ssl-images-amazon.com/image...   
..          ...                                                ...   
995        1265  [https://images-na.ssl-images-amazon.com/image...   
996        1882  [https://images-na.ssl-images-amazon.com/image...   
997        1547  [https://images-na.ssl-images-amazon.com/image...   
998        1004  [https://images-na.ssl-images-amazon.com/image...   
999        1306  [https://images-na.ssl-images-amazon.com/image...   

                                           Review Text  \
0    love vintag spring vintag strat good tension g...   
1    work great guitar bench mat rug enough

In [13]:
droped_data = data.dropna()

In [14]:
#  drop rows with review text being nan
print(droped_data)

     Unnamed: 0                                              Image  \
0          3452  [https://images-na.ssl-images-amazon.com/image...   
1          1205  [https://images-na.ssl-images-amazon.com/image...   
2          1708  [https://images-na.ssl-images-amazon.com/image...   
3          2078  [https://images-na.ssl-images-amazon.com/image...   
4           801  [https://images-na.ssl-images-amazon.com/image...   
..          ...                                                ...   
995        1265  [https://images-na.ssl-images-amazon.com/image...   
996        1882  [https://images-na.ssl-images-amazon.com/image...   
997        1547  [https://images-na.ssl-images-amazon.com/image...   
998        1004  [https://images-na.ssl-images-amazon.com/image...   
999        1306  [https://images-na.ssl-images-amazon.com/image...   

                                           Review Text  \
0    Loving these vintage springs on my vintage str...   
1    Works great as a guitar bench mat. Not

In [15]:
#  preprorcess revierw texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer as ps

# Download the stopwords corpus if you haven't already
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def preprocess_review_text(review_text):
    # print(review_text)
    # print index of the reivew text
    review_text = str(review_text)
    review_text = review_text.lower()
    review_text = re.sub(r'[^\w\s]', '', review_text)
    # remove stop words
    review_text = ' '.join([word for word in review_text.split() if word not in stop_words])
    # do stemming
    review_text = ' '.join([ps().stem(word) for word in review_text.split()])
    return review_text



data['Review Text'] = data['Review Text'].apply(preprocess_review_text)

data.head()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Image,Review Text,Image_Vectors
0,3452,[https://images-na.ssl-images-amazon.com/image...,love vintag spring vintag strat good tension g...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,1205,[https://images-na.ssl-images-amazon.com/image...,work great guitar bench mat rug enough abus ta...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.563436856848..."
2,1708,[https://images-na.ssl-images-amazon.com/image...,use everyth acoust bass ukulel know smaller mo...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,2078,[https://images-na.ssl-images-amazon.com/image...,great price good qualiti didnt quit match radi...,"[[0.008398145451137908, 0.0022459865717929435,..."
4,801,[https://images-na.ssl-images-amazon.com/image...,bought bass split time primari bass dean edg m...,"[[0.015007795081277633, 0.009100475550438039, ..."


In [16]:
import numpy as np
from collections import Counter
import math

def tokenize(text):
    # Split text into tokens (words)
    return text.split()

def calculate_tf(text):
    # Calculate term frequency (TF) for each word in the text
    tokens = tokenize(text)
    word_count = Counter(tokens)
    total_words = len(tokens)
    tf = {word: count / total_words for word, count in word_count.items()}
    return tf

def calculate_idf(documents):
    # Calculate inverse document frequency (IDF) for each word
    total_documents = len(documents)
    all_words = set([word for document in documents for word in tokenize(document)])
    idf = {}
    for word in all_words:
        doc_count = sum([1 for document in documents if word in tokenize(document)])
        idf[word] = math.log10(total_documents / doc_count)
    return idf

def calculate_tfidf(text, idf):
    # Calculate TF-IDF for each word in the text using precomputed IDF values
    tf = calculate_tf(text)
    tfidf = {word: tf[word] * idf[word] for word in tf}
    return tfidf

# make tfidf form review text
dataset = data['Review Text'].tolist()

# Calculate IDF for the dataset
idf = calculate_idf(dataset)

# Calculate TF-IDF for each document
tfidf_matrix = []
for document in dataset:
    tfidf = calculate_tfidf(document, idf)
    tfidf_matrix.append(tfidf)



In [17]:
#  normalise the tfidf matrix

# tfidf_matrix = normalize(tfidf_matrix, axis=1, norm='l2')



#  print tfidk in a tablular manner
tfidf_df = pd.DataFrame(tfidf_matrix)
#  convert thge nan to zeros
tfidf_df = tfidf_df.fillna(0)
#  print the first 5 rows
print(tfidf_df.head())



       love    vintag    spring     strat      good  tension     great  \
0  0.060264  0.232282  0.295898  0.090928  0.045850  0.13962  0.032906   
1  0.000000  0.000000  0.000000  0.000000  0.029902  0.00000  0.021461   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.057313  0.00000  0.041133   
4  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000  0.010730   

    stabil     float     bridg  ...  mayer  importantli  toneprint  \
0  0.13962  0.168018  0.097727  ...    0.0          0.0        0.0   
1  0.00000  0.000000  0.000000  ...    0.0          0.0        0.0   
2  0.00000  0.000000  0.000000  ...    0.0          0.0        0.0   
3  0.00000  0.000000  0.000000  ...    0.0          0.0        0.0   
4  0.00000  0.000000  0.000000  ...    0.0          0.0        0.0   

   stringthru  stopflair  biggi  accord  screenshot  amazoncom  piti  
0         0.0        0.0    0.0     0.0         0.0        0.0 

In [18]:
#  calculate cosine similarity between two review texts

def cosine_similarity(text1, text2):

    tfidf1 = calculate_tfidf(text1, idf)
    tfidf2 = calculate_tfidf(text2, idf)

    # Calculate the dot product of tfidf1 and tfidf2
    dot_product = sum([tfidf1[word] * tfidf2[word] for word in tfidf1 if word in tfidf2])

    # Calculate the magnitude of tfidf1 and tfidf2
    magnitude1 = math.sqrt(sum([tfidf1[word]**2 for word in tfidf1]))
    magnitude2 = math.sqrt(sum([tfidf2[word]**2 for word in tfidf2]))

    # Calculate the cosine similarity
    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity

#  calculate cosine similarity between two review texts
text1 = data['Review Text'][0]
text2 = data['Review Text'][1]
similarity = cosine_similarity(text1, text2)
print(similarity, "1")

0.0113100827022372 1


In [19]:
#  creaate a ranked index of the most similar review texts

def most_similar_reviews(query_text, num_results=5):

    # Calculate the cosine similarity between the query text and all the review texts
    similarities = [cosine_similarity(query_text, review_text) for review_text in dataset]

    # Sort the review texts by their similarity to the query text
    most_similar_indices = np.argsort(similarities)[::-1]

    # Return the top N most similar review texts
    return most_similar_indices[:num_results]

#  print the most similar review texts

query_text = data['Review Text'][0]
similar_reviews = most_similar_reviews(query_text)
print(similar_reviews)

#  print the texts of the most similar review texts
for i in similar_reviews:
    print(data['Review Text'][i])
    print('\n')


[  0 269 801 746 754]
love vintag spring vintag strat good tension great stabil float bridg want spring way go


let low price fool incred devic free mix softwar year sinc track anyth back everyth solid state analogu scarlett solo moder comput larg monitor stress larg much control softwar realli lay seriou track new get back get pro custom support tube video con mani featur scarlet pro softwar instruct download simpl user manual


save moneydo fit properli 1gap around guitar allow guitar slide forward hit end case headstock buy


fabtast build qualiti rich fuzzi good surpris let light fuzz distor blue heavi fuzz unaud grung rucku meant perceiv get squar circl rhombu im boss


tuner good one pricemak happi love behring




In [20]:
print(cosine_similarity(data['Review Text'][0], data['Review Text'][271]))
print(cosine_similarity(data['Review Text'][0], data['Review Text'][805]))
print(cosine_similarity(data['Review Text'][0], data['Review Text'][750]))
print(cosine_similarity(data['Review Text'][0], data['Review Text'][758]))    

0.3405783487638787
0.24197358832056767
0.21632954560853315
0.14955751792697444


In [21]:
print(data['Review Text'][0])

love vintag spring vintag strat good tension great stabil float bridg want spring way go


In [22]:
#  save the tfidf matrix to a csv file
tfidf_df.to_csv('tfidf.csv', index=False)

# load the tfidf matrix from the csv file
#loaded_tfidf_df = pd.read_csv('tfidf.csv')
#print(loaded_tfidf_df.head())

In [23]:
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Review Text,Image_Vectors
0,3452,[https://images-na.ssl-images-amazon.com/image...,love vintag spring vintag strat good tension g...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,1205,[https://images-na.ssl-images-amazon.com/image...,work great guitar bench mat rug enough abus ta...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.563436856848..."
2,1708,[https://images-na.ssl-images-amazon.com/image...,use everyth acoust bass ukulel know smaller mo...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,2078,[https://images-na.ssl-images-amazon.com/image...,great price good qualiti didnt quit match radi...,"[[0.008398145451137908, 0.0022459865717929435,..."
4,801,[https://images-na.ssl-images-amazon.com/image...,bought bass split time primari bass dean edg m...,"[[0.015007795081277633, 0.009100475550438039, ..."


In [24]:
print(data['Image_Vectors'][0][0][0])

0.0


In [25]:
print(data.head())

   Unnamed: 0                                              Image  \
0        3452  [https://images-na.ssl-images-amazon.com/image...   
1        1205  [https://images-na.ssl-images-amazon.com/image...   
2        1708  [https://images-na.ssl-images-amazon.com/image...   
3        2078  [https://images-na.ssl-images-amazon.com/image...   
4         801  [https://images-na.ssl-images-amazon.com/image...   

                                         Review Text  \
0  love vintag spring vintag strat good tension g...   
1  work great guitar bench mat rug enough abus ta...   
2  use everyth acoust bass ukulel know smaller mo...   
3  great price good qualiti didnt quit match radi...   
4  bought bass split time primari bass dean edg m...   

                                       Image_Vectors  
0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
1  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.563436856848...  
2  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  
3  [[0.008398145451137908, 0.00224

In [26]:
print(data['Image_Vectors'][0])

[array([0.        , 0.        , 0.        , ..., 0.00622895, 0.        ,
       0.        ])]


In [27]:
tfidf = pd.read_csv('tfidf.csv')

In [28]:
tfidf.head()

Unnamed: 0,love,vintag,spring,strat,good,tension,great,stabil,float,bridg,...,mayer,importantli,toneprint,stringthru,stopflair,biggi,accord,screenshot,amazoncom,piti
0,0.060264,0.232282,0.295898,0.090928,0.04585,0.13962,0.032906,0.13962,0.168018,0.097727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.029902,0.0,0.021461,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.057313,0.0,0.041133,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.01073,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(data['Image'][1][2])

https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg


In [None]:
def text_similarity_Score_Tfidf(doci, docj):
    # use tfidf doc indices using cosine similarity
    tfidf1 = tfidf.iloc[doci]
    tfidf2 = tfidf.iloc[docj]


    # calculate the cosine similarity
    similarity = np.dot(tfidf1, tfidf2) / (np.linalg.norm(tfidf1) * np.linalg.norm(tfidf2))
    return similarity

print(text_similarity_Score_Tfidf(0, 1))

0.011310082702237163


In [None]:
# make a matrix of the similarity scores
n = len(data)
similarity_matrix = np.zeros((n, n))
for i in range(n):
    if i%50==0:
        print(i)
    for j in range(n):
        similarity_matrix[i, j] = text_similarity_Score_Tfidf(i, j)

# plot the similarity matrix
plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
plt.show()


0
50
100


KeyboardInterrupt: 

In [None]:
print(similarity_matrix[1])

In [None]:
with open('SmMatrix.pkl', 'wb') as f:
    pick le.dump(similarity_matrix, f)

# Load the dataset back
with open('SmMatrix.pkl', 'rb') as f:
    simlarity_M = pickle.load(f)

# Print loaded data to verify
print(simlarity_M)

In [None]:
def image_similarity(imgVector1, imgVector2):
    # calculate the cosine similarity
    similarity = np.dot(imgVector1, imgVector2) / (np.linalg.norm(imgVector1) * np.linalg.norm(imgVector2))
    return similarity



In [None]:
# for each document in the dataset load the list of image vectors and set image similarity scores of two doccuments as the value of most similar images using data 
# define an image similarity matrix
image_similarity_M = np.zeros((len(data), len(data)))


for i in range(len(data)):
    image_vectorsi= data.iloc[i]['Image_Vectors']
    if i%50==0:
        print(i)
    for j in range(len(data)):
        image_vectorsj= data.iloc[j]['Image_Vectors']
        # iterate through all the image vectors in both vectors list and take the maximum
        max_similarity = 0
        for img1 in image_vectorsi:
            for img2 in image_vectorsj:
                similarity = image_similarity(img1[0], img2[0])
                if similarity > max_similarity:
                    max_similarity = similarity
        image_similarity_M[i][j] = max_similarity

# save the image similarity matrix in csv


    

In [None]:
image_similarity_M = pd.DataFrame(image_similarity_M)
image_similarity_M.to_csv('image_similarity.csv', index=False)


