# Week 10: Feature Hashing and LSH

In [1]:
import json, glob

cleaned_data = []

#Load every datafile
for file in glob.glob('full/*'):
    data = json.load(open(file))
    
    for elem in data:
        #Check if the article has both an topic and body-element
        if 'topics' in list(elem.keys()) and 'body' in list(elem.keys()):
            cleaned_data.append(elem)
            
print("Number of articles:", len(cleaned_data))

Number of articles: 10377


In [2]:
#Construct dictionary over unique words in all articles
dictionary = set()
for article in cleaned_data:
    for word in article['body'].lower().split():
        dictionary.add(word)
print("Number of unique words:", len(dictionary))

Number of unique words: 70793


In [3]:
# Make it into a dictioniary, with each word having an index as value
from collections import defaultdict
features = defaultdict(int)
for i, word in enumerate(dictionary):
    features[word] = i

In [4]:
import numpy as np
bag_of_words = np.zeros((len(cleaned_data), len(features)))

#Consturct bag-of-words for all articles by looking up the ID of all the words
for i, article in enumerate(cleaned_data):
    for word in article['body'].lower().split():
        bag_of_words[i][features[word]] += 1

print("articles, features")        
print(bag_of_words.shape)

articles, features
(10377, 70793)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#Prepare data
y = np.zeros(len(cleaned_data))
for i, article in enumerate(cleaned_data):
    if 'earn' in article['topics']:
        y[i] = 1
        
X = bag_of_words

#Split data into test and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Construct RFC
clf = RandomForestClassifier(n_estimators = 50)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [7]:
#Test performance
print("Accuracy on train-set =", clf.score(X_train, y_train))
print("Accuracy on test-set =", clf.score(X_test, y_test))

Accuracy on train-set = 1.0
Accuracy on test-set = 0.946531791908


### Feature Hashing

In [8]:
num_bins = 1000
hashed = np.zeros((len(cleaned_data), num_bins))

for i, article in enumerate(cleaned_data):
    for word in article['body'].lower().split():
        #Just hash the features 
        hashed[i][features[word] % num_bins] += 1

print("articles, features")        
print(hashed.shape)

articles, features
(10377, 1000)


In [9]:
X = hashed


#Split data into test and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Construct RFC
clf = RandomForestClassifier(n_estimators = 50)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [10]:
#Test performance
print("Accuracy on train-set = ", clf.score(X_train, y_train))
print("Accuracy on test-set = ", clf.score(X_test, y_test))

Accuracy on train-set =  1.0
Accuracy on test-set =  0.948458574181


## Exercise 10.2 LSH

In [5]:
from PIL import Image
#Open image and make it gray-scale
im = Image.open("img").convert('L')

#Resize image
img = np.array(im.resize((9,8)))


[[255 255 255 255 255 255 255 255 255]
 [255 255 255  92 180  89 255 255 255]
 [255 255 140  89 180 180 111 255 255]
 [255  93 180 180 180 180 180 255 255]
 [255  89  89  89 180 180 180  89 255]
 [255 132  89  89 180 180 180 255 255]
 [255 181  89  89  89  90 176 179 255]
 [255 255 255 255 255 255 255 255 255]]


array([[False, False, False, False, False, False, False, False],
       [False, False,  True, False,  True, False, False, False],
       [False,  True,  True, False, False,  True, False, False],
       [ True, False, False, False, False, False, False, False],
       [ True, False, False, False, False, False,  True, False],
       [ True,  True, False, False, False, False, False, False],
       [ True,  True, False, False, False, False, False, False],
       [False, False, False, False, False, False, False, False]], dtype=bool)

In [6]:
images = sorted(glob.glob('*.jpeg'))

In [10]:
features_all = []
width = 9
height = 8

# For every image in the folder
for image in sorted(glob.glob('*.jpeg')):
    #Open and resize
    im = Image.open(image).convert('L')

    img = np.array(im.resize((width,height)))

    #Do differencing
    diff = img[:, :-1] > img[:,1:]

    features = []
    print(image, end = ': ')
    
    #Use Davids script from Aula
    for difference in diff:
        decimal_value = 0
        hex_string = []
        for index, value in enumerate(difference):
            #If True, fill the position
            if value:
                decimal_value += 2**(index % (width - 1))
            #If reached the end, append it to the string and add it to features
            if (index % (width-1)) == (width-2):
                hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
                features.append(decimal_value)
                decimal_value = 0
        print(''.join(hex_string), end='')
    print()
    
    features_all.append(features)

banana.jpeg: 0000212139331e07
banana2.jpeg: 808141312123260c
orange1.jpeg: 000e2d0d4f0f060e
orange2.jpeg: 0613160747070e0c


In [11]:
from pprint import pprint
from sklearn.metrics.pairwise import * 
import pandas as pd

#Compute Cosine Similarity
similarities = cosine_similarity(features_all)
images = sorted(glob.glob('*.jpeg'))

df = pd.DataFrame(similarities, index=images, columns= images)
df

Unnamed: 0,banana.jpeg,banana2.jpeg,orange1.jpeg,orange2.jpeg
banana.jpeg,1.0,0.437564,0.825922,0.7762
banana2.jpeg,0.437564,1.0,0.447148,0.496058
orange1.jpeg,0.825922,0.447148,1.0,0.961906
orange2.jpeg,0.7762,0.496058,0.961906,1.0


In [15]:
features_all = []
#Increase number of bins for better precision.
width = 25
height = 24

# For every image in the folder
for image in sorted(glob.glob('*.jpeg')):
    #Open and resize
    im = Image.open(image).convert('L')

    img = np.array(im.resize((width,height)))

    #Do differencing
    diff = img[:, :-1] > img[:,1:]

    features = []
    
    #Use Davids script from Aula
    for difference in diff:
        decimal_value = 0
        hex_string = []
        for index, value in enumerate(difference):
            #If True, fill the position
            if value:
                decimal_value += 2**(index % (width - 1))
            #If reached the end, append it to the string and add it to features
            if (index % (width-1)) == (width-2):
                hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
                features.append(decimal_value)
                decimal_value = 0
    
    features_all.append(features)
    

similarities = cosine_similarity(features_all)
names = sorted(glob.glob('*.jpeg'))

df = pd.DataFrame(similarities, index=images, columns= images)
df

Unnamed: 0,banana.jpeg,banana2.jpeg,orange1.jpeg,orange2.jpeg
banana.jpeg,1.0,0.874059,0.517271,0.494726
banana2.jpeg,0.874059,1.0,0.518566,0.457681
orange1.jpeg,0.517271,0.518566,1.0,0.901867
orange2.jpeg,0.494726,0.457681,0.901867,1.0
