## This file used to preprocess description
### [{man: 2, skateboard: 3, ...},{},{}]

In [54]:
import numpy as np
import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import LancasterStemmer
import pprint
from autocorrect import spell
import string
plural = WordNetLemmatizer()

In [55]:
# deal with five descriptions under one image
def preprocess(lines):
    res = []
    # stopword
    for i, line in enumerate(lines):
        line = re.sub("[^a-zA-Z]", " ", line).lower()
        line = ' '.join([w for w in line.split() if not w in stopwords.words("english")])
        line = ' '.join([w for w in line.split() if not w in string.punctuation])
        res.append(line)
    # lemmatizere  
    for i, sentence in enumerate(res):
        temp = []
        for word in sentence.split(" "):
            word2 = spell(word)
            word2 = plural.lemmatize(word2,'n')
            word2 = plural.lemmatize(word2,'v')
            word2 = plural.lemmatize(word2,'a')
            word2 = plural.lemmatize(word2,'r')
            temp.append(word2)
        res[i] = ' '.join(temp)
    return res

In [56]:
def load_description(isTrain):
    res = []
    if isTrain:
        folder_path = "data/descriptions_train/"
        num = 10000
    else:
        folder_path = "data/descriptions_test/"
        num = 2000
        
    for n in range(num):
        path = folder_path + str(n) + ".txt"
        txtfile = open(path, "r")
        lines = txtfile.read().split('\n')
        processed_lines = preprocess(lines)
#         print(lines)
#         print(processed_lines)
#         if n > 5:
#             break
        res.append(processed_lines)
    return res

In [78]:
def flatten_descrip(all_data):
    words = []
    for sentence in all_data:
        cur_word = []
        for line in sentence:
            for w in line.split():
                cur_word.append(w)
        words.append(cur_word)
    return words
            

In [57]:
description_train = load_description(True)


In [79]:
des_train_flat = flatten_descrip(description_train)

In [81]:
np.save('param/description_train', des_train_flat)
print(description_train[100])
print(des_train_flat[100])

['brown crack crust bake berry pie', 'brown crust pie strawberry file', 'top pie look crusty good', 'close cook fruit flavor muffin', 'image sort strawberry flaky mastery display', 'a']
['brown', 'crack', 'crust', 'bake', 'berry', 'pie', 'brown', 'crust', 'pie', 'strawberry', 'file', 'top', 'pie', 'look', 'crusty', 'good', 'close', 'cook', 'fruit', 'flavor', 'muffin', 'image', 'sort', 'strawberry', 'flaky', 'mastery', 'display', 'a']


In [82]:
description_test = load_description(False)
des_test_flat = flatten_descrip(description_test)
np.save('param/description_test', des_test_flat)


In [83]:
print(des_test_flat[10])

['leave', 'turn', 'sign', 'hang', 'side', 'wooden', 'pole', 'sign', 'road', 'many', 'vehicle', 'park', 'picture', 'street', 'busy', 'city', 'street', 'green', 'traffic', 'light', 'sit', 'near', 'park', 'lot', 'white', 'build', 'right', 'turn', 'sign', 'turn', 'upside', 'pole', 'a']


### Build Inside BoW

In [88]:
# for each image
def build_BoW_inside(descriptions):
    word_dict = {}
    for descipt in descriptions:
        for word in descipt.split():
#             if len(word) < 2:
#                 continue
            hasRepeat, repeatWord = check_repeat(word, word_dict)
            if hasRepeat:
                word_dict[repeatWord] += 1
            else:
                word_dict[word] = 1
    return word_dict

def check_repeat(word, word_dict):
    if word_dict is None:
        return False, None
    for dict_w in word_dict:
        if dict_w.startswith(word):
            return True, dict_w
    return False, None
    

In [91]:
description_dict_train = []
for line in description_train:
    cur_dict = build_BoW_inside(line)
    description_dict_train.append(cur_dict)

In [93]:
print(description_dict_train[0])

{'skateboarder': 5, 'put': 1, 'show': 1, 'use': 1, 'picnic': 4, 'table': 5, 'stage': 1, 'pull': 1, 'trick': 2, 'top': 2, 'man': 1, 'rid': 2, 'boarder': 1, 'person': 1, 'crowd': 1, 'watch': 1, 'a': 1}


In [106]:
# deal with "skate board" -- "skateboard"
def restore_description(dic):
    res = []
    for line in dic:
        cur_res = []
        for key, val in line.items():
            for i in range(val):
                cur_res.append(key)
        res.append(' '.join(cur_res))
    return res

In [107]:
restore_descrip_train = restore_description(description_dict_train)

In [108]:
print(restore_descrip_train[0])
np.save('param/restore_descrip_train', restore_descrip_train)

skateboarder skateboarder skateboarder skateboarder skateboarder put show use picnic picnic picnic picnic table table table table table stage pull trick trick top top man rid rid boarder person crowd watch a


In [109]:
description_dict_test = []
for line in description_test:
    cur_dict = build_BoW_inside(line)
    description_dict_test.append(cur_dict)
restore_descrip_test = restore_description(description_dict_test)
np.save('param/restore_descrip_test', restore_descrip_test)

In [110]:
print(description_dict_test[0])

{'woman': 5, 'walk': 4, 'street': 1, 'past': 2, 'doorway': 2, 'sidewalk': 4, 'talk': 2, 'cell': 2, 'phone': 3, 'check': 2, 'watch': 2, 'wear': 1, 'blue': 1, 'along': 2, 'front': 1, 'build': 1, 'black': 1, 'planter': 1, 'entrance': 1}


### Build the BoW of all descriptions


In [70]:
def build_Bow_outside(dics):
    bag_word = []
    bag_word_dict = {}
    for one_dic in dics:
        for key, val in one_dic.items():
            bag_word.append(key)
            if key not in bag_word_dict:
                bag_word_dict[key] = 0
            bag_word_dict[key] += val
    return list(set(bag_word)), bag_word_dict

In [71]:
bag_of_words, bag_of_dict = build_Bow_outside(description_dict_train)

In [72]:
print(bag_of_dict)

{'skateboarder': 292, 'put': 99, 'show': 471, 'use': 324, 'picnic': 69, 'table': 2603, 'stage': 20, 'pull': 284, 'trick': 333, 'top': 2238, 'man': 5961, 'rid': 1357, 'boarder': 58, 'person': 2054, 'crowd': 345, 'watch': 475, 'bowl': 507, 'soup': 61, 'carrot': 216, 'shrimp': 9, 'noodle': 29, 'healthy': 10, 'food': 1123, 'ready': 326, 'eat': 954, 'sit': 5568, 'next': 3133, 'chopstick': 12, 'tasty': 25, 'ramen': 3, 'serve': 183, 'someone': 245, 'enjoy': 92, 'asian': 68, 'walk': 1544, 'across': 300, 'street': 2656, 'busy': 205, 'intersection': 182, 'ice': 74, 'cream': 58, 'truck': 624, 'drive': 391, 'behind': 554, 'cross': 278, 'near': 1673, 'icecream': 1, 'concession': 1, 'young': 1479, 'boy': 1058, 'throw': 241, 'frisbee': 780, 'grassy': 512, 'field': 1818, 'park': 2132, 'tree': 1190, 'line': 380, 'kid': 293, 'city': 927, 'bright': 119, 'green': 1122, 'child': 790, 'yard': 135, 'hold': 2809, 'bat': 413, 'raring': 2, 'back': 453, 'baseball': 1225, 'little': 558, 'black': 1746, 'clothe': 1

In [76]:
def build_BoW_clean(bdict, threshold = 10):
    bow_clean = []
    for key, val in bdict.items():
        if val > threshold:
            bow_clean.append(key)
 
    print("after cutting: " + str(len(bow_clean)))
    return bow_clean

In [77]:
bag_of_words_short = build_BoW_clean(bag_of_dict, 10)

after cutting: 1658


In [113]:
print(bag_of_words_short)

['skateboarder', 'put', 'show', 'use', 'picnic', 'table', 'stage', 'pull', 'trick', 'top', 'man', 'rid', 'boarder', 'person', 'crowd', 'watch', 'bowl', 'soup', 'carrot', 'noodle', 'food', 'ready', 'eat', 'sit', 'next', 'chopstick', 'tasty', 'serve', 'someone', 'enjoy', 'asian', 'walk', 'across', 'street', 'busy', 'intersection', 'ice', 'cream', 'truck', 'drive', 'behind', 'cross', 'near', 'young', 'boy', 'throw', 'frisbee', 'grassy', 'field', 'park', 'tree', 'line', 'kid', 'city', 'bright', 'green', 'child', 'yard', 'hold', 'bat', 'back', 'baseball', 'little', 'black', 'clothe', 'shoulder', 'fence', 'play', 'small', 'bedroom', 'desk', 'computer', 'chair', 'front', 'brown', 'beside', 'window', 'big', 'donut', 'feed', 'another', 'hand', 'blonde', 'look', 'hole', 'doughnut', 'powder', 'sugar', 'face', 'power', 'cover', 'flood', 'part', 'frost', 'stand', 'wood', 'wear', 'hat', 'glass', 'sunglass', 'grass', 'tie', 'crown', 'blue', 'shirt', 'weird', 'around', 'brick', 'build', 'sign', 'right

### TFIDF

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

cv = CountVectorizer(min_df = 3)
X_train_bow = cv.fit_transform(restore_descrip_train).toarray()

vocab = np.array(cv.get_feature_names())
print(X_train_bow.shape)
print(vocab.shape)

transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_bow).toarray()

print(X_train_tfidf.shape)
# tvector = TfidfVectorizer()
# X_train_tfidf = tvector.fit_transform(X_train_bow).todense()
# print(X_train_tfidf.shape)

# Convert into DataFrames
X_train_tfidf = pd.DataFrame(X_train_tfidf, columns = vocab)
X_train_bow = pd.DataFrame(X_train_bow, columns = vocab)
X_train_tfidf.head()




(10000, 3017)
(3017,)
(10000, 3017)


Unnamed: 0,abandon,aboard,abstract,accent,accessory,accident,accompany,across,action,ad,...,yellow,yogurt,york,young,youth,yummy,zebra,zone,zoo,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274598,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.216419,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07127,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
cv = CountVectorizer(vocabulary = vocab)
X_test_bow = cv.fit_transform(restore_descrip_test).toarray()

vocab = np.array(cv.get_feature_names())
print(X_test_bow.shape)
print(vocab.shape)

transformer = TfidfTransformer()
X_test_tfidf = transformer.fit_transform(X_test_bow).toarray()

print(X_test_tfidf.shape)
# tvector = TfidfVectorizer()
# X_train_tfidf = tvector.fit_transform(X_train_bow).todense()
# print(X_train_tfidf.shape)

# Convert into DataFrames
X_test_tfidf = pd.DataFrame(X_test_tfidf, columns = vocab)
X_test_bow = pd.DataFrame(X_test_bow, columns = vocab)
X_test_tfidf.head()

(2000, 3017)
(3017,)
(2000, 3017)


Unnamed: 0,abandon,aboard,abstract,accent,accessory,accident,accompany,across,action,ad,...,yellow,yogurt,york,young,youth,yummy,zebra,zone,zoo,zucchini
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Build Tag : Y

In [135]:
def load_tag(isTrain):
    all_tag = []
    tag_for_one_image = []
    if isTrain:
        folder_path = "data/tags_train/"
        num = 10000
    else:
        folder_path = "data/tags_test/"
        num = 2000
        
    for n in range(num):
        path = folder_path + str(n) + ".txt"
        txtfile = open(path, "r")
        lines = txtfile.read().split('\n')
        # line:   sport: base ball
        temp = ''
        for line in lines:
            words = line.split(':')
            if words[0] == '':
                break
            tags = words[1]
            tags = tags.replace(" ", "")
            temp = temp + ' ' + tags
            all_tag.append(tags)
        tag_for_one_image.append(temp)
        
    all_tag = list(set(all_tag))
    return tag_for_one_image, all_tag

In [136]:
tag2image_train, tag_train = load_tag(True)

In [139]:
print(len(tag_train))

80


In [140]:
cv = CountVectorizer(vocabulary = tag_train)
tags_train_0_1 = cv.fit_transform(tag2image_train).toarray()

In [142]:
print(tags_train_0_1.shape)

(10000, 80)


In [148]:
#HAVEN't RUN
tag2image_test, tag_test = load_tag(False)
cv = CountVectorizer(vocabulary = tag_train) # train!
tags_test_0_1 = cv.fit_transform(tag2image_test).toarray()

## Random Forest

In [143]:
from sklearn import preprocessing
import csv
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

predict_tags = []
for i in range(80):
    y = tags_train_0_1[:,i]
    clf = RandomForestRegressor()
    clf.fit(X_train_tfidf, y)
    tag = clf.predict(X_test_tfidf)
    predict_tags.append(tag)

In [145]:
predict_tags_trans = np.array(predict_tags).T
np.save('param/random_forest_predict', predict_tags)
predict_tags_trans.shape

(2000, 80)

## KNN

In [152]:
test_labels = [i for i in range(2000)]
knn = KNN(n_neighbors = 1)
knn = knn.fit(tags_test_0_1, test_labels)
predictions = knn.kneighbors(predict_tags_trans, return_distance = False)


In [159]:
print(predictions[100])
print(tags_test_0_1.shape)

[1395]
(2000, 80)


## KNN 2

In [168]:
features_test = np.load("data/sort_feat_test.npy")
print(features_test.shape)
print(np.array(test_labels).shape)
print(predictions.shape)
predict_feature = features_test[predictions].reshape(2000, 1000)


(2000, 1000)
(2000,)
(2000, 1)


In [169]:
knn2 = KNN(n_neighbors = 20)
knn2 = knn2.fit(features_test, test_labels)
predict_final = knn2.kneighbors(predict_feature, return_distance = False)

## Export CSV

In [170]:
def export_csv(data, filename = "lr.csv"):
    list = []
    for i, row in enumerate(data):
        temp = ''
        for j, val in enumerate(row):
            temp = temp + ' ' + (str(val) + ".jpg")
        list.append(temp)
    images = pd.DataFrame(list, columns = ["Top_20_Image_IDs"])

    index = []
    for i in range(2000):
        index.append(str(i)+ ".txt")
    index = pd.DataFrame(index, columns = ["Descritpion_ID"])

    result = pd.concat([index, images], axis=1)
    result.to_csv(filename, index = False)

In [171]:
export_csv(predict_final, "rf_tag_knn.csv")

In [172]:
predict_final = pd.DataFrame(predict_final)

print(predict_final.head())

     0     1     2     3     4     5    6     7     8     9     10    11  \
0   124   254   790  1799    75   364  132   752  1119  1447  1818   825   
1  1341   383  1911  1209   121  1522  211  1353  1645  1652  1355   193   
2   598  1139    60   168  1390  1001  198   564   824   151  1113   688   
3    75   790   785  1099  1630   124  752   156   104  1172  1243  1447   
4    36    38  1689  1022  1062   943    8  1820  1489  1415   636  1916   

     12    13    14    15    16    17    18    19  
0   156  1463   683  1243  1455   531  1731    39  
1  1232  1087   977   711  1410  1950   155   188  
2   722  1086  1701  1151  1762  1635   947   944  
3  1280   907   364   486   127  1602  1639  1119  
4   632   731   293  1327   317   389   434   606  
