In [1]:
import pandas as pd
import pickle
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Upper bound of the n-gram range given to CountVectorizer (ngram_range = (1, n)).
n = 3
# Value passed to CountVectorizer as max_features (vocabulary size cap).
MAX_VOCAB = 999

In [2]:
def form_docs(df):
    """Turn every row of ``df`` into one space-delimited document string.

    Only positional columns 2-5 of each row are used (assumed to be handle,
    title, body_html and product_type -- TODO confirm against the frame's
    column order). The title and body_html fields (offsets 1 and 2 inside
    that slice) are stripped of newlines, ASCII letters, digits, underscores,
    '+', punctuation and spaces; every remaining character is then followed
    by a space so a whitespace tokenizer yields one token per character.
    The other fields are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least six columns.

    Returns
    -------
    list of str
        One document per row, in row order. Missing fields (NaN / None)
        contribute the literal token ``'<NAN>'``.
    """
    docs = []
    for row in df.values:
        parts = []
        for idx, inst in enumerate(row[2:6]):
            if inst is None or inst != inst:
                # NaN is the only value that is unequal to itself.
                parts.append('<NAN>')
            elif idx == 1 or idx == 2:                                       #title, body_html
                text = str(inst).replace('\n', '')
                # ASCII letters, underscore and '+' (explicit class instead of
                # the accidental A-z range, which only matched '_' by chance).
                text = re.sub(r'[A-Za-z_+]', r'', text)
                text = re.sub(r'[^\w\s]', r'', text)
                text = re.sub(r'[0-9]+', r'', text)
                text = text.replace(' ', '')
                # Raw string: '\g' is not a valid escape in a normal literal.
                text = re.sub(r'(.)', r'\g<1> ', text)
                parts.append(text)
            else:
                parts.append(str(inst))
        # Join once per row; no per-row `del`, which failed whenever the
        # cleaned-text variable had not been bound for that row.
        docs.append(' '.join(parts))
    return docs

In [3]:
# Load the object stored in ./labelled.pickle into `df` (later cells use it as a DataFrame).
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('./labelled.pickle', 'rb') as file:
    df = pickle.load(file)

In [4]:
# Build one document per row, then learn the n-gram vocabulary and the IDF
# weights from the whole table.
corpus = form_docs(df)
cv = CountVectorizer(
    max_features=MAX_VOCAB,
    tokenizer=str.split,      # documents are already space-delimited
    ngram_range=(1, n),
)
term_counts = cv.fit_transform(corpus)
tfidf_trans = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_trans.fit(term_counts)
# The count matrix is only needed for fitting; release it.
del term_counts

NameError: name 't_docs' is not defined

In [4]:
# Training set: discard unlabelled rows (labels == -1), zero-priced rows and
# rows priced above 1,000,000 in a single drop.
is_unlabelled = df['labels'] == -1
is_zero_price = df['price'] == 0
is_price_outlier = df['price'] > 1000000
training = df.drop(df[is_unlabelled | is_zero_price | is_price_outlier].index)

In [5]:
# Vectorise the filtered training rows with the already-fitted vocabulary and IDF weights.
t_docs = form_docs(training)
wordcount = cv.transform(t_docs)              # raw n-gram counts
X_tfidf = tfidf_trans.transform(wordcount)    # TF-IDF weighted version of the counts

In [6]:
# Squash every training price through the logistic function 1 / (1 + e^-x).
# The threshold is the largest finite float64, so the guard only fires for +inf.
# NOTE(review): in float64 the logistic evaluates to exactly 1.0 for inputs
# above roughly 37, so most positive prices collapse to one value -- confirm
# this is intended.
norm_price = [
    1 if price > 1.7976931348623157e+308 else 1 / (1 + np.exp(-price))
    for price in training['price']
]

In [7]:
# Feature matrix: dense TF-IDF columns followed by one squashed-price column.
X = np.concatenate(
    (X_tfidf.toarray(), np.array(norm_price)[:, np.newaxis]),
    axis=1,
)
# Target vector: the category label of each training row.
y = training['labels']

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 9-nearest-neighbour classifier on the training matrix (fit returns the estimator).
neigh = KNeighborsClassifier(n_neighbors=9).fit(X, y)
# Rows whose label is -1 are the ones left to predict: drop everything else.
is_labelled = df['labels'] != -1
testing = df.drop(df[is_labelled].index)

In [None]:
# Vectorise the unlabelled rows with the vocabulary / IDF weights fitted earlier.
test_docs = form_docs(testing)
t_wordcount = cv.transform(test_docs)
test_tfidf = tfidf_trans.transform(t_wordcount)

# The classifier was fitted on the TF-IDF columns plus one squashed-price
# column, so the same extra column must be appended here; predicting on
# test_tfidf alone would supply one feature too few.
t_norm_price = [
    1 if price > 1.7976931348623157e+308 else 1 / (1 + np.exp(-price))
    for price in testing['price']
]
X_test = np.append(test_tfidf.toarray(), np.array(t_norm_price)[:, np.newaxis], axis = 1)
pred = neigh.predict(X_test)

# One row per predicted product: its id and the predicted category.
result = pd.DataFrame({'product_id': testing['id'], 'category_id': pred})
with open('./result.pickle', 'wb') as out:
    pickle.dump(result, out)

In [30]:
# Reload the object stored in ./labelled.pickle into `df`.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('./labelled.pickle', 'rb') as file:
    df = pickle.load(file)

In [1]:
import pandas as pd
import pickle
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Upper bound of the n-gram range given to CountVectorizer (ngram_range = (1, n)).
n = 3
# Value passed to CountVectorizer as max_features (vocabulary size cap).
MAX_VOCAB = 999

def form_docs(df):
    """Turn every row of ``df`` into one space-delimited document string.

    Only positional columns 2-5 of each row are used (assumed to be handle,
    title, body_html and product_type -- TODO confirm against the frame's
    column order). The title and body_html fields are stripped of newlines,
    ASCII letters, digits, underscores, '+', punctuation and spaces; every
    remaining character is then followed by a space so a whitespace
    tokenizer yields one token per character. The other fields are kept as
    they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least six columns.

    Returns
    -------
    list of str
        One document per row, in row order. Missing fields (NaN / None)
        contribute the literal token ``'<NAN>'``.
    """
    docs = []
    for row in df.values:
        parts = []
        for idx, inst in enumerate(row[2:6]):
            if inst is None or inst != inst:
                # NaN is the only value that is unequal to itself.
                parts.append('<NAN>')
            # Offsets are relative to the row[2:6] slice, so they run 0..3:
            # title and body_html sit at 1 and 2 (4 can never occur).
            elif idx == 1 or idx == 2:                                       #title, body_html
                text = str(inst).replace('\n', '')
                # ASCII letters, underscore and '+' (explicit class instead of
                # the accidental A-z range, which only matched '_' by chance).
                text = re.sub(r'[A-Za-z_+]', r'', text)
                text = re.sub(r'[^\w\s]', r'', text)
                text = re.sub(r'[0-9]+', r'', text)
                text = text.replace(' ', '')
                # Raw string: '\g' is not a valid escape in a normal literal.
                text = re.sub(r'(.)', r'\g<1> ', text)
                parts.append(text)
            else:
                parts.append(str(inst))
        # Join once per row; no per-row `del`, which failed whenever the
        # cleaned-text variable had not been bound for that row.
        docs.append(' '.join(parts))
    return docs
if __name__ == '__main__':

    # ---- Load the pickled table ----
    # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
    print('Opening Data...')
    with open('./labelled.pickle', 'rb') as file:
        df = pickle.load(file)

    # ---- Learn the n-gram vocabulary and IDF weights from every row ----
    print('Forming Tfidf Table...')
    all_docs = form_docs(df)
    cv = CountVectorizer(
        max_features=MAX_VOCAB,
        tokenizer=str.split,      # documents are already space-delimited
        ngram_range=(1, n),
    )
    all_counts = cv.fit_transform(all_docs)
    tfidf_trans = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_trans.fit(all_counts)
    del all_counts, all_docs

    # ---- Training rows: labelled, non-zero price, price <= 1,000,000 ----
    print('Cleansing Data...')
    unusable = (df['labels'] == -1) | (df['price'] == 0) | (df['price'] > 1000000)
    training = df.drop(df[unusable].index)

    # ---- TF-IDF features for the training rows ----
    print('Forming Training Tfidf Vectors...')
    train_docs = form_docs(training)
    train_counts = cv.transform(train_docs)
    X_tfidf = tfidf_trans.transform(train_counts)
    del train_counts, train_docs

    # ---- Logistic squashing of the price, appended as one extra column ----
    # The threshold is the largest finite float64, so the guard only fires for +inf.
    print('Normalizing Training Price...')
    norm_price = [
        1 if price > 1.7976931348623157e+308 else 1 / (1 + np.exp(-price))
        for price in training['price']
    ]
    X = np.concatenate(
        (X_tfidf.toarray(), np.array(norm_price)[:, np.newaxis]),
        axis=1,
    )
    y = training['labels']

    # ---- Fit the 5-nearest-neighbour classifier ----
    print('Start Training Process...')
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X, y)
    del X_tfidf, norm_price
    print('Finished Training')

    # ---- TF-IDF features for the rows still labelled -1 ----
    print('Forming Testing Tfidf Vectors...')
    testing = df.drop(df[df['labels'] != -1].index)
    pending_docs = form_docs(testing)
    pending_counts = cv.transform(pending_docs)
    test_tfidf = tfidf_trans.transform(pending_counts)
    del pending_counts, pending_docs
    
    

Opening Data...
Forming Tfidf Table...
Cleansing Data...
Forming Training Tfidf Vectors...
Normalizing Training Price...
Start Training Process...
Finished Training
Forming Testing Tfidf Vectors...


In [3]:
print('Normalizing Testing Price...')
# Same logistic squashing 1 / (1 + e^-x) as used for the training prices.
# The threshold is the largest finite float64, so the guard only fires for +inf.
t_norm_price = [
    1 if price > 1.7976931348623157e+308 else 1 / (1 + np.exp(-price))
    for price in testing['price']
]

Normalizing Testing Price...


In [4]:
# Sanity check: how many normalised test prices were produced (one per testing row).
print(np.array(t_norm_price).shape)

(819561,)


In [5]:
# Sanity check: rows x features of the test TF-IDF matrix. The sparse matrix
# exposes .shape directly, so there is no need to build a dense copy first.
test_tfidf.shape

(819561, 999)

In [6]:
# Test feature matrix: dense TF-IDF columns plus the squashed-price column.
# NOTE: this rebinds X, which held the training matrix until now.
X = np.concatenate(
    (test_tfidf.toarray(), np.array(t_norm_price)[:, np.newaxis]),
    axis=1,
)
del t_norm_price

print('Start Label Prediction...')
pred = neigh.predict(X)
# One row per predicted product: its id and the predicted category.
result = pd.DataFrame({'product_id': testing['id'], 'category_id': pred})

print('Saving Prediction...')
with open('./result.pickle', 'wb') as out:
    pickle.dump(result, out)
print('Process Finished.')


Start Label Prediction...
Saving Prediction...
Process Finished.


In [1]:
import pickle
# Load the object stored in ./labelled.pickle into `df` for inspection.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('./labelled.pickle', 'rb') as file:
    df = pickle.load(file)

In [2]:
# Preview the first rows of the loaded table to check its columns.
df.head()

Unnamed: 0,id,shop_id,handle,title,body_html,product_type,price,labels
0,15207,1526,e03,全麥核桃,<p>用心選用整粒麥子研磨的全粒麵粉，完整保留麥子的營養與濃郁香氣；外皮酥軟，配上鬆脆的核桃...,歐式麵包,85.0,-1
1,15207,1526,e03,全麥核桃,<p>用心選用整粒麥子研磨的全粒麵粉，完整保留麥子的營養與濃郁香氣；外皮酥軟，配上鬆脆的核桃...,歐式麵包,85.0,-1
2,15207,1526,e03,全麥核桃,<p>用心選用整粒麥子研磨的全粒麵粉，完整保留麥子的營養與濃郁香氣；外皮酥軟，配上鬆脆的核桃...,歐式麵包,85.0,-1
3,16354,1282,dual-ribbons,"特價 - 雙彩蝴蝶結（0-6個月, 10-12歲）","<div style=""font-family: Tahoma, Arial, sans-s...",經典公主襪,159.0,-1
4,16354,1282,dual-ribbons,"特價 - 雙彩蝴蝶結（0-6個月, 10-12歲）","<div style=""font-family: Tahoma, Arial, sans-s...",經典公主襪,159.0,-1
