In [1]:
import sys
import tqdm
sys.path.append('/usr/local/lib/python3.7/site-packages')
import json,nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet') 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import copy

In [2]:
# Module-level NLP helpers.
# NOTE(review): `lem`, `stem` and `word` are not referenced by any other cell
# visible in this notebook; they are kept in case something else relies on them.
stem = PorterStemmer()
lem = WordNetLemmatizer()
word = "inversely"

# Extra filler words that are treated as stop words on top of NLTK's English list.
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = set(stopwords.words("english")) | set(new_words)

## Function Definitions

In [3]:
def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    The strings are vectorised with ``get_vectors`` and the resulting rows are
    handed to ``cosine_similarity``, so entry ``[i][j]`` of the result compares
    ``strs[i]`` with ``strs[j]``.
    """
    count_rows = list(get_vectors(*strs))
    return cosine_similarity(count_rows)
    
def get_vectors(*strs):
    """Return bag-of-words count vectors for the given strings.

    A single ``CountVectorizer`` is fitted on all strings together, so every
    row of the returned dense array uses the same vocabulary columns.

    Args:
        *strs: The texts to vectorise.

    Returns:
        A dense 2-D array with one row per input string.
    """
    texts = list(strs)
    # The corpus must NOT be passed to the constructor: its first parameter is
    # `input` (how documents are supplied), and recent scikit-learn releases
    # only accept constructor arguments by keyword.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts).toarray()

In [5]:
def get_single_keywords(text2):
    """Extract the single-word keywords of a text.

    The text is reduced to lowercase ASCII letters, stop words are removed,
    the remaining words are lemmatised, and the result is run through a
    ``CountVectorizer``. Only the one-word terms of its vocabulary are
    returned, in the iteration order of the ``vocabulary_`` mapping.

    Args:
        text2: Raw text (e.g. an ad description).

    Returns:
        list[str]: The distinct single-word terms found.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no term survives
            the filtering (empty vocabulary).
    """
    # Everything that is not an ASCII letter becomes a space, so after this
    # step only letters and spaces remain; no separate tag/digit stripping is
    # needed before splitting on whitespace.
    letters_only = re.sub('[^a-zA-Z]', ' ', text2).lower()

    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    corpus = [" ".join(lemmas)]

    # scikit-learn validates `stop_words` as 'english', a list or None, so the
    # module-level set is converted to a list. The n-gram range and feature cap
    # are kept as they were so the retained vocabulary does not change.
    cv = CountVectorizer(stop_words=list(stop_words), max_features=10000, ngram_range=(1, 3))
    cv.fit_transform(corpus)

    # Keep every one-word term instead of stopping at the first multi-word
    # one, which relied on unigrams being inserted before longer n-grams.
    return [term for term in cv.vocabulary_ if " " not in term]

In [6]:
def get_cost_line(text):
    """Return the sentences of ``text`` that look like they mention a price.

    The text is split into sentences, a fixed set of price-like anchor phrases
    is appended, and the pairwise cosine similarities are computed with
    ``get_cosine_sim``. A sentence is kept when its similarity to at least one
    anchor is non-zero and does not round to 1.

    All six anchors are compared. Previously the first anchor
    ("$300 shipped") was appended but never tested, so sentences matching only
    that anchor are now selected as well.

    Args:
        text: Free text, e.g. an ad description.

    Returns:
        list[str]: The matching sentences, in their original order.
    """
    anchors = (
        "$300 shipped",
        "$300 shipping",
        "cost",
        "about $450",
        "asking $450",
        "single $450",
    )
    sentences = sent_tokenize(text)
    if not sentences:
        return []

    similarities = get_cosine_sim(*sentences, *anchors)
    sentence_count = len(sentences)

    common_sents = []
    for sentence, row in zip(sentences, similarities[:sentence_count]):
        # Columns after the sentences hold the similarity to each anchor.
        anchor_scores = row[sentence_count:]
        if any(score != 0 and round(score) != 1 for score in anchor_scores):
            common_sents.append(sentence)
    return common_sents

In [29]:
def filter_price(money):
    """Decide whether an extracted token is acceptable as a price.

    A token is rejected when it has at most two characters, when its first
    character is not numeric, when it contains any of the listed substrings
    (apparently unit or measurement markers), or when it contains a year
    from 1995 through 2018.

    Args:
        money: The candidate price token as a string.

    Returns:
        bool: True when the token passes every check, otherwise False.
    """
    irrelevant_markers = ("*", "ft", "/", "+", "lb", "eating", "1.0", "0.1", "hrs", "g", "-", "in", "year", "yr", "sq", "mon", "th", "hours", "nd", "gal", "x", "foot", "obo", "prs")

    if len(money) <= 2:
        return False
    if not money[0].isnumeric():
        return False
    if any(marker in money for marker in irrelevant_markers):
        return False
    # Tokens embedding a year (1995..2018) are treated as dates, not prices.
    if any(str(year) in money for year in range(1995, 2019)):
        return False
    return True

In [30]:
def get_money_value(text):
    """Return the first token of ``text`` that looks like a monetary amount.

    The text is tokenised and POS-tagged with NLTK. Two patterns are accepted,
    whichever occurs first:

    * a ``$`` token immediately followed by a cardinal number (``CD``) - the
      number is returned;
    * a cardinal number of at least two characters that is numeric (thousands
      separators and one decimal point are allowed) and is not followed by
      ``%`` - the number itself is returned.

    Args:
        text: A sentence to scan.

    Returns:
        The matching token as a string, or the integer ``0`` when nothing
        matches.
    """
    tags = nltk.pos_tag(nltk.word_tokenize(text))
    for index, (token, tag) in enumerate(tags):
        # The last token has no successor; guard instead of raising IndexError.
        following = tags[index + 1] if index + 1 < len(tags) else None

        if tag == '$':
            if following is not None and following[1] == 'CD':
                return following[0]

        if tag == 'CD':
            # `CD` also covers spelled-out numbers ("two") and mixed tokens
            # ("10ft"); only accept digits, optionally with "," and one ".".
            digits = token.replace(',', '').replace('.', '', 1)
            followed_by_percent = following is not None and following[0] == '%'
            if digits.isnumeric() and len(token) >= 2 and not followed_by_percent:
                return token
    return 0

## Functional APIs

In [31]:
def load_json(name="zoo_data_n.json"):
    """Load and return the parsed contents of a JSON file.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which differs between operating systems.
    with open(name, encoding="utf-8") as f:
        return json.load(f)

In [32]:
def get_cleaned_data(data):
    """Build price/keyword records from the raw feature collection.

    For every feature except the first one, the sentences of its
    ``'Description'`` that look price-related are scanned for a monetary
    value. Features for which no value is found, whose value is rejected by
    ``filter_price``, or for which any extraction step fails are dropped.
    Each kept feature is a copy of the input record with ``'Description'``
    removed and ``'Price'`` and ``'Keywords'`` added; ``data`` itself is not
    modified.

    Args:
        data: Mapping with a ``'features'`` list of dict records.

    Returns:
        list[dict]: The cleaned records, in input order.
    """
    new_features = []
    for index, feature in enumerate(tqdm.tqdm(data['features'])):
        # The first record is always skipped.
        # NOTE(review): presumably a header/placeholder entry - confirm.
        if index == 0:
            continue

        # `except Exception` keeps the best-effort skipping of bad records but
        # lets KeyboardInterrupt/SystemExit stop the long-running loop.
        try:
            description = feature['Description']
            money_lines = get_cost_line(description)
        except Exception:
            continue

        money = 0
        for line in money_lines:
            try:
                money = get_money_value(line)
            except Exception:
                continue
            if money != 0:
                break

        # Run the cheap price checks before the costly keyword extraction.
        if money == 0 or not filter_price(money):
            continue

        try:
            keys = get_single_keywords(description)
        except Exception:
            continue

        # Copy only the records that are kept, leaving the input untouched.
        cleaned = copy.deepcopy(feature)
        cleaned.pop('Description', None)
        cleaned['Price'] = money
        cleaned['Keywords'] = keys
        new_features.append(cleaned)
    return new_features

In [39]:
def dump_json(new_features, name='zoo_data_nf.json'):
    """Write the records to ``name`` as a JSON feature collection.

    The file contains an object with the keys ``type`` ("FeatureCollection"),
    ``name`` ("Zoo") and ``features`` (the given records).

    Args:
        new_features: JSON-serialisable list of records.
        name: Path of the output file; it is overwritten if it exists.
    """
    collection = {
        'type': 'FeatureCollection',
        'name': 'Zoo',
        'features': new_features,
    }
    with open(name, 'w') as out_file:
        json.dump(collection, out_file)

## Use Case:

In [33]:
# Load the raw listings (default input file) and derive the price/keyword records.
data = load_json()
new_features = get_cleaned_data(data)

100%|██████████| 87919/87919 [02:51<00:00, 511.21it/s]


In [36]:
# Display the cleaned records.
# NOTE(review): this renders the entire list; a slice such as new_features[:5]
# would keep the saved notebook output short.
new_features

[{'User_name': 'Red769',
  'AD_name': 'Manokwari GTP',
  'Date': '10-17-2019, 05:37 PM',
  'Type_1': '> Reptile & Amphibian - Classifieds',
  'Type_2': '> Snakes For Sale/Wanteds',
  'Type_3': '> Arboreal Boas/Pythons',
  'Sale_or_no': '[For Sale] Manokwari GTP',
  'Price': '500',
  'Keywords': ['owned',
   'year',
   'since',
   'baby',
   'great',
   'eater',
   'shed',
   'well',
   'asking',
   'pick',
   'hamburg',
   'reptile',
   'believe',
   'male',
   'going',
   'probed',
   'expert',
   'sexing']},
 {'User_name': 'Red769',
  'AD_name': 'Manokwari GTP',
  'Date': '10-17-2019, 05:37 PM',
  'Type_1': '> Reptile & Amphibian - Classifieds',
  'Type_2': '> Snakes For Sale/Wanteds',
  'Type_3': '> Arboreal Boas/Pythons',
  'Sale_or_no': '[For Sale] Manokwari GTP',
  'Price': '500',
  'Keywords': ['owned',
   'year',
   'since',
   'baby',
   'great',
   'eater',
   'shed',
   'well',
   'asking',
   'pick',
   'hamburg',
   'reptile',
   'believe',
   'male',
   'going',
   'probe

In [40]:
# Persist the cleaned records to a separate output file.
dump_json(new_features,"zoo_data_nf2.json")